deunicode/lib.rs
//! The `deunicode` library transliterates Unicode strings such as "Æneid" into pure
//! ASCII ones such as "AEneid."
//!
//! It started as a Rust port of [`Text::Unidecode`](http://search.cpan.org/~sburke/Text-Unidecode-1.30/lib/Text/Unidecode.pm) Perl module, and was extended to support emoji.
//!
//! See [README](https://github.com/kornelski/deunicode/blob/master/README.md) for more info.
//!
//! Examples
//! --------
#![cfg_attr(feature = "alloc", doc = "```rust")]
#![cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
//! extern crate deunicode;
//! use deunicode::deunicode;
//!
//! assert_eq!(deunicode("Æneid"), "AEneid");
//! assert_eq!(deunicode("étude"), "etude");
//! assert_eq!(deunicode("北亰"), "Bei Jing");
//! assert_eq!(deunicode("ᔕᓇᓇ"), "shanana");
//! assert_eq!(deunicode("げんまい茶"), "genmaiCha");
#![doc = "```"] // to mollify some syntax highlighters

#![no_std]

#[cfg(any(test, feature = "alloc"))]
extern crate alloc;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;

use core::iter::FusedIterator;
use core::str::Chars;

const MAPPING: &str = include_str!("mapping.txt");

#[repr(C)]
#[derive(Copy, Clone)]
struct Ptr {
    /// if len <= 2, it's the string itself,
    /// otherwise it's a u16 offset into MAPPING
    chr: [u8; 2],
    len: u8,
}

/// POINTERS format is described by struct Ptr
const POINTERS: &[u8] = include_bytes!("pointers.bin");
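
// Illustrative sketch of the two encodings (offsets and values here are made up
// for illustration). A short replacement such as "AE" is stored inline as
// `Ptr { chr: *b"AE", len: 2 }`, while a longer one such as "Bei " is stored as
// `Ptr { chr: 200u16.to_le_bytes(), len: 4 }`, i.e. a little-endian offset into
// MAPPING plus a length. See `deunicode_char` below for the decoding.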

/// This function takes any Unicode string and returns an ASCII transliteration
/// of that string.
///
/// Guarantees and Warnings
/// -----------------------
/// Here are some guarantees you have when calling `deunicode()`:
/// * The `String` returned will be valid ASCII; the decimal representation of
/// every `char` in the string will be between 0 and 127, inclusive.
/// * Every ASCII character (0x0000 - 0x007F) is mapped to itself.
/// * All Unicode characters will translate to a string containing newlines
/// (`"\n"`) or ASCII characters in the range 0x0020 - 0x007E. So for example,
/// no Unicode character will translate to `\u{01}`. The exception is if the
/// ASCII character itself is passed in, in which case it will be mapped to
/// itself. (So `'\u{01}'` will be mapped to `"\u{01}"`.)
///
/// There are, however, some things you should keep in mind:
/// * As stated, some transliterations do produce `\n` characters.
/// * Some Unicode characters transliterate to an empty string on purpose.
/// * Some Unicode characters are unknown and transliterate to `"[?]"` (see `deunicode_with_tofu`)
/// * Many Unicode characters transliterate to multi-character strings. For
/// example, 北 is transliterated as "Bei ".
/// * Han characters are mapped to Mandarin, and will be mostly illegible to Japanese readers.
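///
/// Examples
/// --------
/// A couple of typical conversions (same values as the crate-level examples):
/// ```rust
/// # use deunicode::deunicode;
/// assert_eq!(deunicode("Æneid"), "AEneid");
/// assert_eq!(deunicode("北亰"), "Bei Jing");
/// ```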
#[inline(always)]
#[cfg(feature = "alloc")]
pub fn deunicode(s: &str) -> String {
    deunicode_with_tofu(s, "[?]")
}

/// Same as `deunicode`, but unknown characters can be replaced with a custom string.
///
/// You can use "\u{FFFD}" to use the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
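///
/// Examples
/// --------
/// The placeholder is used only for characters the mapping tables don't cover;
/// the inputs below are fully covered, so it never appears in the output.
/// ```rust
/// # use deunicode::deunicode_with_tofu;
/// assert_eq!(deunicode_with_tofu("Æneid", "[?]"), "AEneid");
/// assert_eq!(deunicode_with_tofu("北亰", "?"), "Bei Jing");
/// ```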
#[inline]
#[cfg(feature = "alloc")]
pub fn deunicode_with_tofu(s: &str, custom_placeholder: &str) -> String {
    deunicode_with_tofu_cow(s, custom_placeholder).into_owned()
}

/// Same as `deunicode_with_tofu`, but avoids allocating a new `String` if not necessary.
///
/// You can use "\u{FFFD}" to use the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
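///
/// Examples
/// --------
/// A sketch of the copy-on-write behaviour: ASCII-only input takes the borrowed
/// fast path, anything else allocates.
/// ```rust
/// # use deunicode::deunicode_with_tofu_cow;
/// use std::borrow::Cow;
///
/// assert!(matches!(deunicode_with_tofu_cow("plain ASCII", "[?]"), Cow::Borrowed(_)));
/// assert!(matches!(deunicode_with_tofu_cow("Æneid", "[?]"), Cow::Owned(_)));
/// assert_eq!(deunicode_with_tofu_cow("Æneid", "[?]"), "AEneid");
/// ```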
#[cfg(feature = "alloc")]
pub fn deunicode_with_tofu_cow<'input>(s: &'input str, custom_placeholder: &str) -> Cow<'input, str> {
    // Fast path to skip over ASCII chars at the beginning of the string
    let ascii_len = s.as_bytes().iter().take_while(|&&c| c < 0x7F).count();
    if ascii_len >= s.len() { // >= elides bounds check in split_at
        return Cow::Borrowed(s);
    }

    // reserve a bit more space to avoid reallocations on longer transliterations
    // but instead of `+ 16` uses `| 15` to stay in the smallest allocation bucket for short strings
    let mut out = String::with_capacity(s.len() | 15);

    let (ascii, rest) = s.as_bytes().split_at(ascii_len);

    // safe, because it's been checked to be ASCII only
    out.push_str(unsafe { core::str::from_utf8_unchecked(ascii) });

    // safe, because all bytes before `rest` are ASCII (< 0x7F), and an ASCII byte
    // can never appear in the middle of a multi-byte UTF-8 sequence, so the split
    // falls on a char boundary and `rest` is still valid UTF-8
    debug_assert!(core::str::from_utf8(rest).is_ok());
    let s = unsafe { core::str::from_utf8_unchecked(rest) };

    out.extend(s.ascii_chars().map(|ch| ch.unwrap_or(custom_placeholder)));
    Cow::Owned(out)
}

/// This function takes a single Unicode character and returns an ASCII
/// transliteration.
///
/// The warnings and guarantees of `deunicode()` apply to this function as well.
///
/// Examples
/// --------
/// ```rust
/// # extern crate deunicode;
/// # use deunicode::deunicode_char;
/// assert_eq!(deunicode_char('Æ'), Some("AE"));
/// assert_eq!(deunicode_char('北'), Some("Bei "));
/// ```
#[inline]
pub fn deunicode_char(ch: char) -> Option<&'static str> {
    // when using the global directly, LLVM fails to remove bounds checks
    let pointers: &'static [Ptr] = unsafe {
        core::slice::from_raw_parts(POINTERS.as_ptr().cast::<Ptr>(), POINTERS.len()/core::mem::size_of::<Ptr>())
    };

    if let Some(p) = pointers.get(ch as usize) {
        // if length is 1 or 2, then the "pointer" data is used to store the char
        if p.len <= 2 {
            let chars = &p.chr[..p.len as usize];
            // safe, because we're returning only ASCII
            debug_assert!(core::str::from_utf8(chars).is_ok());
            unsafe {
                Some(core::str::from_utf8_unchecked(chars))
            }
        } else {
            let map_pos = (p.chr[0] as u16 | (p.chr[1] as u16) << 8) as usize;
            // unknown characters are intentionally mapped to out of range length
            MAPPING.get(map_pos..map_pos + p.len as usize)
        }
    } else {
        None
    }
}

/// Convenience functions for deunicode. `use deunicode::AsciiChars`
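///
/// Example (illustrative):
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
/// use deunicode::AsciiChars;
///
/// assert_eq!("Æneid".to_ascii_lossy(), "AEneid");
///
/// // per-character access, with explicit handling of unmappable characters
/// let out: String = "étude".ascii_chars().map(|ch| ch.unwrap_or("[?]")).collect();
/// assert_eq!(out, "etude");
#[doc = "```"]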
pub trait AsciiChars {
    /// Iterate over Unicode characters converted to ASCII sequences.
    ///
    /// Items of this iterator may be `None` for some characters.
    /// Use `.map(|ch| ch.unwrap_or("?"))` to replace invalid characters.
    fn ascii_chars(&self) -> AsciiCharsIter<'_>;
    /// Convert any Unicode string to ASCII-only string.
    ///
    /// Characters are converted to closest ASCII equivalent.
    /// Characters that can't be converted are replaced with `"[?]"`.
    #[cfg(feature = "alloc")]
    fn to_ascii_lossy(&self) -> String;
}

#[cfg(feature = "alloc")]
impl AsciiChars for String {
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self)
    }
    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self)
    }
}

impl AsciiChars for str {
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self)
    }
    #[inline(always)]
    #[cfg(feature = "alloc")]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self)
    }
}

/// Iterator that translates Unicode characters to ASCII strings.
///
/// See `AsciiChars` trait's `str.ascii_chars()` method.
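///
/// Example (illustrative): collecting the per-character replacements gives the
/// same result as `deunicode()`.
///
/// ```rust
/// use deunicode::AsciiCharsIter;
///
/// let out: String = AsciiCharsIter::new("北亰").map(|ch| ch.unwrap_or("[?]")).collect();
/// assert_eq!(out, "Bei Jing");
/// ```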
pub struct AsciiCharsIter<'a> {
    next_char: Option<Option<&'static str>>,
    chars: Chars<'a>,
}

impl<'a> AsciiCharsIter<'a> {
    #[inline]
    pub fn new(unicode_string: &'a str) -> Self {
        let mut chars = unicode_string.chars();
        Self {
            next_char: chars.next().map(deunicode_char),
            chars,
        }
    }
}

impl<'a> FusedIterator for AsciiCharsIter<'a> {}

impl<'a> Iterator for AsciiCharsIter<'a> {
    type Item = Option<&'static str>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
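        // Multi-character replacements (e.g. "Bei " for 北) end with a space so
        // consecutive words stay separated. That trailing space is dropped when
        // the next character's replacement already starts with a space, or when
        // this is the last character, to avoid doubled or trailing spaces.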
        self.next_char.map(|dch| {
            self.next_char = self.chars.next().map(deunicode_char);
            dch.map(|dch| {
                let bytes = dch.as_bytes();
                let ends_with_space = bytes.len() > 1 && bytes.last().cloned() == Some(b' ');
                if !ends_with_space {
                    return dch;
                }
                let space_or_end_next = self.next_char.map_or(true, |ch| { // true if end
                    ch.map_or(false, |ch| ch.as_bytes().get(0).cloned() == Some(b' ')) // space next (assume placeholder is not space)
                });
                if !space_or_end_next {
                    dch
                } else {
                    &dch[..dch.len()-1]
                }
            })
        })
    }

    #[inline]
    fn count(self) -> usize {
        self.chars.count() + if self.next_char.is_some() {1} else {0}
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.chars.size_hint().0 + if self.next_char.is_some() {1} else {0}, None)
    }
}

#[test]
fn iter_test() {
    use alloc::vec::Vec;
    let chars: Vec<_> = AsciiCharsIter::new("中国").filter_map(|ch| ch).collect();
    assert_eq!(&chars, &["Zhong ", "Guo"]);
    let chars: Vec<_> = "中国x".ascii_chars().filter_map(|ch| ch).collect();
    assert_eq!(&chars, &["Zhong ", "Guo ", "x"]);
    let chars: Vec<_> = "中 国".ascii_chars().filter_map(|ch| ch).collect();
    assert_eq!(&chars, &["Zhong", " ", "Guo"]);
}