deunicode/lib.rs
//! The `deunicode` library transliterates Unicode strings such as "Æneid" into pure
//! ASCII ones such as "AEneid."
//!
//! It started as a Rust port of [`Text::Unidecode`](http://search.cpan.org/~sburke/Text-Unidecode-1.30/lib/Text/Unidecode.pm) Perl module, and was extended to support emoji.
//!
//! See [README](https://github.com/kornelski/deunicode/blob/master/README.md) for more info.
//!
//! Examples
//! --------
#![cfg_attr(feature = "alloc", doc = "```rust")]
#![cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
//! extern crate deunicode;
//! use deunicode::deunicode;
//!
//! assert_eq!(deunicode("Æneid"), "AEneid");
//! assert_eq!(deunicode("étude"), "etude");
//! assert_eq!(deunicode("北亰"), "Bei Jing");
//! assert_eq!(deunicode("ᔕᓇᓇ"), "shanana");
//! assert_eq!(deunicode("げんまい茶"), "genmaiCha");
#![doc = "```"] // to mollify some syntax highlighters

#![no_std]

#[cfg(any(test, feature = "alloc"))]
extern crate alloc;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;

use core::iter::FusedIterator;
use core::str::Chars;

const MAPPING: &str = include_str!("mapping.txt");

#[repr(C)]
#[derive(Copy, Clone)]
struct Ptr {
    /// if len <= 2, it's the string itself,
    /// otherwise it's a u16 offset into MAPPING
    chr: [u8; 2],
    len: u8,
}

/// POINTERS format is described by struct Ptr
const POINTERS: &[u8] = include_bytes!("pointers.bin");
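
// Illustrative sketch of the two encodings (offsets and values here are made up
// for illustration). A short replacement such as "AE" is stored inline as
// `Ptr { chr: *b"AE", len: 2 }`, while a longer one such as "Bei " is stored as
// `Ptr { chr: 200u16.to_le_bytes(), len: 4 }`, i.e. a little-endian offset into
// MAPPING plus a length. See `deunicode_char` below for the decoding.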

/// This function takes any Unicode string and returns an ASCII transliteration
/// of that string.
///
/// Guarantees and Warnings
/// -----------------------
/// Here are some guarantees you have when calling `deunicode()`:
/// * The `String` returned will be valid ASCII; the decimal representation of
/// every `char` in the string will be between 0 and 127, inclusive.
/// * Every ASCII character (0x0000 - 0x007F) is mapped to itself.
/// * All Unicode characters will translate to a string containing newlines
/// (`"\n"`) or ASCII characters in the range 0x0020 - 0x007E. So for example,
/// no Unicode character will translate to `\u{01}`. The exception is if the
/// ASCII character itself is passed in, in which case it will be mapped to
/// itself. (So `'\u{01}'` will be mapped to `"\u{01}"`.)
///
/// There are, however, some things you should keep in mind:
/// * As stated, some transliterations do produce `\n` characters.
/// * Some Unicode characters transliterate to an empty string on purpose.
/// * Some Unicode characters are unknown and transliterate to `"[?]"` (see `deunicode_with_tofu`)
/// * Many Unicode characters transliterate to multi-character strings. For
/// example, 北 is transliterated as "Bei ".
/// * Han characters are mapped to Mandarin, and will be mostly illegible to Japanese readers.
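///
/// Examples
/// --------
/// A couple of typical conversions (same values as the crate-level examples):
/// ```rust
/// # use deunicode::deunicode;
/// assert_eq!(deunicode("Æneid"), "AEneid");
/// assert_eq!(deunicode("北亰"), "Bei Jing");
/// ```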
#[inline(always)]
#[cfg(feature = "alloc")]
pub fn deunicode(s: &str) -> String {
    deunicode_with_tofu(s, "[?]")
}

/// Same as `deunicode`, but unknown characters can be replaced with a custom string.
///
/// You can use "\u{FFFD}" to use the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
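///
/// Examples
/// --------
/// The placeholder is used only for characters the mapping tables don't cover;
/// the inputs below are fully covered, so it never appears in the output.
/// ```rust
/// # use deunicode::deunicode_with_tofu;
/// assert_eq!(deunicode_with_tofu("Æneid", "[?]"), "AEneid");
/// assert_eq!(deunicode_with_tofu("北亰", "?"), "Bei Jing");
/// ```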
#[inline]
#[cfg(feature = "alloc")]
pub fn deunicode_with_tofu(s: &str, custom_placeholder: &str) -> String {
    deunicode_with_tofu_cow(s, custom_placeholder).into_owned()
}

/// Same as `deunicode_with_tofu`, but avoids allocating a new `String` if not necessary.
///
/// You can use "\u{FFFD}" to use the usual Unicode Replacement Character.
///
/// "Tofu" is a nickname for a replacement character, which in Unicode fonts usually
/// looks like a block of tofu.
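///
/// Examples
/// --------
/// A sketch of the copy-on-write behaviour: ASCII-only input takes the borrowed
/// fast path, anything else allocates.
/// ```rust
/// # use deunicode::deunicode_with_tofu_cow;
/// use std::borrow::Cow;
///
/// assert!(matches!(deunicode_with_tofu_cow("plain ASCII", "[?]"), Cow::Borrowed(_)));
/// assert!(matches!(deunicode_with_tofu_cow("Æneid", "[?]"), Cow::Owned(_)));
/// assert_eq!(deunicode_with_tofu_cow("Æneid", "[?]"), "AEneid");
/// ```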
#[cfg(feature = "alloc")]
pub fn deunicode_with_tofu_cow<'input>(s: &'input str, custom_placeholder: &str) -> Cow<'input, str> {
    // Fast path to skip over ASCII chars at the beginning of the string
    let ascii_len = s.as_bytes().iter().take_while(|&&c| c < 0x7F).count();
    if ascii_len >= s.len() { // >= elides bounds check in split_at
        return Cow::Borrowed(s);
    }

    // reserve a bit more space to avoid reallocations on longer transliterations
    // but instead of `+ 16` uses `| 15` to stay in the smallest allocation bucket for short strings
    let mut out = String::with_capacity(s.len() | 15);

    let (ascii, rest) = s.as_bytes().split_at(ascii_len);

    // safe, because it's been checked to be ASCII only
    out.push_str(unsafe { core::str::from_utf8_unchecked(ascii) });

    // safe, because all bytes before `rest` are ASCII (< 0x7F), and an ASCII byte
    // can never appear in the middle of a multi-byte UTF-8 sequence, so the split
    // falls on a char boundary and `rest` is still valid UTF-8
    debug_assert!(core::str::from_utf8(rest).is_ok());
    let s = unsafe { core::str::from_utf8_unchecked(rest) };

    out.extend(s.ascii_chars().map(|ch| ch.unwrap_or(custom_placeholder)));
    Cow::Owned(out)
}

/// This function takes a single Unicode character and returns an ASCII
/// transliteration.
///
/// The warnings and guarantees of `deunicode()` apply to this function as well.
///
/// Examples
/// --------
/// ```rust
/// # extern crate deunicode;
/// # use deunicode::deunicode_char;
/// assert_eq!(deunicode_char('Æ'), Some("AE"));
/// assert_eq!(deunicode_char('北'), Some("Bei "));
/// ```
#[inline]
pub fn deunicode_char(ch: char) -> Option<&'static str> {
    // when using the global directly, LLVM fails to remove bounds checks
    let pointers: &'static [Ptr] = unsafe {
        core::slice::from_raw_parts(POINTERS.as_ptr().cast::<Ptr>(), POINTERS.len()/core::mem::size_of::<Ptr>())
    };

    if let Some(p) = pointers.get(ch as usize) {
        // if length is 1 or 2, then the "pointer" data is used to store the char
        if p.len <= 2 {
            let chars = &p.chr[..p.len as usize];
            // safe, because we're returning only ASCII
            debug_assert!(core::str::from_utf8(chars).is_ok());
            unsafe {
                Some(core::str::from_utf8_unchecked(chars))
            }
        } else {
            let map_pos = (p.chr[0] as u16 | (p.chr[1] as u16) << 8) as usize;
            // unknown characters are intentionally mapped to out of range length
            MAPPING.get(map_pos..map_pos + p.len as usize)
        }
    } else {
        None
    }
}

/// Convenience functions for deunicode. `use deunicode::AsciiChars`
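///
/// Example (illustrative):
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
/// use deunicode::AsciiChars;
///
/// assert_eq!("Æneid".to_ascii_lossy(), "AEneid");
///
/// // per-character access, with explicit handling of unmappable characters
/// let out: String = "étude".ascii_chars().map(|ch| ch.unwrap_or("[?]")).collect();
/// assert_eq!(out, "etude");
#[doc = "```"]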
pub trait AsciiChars {
    /// Iterate over Unicode characters converted to ASCII sequences.
    ///
    /// Items of this iterator may be `None` for some characters.
    /// Use `.map(|ch| ch.unwrap_or("?"))` to replace invalid characters.
    fn ascii_chars(&self) -> AsciiCharsIter<'_>;
    /// Convert any Unicode string to ASCII-only string.
    ///
    /// Characters are converted to closest ASCII equivalent.
    /// Characters that can't be converted are replaced with `"[?]"`.
    #[cfg(feature = "alloc")]
    fn to_ascii_lossy(&self) -> String;
}

#[cfg(feature = "alloc")]
impl AsciiChars for String {
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self)
    }
    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self)
    }
}

impl AsciiChars for str {
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        AsciiCharsIter::new(self)
    }
    #[inline(always)]
    #[cfg(feature = "alloc")]
    fn to_ascii_lossy(&self) -> String {
        deunicode(self)
    }
}

/// Iterator that translates Unicode characters to ASCII strings.
///
/// See `AsciiChars` trait's `str.ascii_chars()` method.
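///
/// Example (illustrative): collecting the per-character replacements gives the
/// same result as `deunicode()`.
///
/// ```rust
/// use deunicode::AsciiCharsIter;
///
/// let out: String = AsciiCharsIter::new("北亰").map(|ch| ch.unwrap_or("[?]")).collect();
/// assert_eq!(out, "Bei Jing");
/// ```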
pub struct AsciiCharsIter<'a> {
    next_char: Option<Option<&'static str>>,
    chars: Chars<'a>,
}

impl<'a> AsciiCharsIter<'a> {
    #[inline]
    pub fn new(unicode_string: &'a str) -> Self {
        let mut chars = unicode_string.chars();
        Self {
            next_char: chars.next().map(deunicode_char),
            chars,
        }
    }
}

impl<'a> FusedIterator for AsciiCharsIter<'a> {}

impl<'a> Iterator for AsciiCharsIter<'a> {
    type Item = Option<&'static str>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
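        // Multi-character replacements (e.g. "Bei " for 北) end with a space so
        // consecutive words stay separated. That trailing space is dropped when
        // the next character's replacement already starts with a space, or when
        // this is the last character, to avoid doubled or trailing spaces.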
        self.next_char.map(|dch| {
            self.next_char = self.chars.next().map(deunicode_char);
            dch.map(|dch| {
                let bytes = dch.as_bytes();
                let ends_with_space = bytes.len() > 1 && bytes.last().cloned() == Some(b' ');
                if !ends_with_space {
                    return dch;
                }
                let space_or_end_next = self.next_char.map_or(true, |ch| { // true if end
                    ch.map_or(false, |ch| ch.as_bytes().get(0).cloned() == Some(b' ')) // space next (assume placeholder is not space)
                });
                if !space_or_end_next {
                    dch
                } else {
                    &dch[..dch.len()-1]
                }
            })
        })
    }

    #[inline]
    fn count(self) -> usize {
        self.chars.count() + if self.next_char.is_some() {1} else {0}
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.chars.size_hint().0 + if self.next_char.is_some() {1} else {0}, None)
    }
}

#[test]
fn iter_test() {
    use alloc::vec::Vec;
    let chars: Vec<_> = AsciiCharsIter::new("中国").filter_map(|ch| ch).collect();
    assert_eq!(&chars, &["Zhong ", "Guo"]);
    let chars: Vec<_> = "中国x".ascii_chars().filter_map(|ch| ch).collect();
    assert_eq!(&chars, &["Zhong ", "Guo ", "x"]);
    let chars: Vec<_> = "中 国".ascii_chars().filter_map(|ch| ch).collect();
    assert_eq!(&chars, &["Zhong", " ", "Guo"]);
}