diff --git a/.github/workflows/build-rust.yml b/.github/workflows/build-rust.yml new file mode 100644 index 0000000..ae0c280 --- /dev/null +++ b/.github/workflows/build-rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + tags: + - '[0-9]**' + pull_request: + branches: [main] + paths: + - 'rust/**' + +jobs: + + build-rust: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Test + working-directory: ${{github.workspace}}/rust + run: cargo test -q diff --git a/readme.md b/readme.md index 7e169e9..00a5267 100644 --- a/readme.md +++ b/readme.md @@ -5,7 +5,7 @@ Ukrainian Cyrillic transliteration to Latin script. [![standwithukraine](docs/StandWithUkraine.svg)](https://ukrainewar.carrd.co/) [![](https://github.com/paiv/uklatn/actions/workflows/test-builds.yml/badge.svg)](https://github.com/paiv/uklatn/actions) -[JavaScript](#javascript-package) | [Python](#python-module) | [C](c/) | [Java](#java-library) | [.NET](#net-package) | [Go](#go-package) | [PHP](#php-package) | [Elixir](#elixir-package) | [Swift](#swift-package) | [Ruby](#ruby-gem) +[JavaScript](#javascript-package) | [Python](#python-module) | [C](c/) | [Java](#java-library) | [.NET](#net-package) | [Go](#go-package) | [PHP](#php-package) | [Elixir](#elixir-package) | [Swift](#swift-package) | [Ruby](#ruby-gem) | [Rust](#rust-crate) Supported transliteration schemes: - [DSTU 9112:2021](https://uk.wikipedia.org/wiki/ДСТУ_9112:2021) @@ -114,6 +114,16 @@ gem 'uklatn' ``` +Rust crate +-- +- [uklatn Rust crate](rust/) + +Add package dependency: +```sh +cargo add uklatn +``` + + Notes -- Input is assumed to be in Ukrainian (Cyrillic or Latin script), and will be processed in full. 
diff --git a/rust/.gitignore b/rust/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/rust/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/rust/Cargo.lock b/rust/Cargo.lock new file mode 100644 index 0000000..6ce3bdb --- /dev/null +++ b/rust/Cargo.lock @@ -0,0 +1,100 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + 
"regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "tinyvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "uklatn" +version = "1.18.0" +dependencies = [ + "fancy-regex", + "once_cell", + "unicode-normalization", +] + +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..a90e548 --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "uklatn" +version = "1.18.0" +license = "MIT" +homepage = "https://github.com/paiv/uklatn" +repository = "https://github.com/paiv/uklatn" +description = "Ukrainian Cyrillic transliteration to Latin script" +keywords = ["ukraine", "transliteration", "romanization"] +authors = ["Pavlo Ivashkov"] +edition = "2021" + +[dependencies] +fancy-regex = "0.14.0" +once_cell = "1.20.2" +unicode-normalization = "0.1.20" diff --git a/rust/readme.md b/rust/readme.md new file mode 100644 index 0000000..be2b2bf --- /dev/null +++ b/rust/readme.md @@ -0,0 +1,31 @@ +uklatn +== +Ukrainian Cyrillic transliteration to Latin script. 
+ +Supported transliteration schemes: +- [DSTU 9112:2021](https://uk.wikipedia.org/wiki/ДСТУ_9112:2021) +- [KMU 55:2010](https://zakon.rada.gov.ua/laws/show/55-2010-п) + + +Usage +-- + +```rust +use { uklatn::Table, uklatn::decode, uklatn::encode }; + +encode("Доброго вечора!", Table::default()); +decode("Paljanycja", Table::default()); +``` + +Select a transliteration scheme: + +```rust +encode("Борщ", Table::Dstu9112B); +encode("Шевченко", Table::Kmu55); +``` + +Notes +-- +Input is assumed to be in Ukrainian (Cyrillic or Latin script), and will be processed in full. +If your data has mixed languages, do preprocessing to extract Ukrainian chunks. + diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..577d894 --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,902 @@ +// Generated by gentables.py, do not edit. + +//! Ukrainian Cyrillic transliteration to and from Latin script. +//! +//! Tables: +//! - DSTU 9112:2021 System A +//! - DSTU 9112:2021 System B +//! - KMU 55:2010, not reversible +//! +//! # Examples +//! ``` +//! let s = uklatn::encode("Доброго вечора!", uklatn::Table::default()); +//! assert_eq!(s, "Dobroğo večora!"); +//! ``` +//! ``` +//! let s = uklatn::decode("Paljanycja", uklatn::Table::default()); +//! assert_eq!(s, "Паляниця"); +//! ``` +//! +//! Select a transliteration scheme: +//! ``` +//! let s = uklatn::encode("Борщ", uklatn::Table::Dstu9112B); +//! assert_eq!(s, "Borshch"); +//! ``` +//! +use { + fancy_regex::Captures, fancy_regex::Regex, once_cell::sync::Lazy, + unicode_normalization::UnicodeNormalization, +}; + +#[derive(Default, Debug)] +pub enum Table { + /// DSTU 9112:2021 System A + #[default] + Dstu9112A = 1, + /// DSTU 9112:2021 System B + Dstu9112B = 2, + /// KMU 55:2010, not reversible + Kmu55 = 3, +} + +/// Transliterates a string of Ukrainian Cyrillic to Latin script. 
+/// +/// # Examples +/// ``` +/// let s = uklatn::encode("Доброго вечора!", uklatn::Table::default()); +/// assert_eq!(s, "Dobroğo večora!"); +/// ``` +/// ``` +/// let s = uklatn::encode("Шевченко", uklatn::Table::Kmu55); +/// assert_eq!(s, "Shevchenko"); +/// ``` +pub fn encode(text: &str, table: Table) -> String { + match table { + Table::Dstu9112A => encode_dstu9112a(text), + Table::Dstu9112B => encode_dstu9112b(text), + Table::Kmu55 => encode_kmu55(text), + } +} + +/// Re-transliterates a string of Ukrainian Latin to Cyrillic script. +/// +/// # Examples +/// ``` +/// let s = uklatn::decode("Paljanycja", uklatn::Table::default()); +/// assert_eq!(s, "Паляниця"); +/// ``` +/// ``` +/// let s = uklatn::decode("Shevchenko", uklatn::Table::Dstu9112B); +/// assert_eq!(s, "Шевченко"); +/// ``` +/// +pub fn decode(text: &str, table: Table) -> String { + match table { + Table::Dstu9112A => decode_dstu9112a(text), + Table::Dstu9112B => decode_dstu9112b(text), + Table::Kmu55 => panic!("invalid table {:?}", table), + } +} + +fn encode_dstu9112a(text: &str) -> String { + static RX1: Lazy = Lazy::new(|| { + let rx: &str = r"\b([Ьь])|([Ьь](?=[АаЕеУу])|[ЄЮЯ](?=\u0301?[а-щьюяєіїґ’])|(?<=[Б-ДЖЗК-НП-ТФ-Щб-джзк-нп-тф-щҐґ])[Йй])|([ЁЄІЇЎА-яёєіїўҐґ’])"; + Regex::new(rx).unwrap() + }); + static M11: &[(&str, &str); 2] = &[("Ь", "Ĵ"), ("ь", "ĵ")]; + static M12: &[(&str, &str); 7] = &[ + ("Ь", "J'"), + ("ь", "j'"), + ("Є", "Je"), + ("Ю", "Ju"), + ("Я", "Ja"), + ("Й", "'J"), + ("й", "'j"), + ]; + static M13: &[(&str, &str); 77] = &[ + ("А", "A"), + ("а", "a"), + ("Б", "B"), + ("б", "b"), + ("В", "V"), + ("в", "v"), + ("Г", "Ğ"), + ("г", "ğ"), + ("Ґ", "G"), + ("ґ", "g"), + ("Д", "D"), + ("д", "d"), + ("Е", "E"), + ("е", "e"), + ("Є", "JE"), + ("є", "je"), + ("Ж", "Ž"), + ("ж", "ž"), + ("З", "Z"), + ("з", "z"), + ("И", "Y"), + ("и", "y"), + ("І", "I"), + ("і", "i"), + ("Ї", "Ï"), + ("ї", "ï"), + ("К", "K"), + ("к", "k"), + ("Л", "L"), + ("л", "l"), + ("М", "M"), + ("м", "m"), + ("Н", 
"N"), + ("н", "n"), + ("О", "O"), + ("о", "o"), + ("П", "P"), + ("п", "p"), + ("Р", "R"), + ("р", "r"), + ("С", "S"), + ("с", "s"), + ("Т", "T"), + ("т", "t"), + ("У", "U"), + ("у", "u"), + ("Ф", "F"), + ("ф", "f"), + ("Х", "X"), + ("х", "x"), + ("Ц", "C"), + ("ц", "c"), + ("Ч", "Č"), + ("ч", "č"), + ("Ш", "Š"), + ("ш", "š"), + ("Щ", "Ŝ"), + ("щ", "ŝ"), + ("Ю", "JU"), + ("ю", "ju"), + ("Я", "JA"), + ("я", "ja"), + ("Ь", "J"), + ("ь", "j"), + ("Й", "J"), + ("й", "j"), + ("’", "'"), + ("Ё", "Ö"), + ("ё", "ö"), + ("Ў", "Ŭ"), + ("ў", "ŭ"), + ("Ъ", "Ǒ"), + ("ъ", "ǒ"), + ("Ы", "Ȳ"), + ("ы", "ȳ"), + ("Э", "Ē"), + ("э", "ē"), + ]; + + let tr1 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M11 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(2) { + let s = m.as_str(); + for p in M12 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(3) { + let s = m.as_str(); + for p in M13 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + let text = text.nfc().collect::(); + let text = RX1.replace_all(&text, tr1); + let text = text.nfc().collect::(); + text +} + +fn encode_dstu9112b(text: &str) -> String { + static RX1: Lazy = Lazy::new(|| { + let rx: &str = r"([Ьь](?=[АаЕеІіУу])|(?<=[Б-ДЖЗК-НП-ТФ-Щб-джзк-нп-тф-щҐґ])[Йй])|([ГЄЖЇХЩШЧЮЯЁЎЪЫЭ](?=\u0301?[а-яёєіїўґ’])|\b[Ьь])|([ЁЄІЇЎА-яёєіїўҐґ’])"; + Regex::new(rx).unwrap() + }); + static M11: &[(&str, &str); 4] = &[("Ь", "J'"), ("ь", "j'"), ("Й", "'J"), ("й", "'j")]; + static M12: &[(&str, &str); 17] = &[ + ("Г", "Gh"), + ("Є", "Je"), + ("Ж", "Zh"), + ("Ї", "Ji"), + ("Х", "Kh"), + ("Щ", "Shch"), + ("Ш", "Sh"), + ("Ч", "Ch"), + ("Ю", "Ju"), + ("Я", "Ja"), + ("Ё", "Jow"), + ("Ў", "Uh"), + ("Ъ", "Oh"), + ("Ы", "Yw"), + ("Э", "Ehw"), + ("Ь", "Hj"), + ("ь", "hj"), + ]; + static M13: &[(&str, &str); 77] = &[ + 
("А", "A"), + ("а", "a"), + ("Б", "B"), + ("б", "b"), + ("В", "V"), + ("в", "v"), + ("Г", "GH"), + ("г", "gh"), + ("Ґ", "G"), + ("ґ", "g"), + ("Д", "D"), + ("д", "d"), + ("Е", "E"), + ("е", "e"), + ("Є", "JE"), + ("є", "je"), + ("Ж", "ZH"), + ("ж", "zh"), + ("З", "Z"), + ("з", "z"), + ("И", "Y"), + ("и", "y"), + ("І", "I"), + ("і", "i"), + ("Ї", "JI"), + ("ї", "ji"), + ("Х", "KH"), + ("х", "kh"), + ("К", "K"), + ("к", "k"), + ("Л", "L"), + ("л", "l"), + ("М", "M"), + ("м", "m"), + ("Н", "N"), + ("н", "n"), + ("О", "O"), + ("о", "o"), + ("П", "P"), + ("п", "p"), + ("Р", "R"), + ("р", "r"), + ("Щ", "SHCH"), + ("щ", "shch"), + ("Ш", "SH"), + ("ш", "sh"), + ("С", "S"), + ("с", "s"), + ("Т", "T"), + ("т", "t"), + ("У", "U"), + ("у", "u"), + ("Ф", "F"), + ("ф", "f"), + ("Ч", "CH"), + ("ч", "ch"), + ("Ц", "C"), + ("ц", "c"), + ("Ю", "JU"), + ("ю", "ju"), + ("Я", "JA"), + ("я", "ja"), + ("Й", "J"), + ("й", "j"), + ("Ь", "J"), + ("ь", "j"), + ("’", "'"), + ("Ё", "JOW"), + ("ё", "jow"), + ("Ў", "UH"), + ("ў", "uh"), + ("Ъ", "OH"), + ("ъ", "oh"), + ("Ы", "YW"), + ("ы", "yw"), + ("Э", "EHW"), + ("э", "ehw"), + ]; + + let tr1 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M11 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(2) { + let s = m.as_str(); + for p in M12 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(3) { + let s = m.as_str(); + for p in M13 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + let text = text.nfc().collect::(); + let text = RX1.replace_all(&text, tr1); + let text = text.nfc().collect::(); + text +} + +fn encode_kmu55(text: &str) -> String { + static RX1: Lazy = Lazy::new(|| { + let rx: &str = r"(?<=[ЁЄІЇЎА-яёєіїўҐґ])([’\u0027])(?=[ЁЄІЇЎА-яёєіїўҐґ])"; + Regex::new(rx).unwrap() + }); + static M11: &[(&str, &str); 
2] = &[("’", ""), ("'", "")]; + + let tr1 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M11 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + static RX2: Lazy = Lazy::new(|| { + let rx: &str = r"\b([ЄЇЮЯ])(?=\u0301?[а-яёєіїўґ’])|\b([ЙйЄЇЮЯєїюя])|([Зз]Г|[ЖХЦЩШЧЄЇЮЯ])(?=\u0301?[а-яёєіїўґ’])|([Зз][Гг]|[ЄІЇА-ЩЬЮ-щьюяєіїҐґ’])"; + Regex::new(rx).unwrap() + }); + static M21: &[(&str, &str); 4] = &[("Є", "Ye"), ("Ї", "Yi"), ("Ю", "Yu"), ("Я", "Ya")]; + static M22: &[(&str, &str); 10] = &[ + ("Й", "Y"), + ("й", "y"), + ("Є", "YE"), + ("є", "ye"), + ("Ї", "YI"), + ("ї", "yi"), + ("Ю", "YU"), + ("ю", "yu"), + ("Я", "YA"), + ("я", "ya"), + ]; + static M23: &[(&str, &str); 12] = &[ + ("ЗГ", "ZGh"), + ("зГ", "zGh"), + ("Ж", "Zh"), + ("Х", "Kh"), + ("Ц", "Ts"), + ("Щ", "Shch"), + ("Ш", "Sh"), + ("Ч", "Ch"), + ("Є", "Ie"), + ("Ї", "I"), + ("Ю", "Iu"), + ("Я", "Ia"), + ]; + static M24: &[(&str, &str); 71] = &[ + ("ЗГ", "ZGH"), + ("Зг", "Zgh"), + ("зГ", "zGH"), + ("зг", "zgh"), + ("А", "A"), + ("а", "a"), + ("Б", "B"), + ("б", "b"), + ("В", "V"), + ("в", "v"), + ("Г", "H"), + ("г", "h"), + ("Ґ", "G"), + ("ґ", "g"), + ("Д", "D"), + ("д", "d"), + ("Е", "E"), + ("е", "e"), + ("Є", "IE"), + ("є", "ie"), + ("Ж", "ZH"), + ("ж", "zh"), + ("З", "Z"), + ("з", "z"), + ("И", "Y"), + ("и", "y"), + ("І", "I"), + ("і", "i"), + ("Ї", "I"), + ("ї", "i"), + ("Х", "KH"), + ("х", "kh"), + ("К", "K"), + ("к", "k"), + ("Л", "L"), + ("л", "l"), + ("М", "M"), + ("м", "m"), + ("Н", "N"), + ("н", "n"), + ("О", "O"), + ("о", "o"), + ("П", "P"), + ("п", "p"), + ("Р", "R"), + ("р", "r"), + ("Щ", "SHCH"), + ("щ", "shch"), + ("Ш", "SH"), + ("ш", "sh"), + ("С", "S"), + ("с", "s"), + ("Т", "T"), + ("т", "t"), + ("У", "U"), + ("у", "u"), + ("Ф", "F"), + ("ф", "f"), + ("Ч", "CH"), + ("ч", "ch"), + ("Ц", "TS"), + ("ц", "ts"), + ("Ю", "IU"), + ("ю", "iu"), + ("Я", "IA"), + ("я", "ia"), + ("Й", 
"I"), + ("й", "i"), + ("Ь", ""), + ("ь", ""), + ("’", ""), + ]; + + let tr2 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M21 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(2) { + let s = m.as_str(); + for p in M22 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(3) { + let s = m.as_str(); + for p in M23 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(4) { + let s = m.as_str(); + for p in M24 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + let text = text.nfc().collect::(); + let text = RX1.replace_all(&text, tr1); + let text = RX2.replace_all(&text, tr2); + let text = text.nfc().collect::(); + text +} + +fn decode_dstu9112a(text: &str) -> String { + static RX1: Lazy = Lazy::new(|| { + let rx: &str = r"([ÁáÉéÍíÓóÚúÝýḮḯ])"; + Regex::new(rx).unwrap() + }); + static M11: &[(&str, &str); 14] = &[ + ("Á", "Á"), + ("á", "á"), + ("É", "É"), + ("é", "é"), + ("Í", "Í"), + ("í", "í"), + ("Ó", "Ó"), + ("ó", "ó"), + ("Ú", "Ú"), + ("ú", "ú"), + ("Ý", "Ý"), + ("ý", "ý"), + ("Ḯ", "Ḯ"), + ("ḯ", "ḯ"), + ]; + + let tr1 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M11 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + static RX2: Lazy = Lazy::new(|| { + let rx: &str = r"(J[Ee]|j[Ee]|J[Uu]|j[Uu]|J[Aa]|j[Aa]|[A-GIK-PR-VXYZa-gik-pr-vxyzÏÖïöČčĒēĞğĴĵŜŝŠšŬŭŽžǑǒȲȳ])|(?<=[BbCcDdFfGgKkLlMmNnPpRrSsTtVvXxZzČčĞğŜŝŠšŽž])([Jj]\u0027(?=[AaEeUu])|[Jj])|(\u0027[Jj](?![AaEeIiUu])|\u0027(?=[Jj])|[Jj])"; + Regex::new(rx).unwrap() + }); + static M21: &[(&str, &str); 80] = &[ + ("A", "А"), + ("a", "а"), + ("B", "Б"), + ("b", "б"), + ("V", "В"), + ("v", "в"), + ("Ğ", "Г"), + 
("ğ", "г"), + ("G", "Ґ"), + ("g", "ґ"), + ("D", "Д"), + ("d", "д"), + ("E", "Е"), + ("e", "е"), + ("JE", "Є"), + ("Je", "Є"), + ("jE", "є"), + ("je", "є"), + ("Ž", "Ж"), + ("ž", "ж"), + ("Z", "З"), + ("z", "з"), + ("Y", "И"), + ("y", "и"), + ("I", "І"), + ("i", "і"), + ("Ï", "Ї"), + ("ï", "ї"), + ("K", "К"), + ("k", "к"), + ("L", "Л"), + ("l", "л"), + ("M", "М"), + ("m", "м"), + ("N", "Н"), + ("n", "н"), + ("O", "О"), + ("o", "о"), + ("P", "П"), + ("p", "п"), + ("R", "Р"), + ("r", "р"), + ("S", "С"), + ("s", "с"), + ("T", "Т"), + ("t", "т"), + ("U", "У"), + ("u", "у"), + ("F", "Ф"), + ("f", "ф"), + ("X", "Х"), + ("x", "х"), + ("C", "Ц"), + ("c", "ц"), + ("Č", "Ч"), + ("č", "ч"), + ("Š", "Ш"), + ("š", "ш"), + ("Ŝ", "Щ"), + ("ŝ", "щ"), + ("JU", "Ю"), + ("Ju", "Ю"), + ("jU", "ю"), + ("ju", "ю"), + ("JA", "Я"), + ("Ja", "Я"), + ("jA", "я"), + ("ja", "я"), + ("Ĵ", "Ь"), + ("ĵ", "ь"), + ("Ö", "Ё"), + ("ö", "ё"), + ("Ŭ", "Ў"), + ("ŭ", "ў"), + ("Ǒ", "Ъ"), + ("ǒ", "ъ"), + ("Ȳ", "Ы"), + ("ȳ", "ы"), + ("Ē", "Э"), + ("ē", "э"), + ]; + static M22: &[(&str, &str); 4] = &[("J", "Ь"), ("j", "ь"), ("J'", "Ь"), ("j'", "ь")]; + static M23: &[(&str, &str); 5] = + &[("'J", "Й"), ("'j", "й"), ("'", "’"), ("J", "Й"), ("j", "й")]; + + let tr2 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M21 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(2) { + let s = m.as_str(); + for p in M22 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(3) { + let s = m.as_str(); + for p in M23 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + let text = text.nfc().collect::(); + let text = RX1.replace_all(&text, tr1); + let text = RX2.replace_all(&text, tr2); + let text = text.nfc().collect::(); + text +} + +fn decode_dstu9112b(text: &str) -> String { + static RX1: Lazy 
= Lazy::new(|| { + let rx: &str = r"([ÁáÉéÍíÓóÚúÝý])"; + Regex::new(rx).unwrap() + }); + static M11: &[(&str, &str); 12] = &[ + ("Á", "Á"), + ("á", "á"), + ("É", "É"), + ("é", "é"), + ("Í", "Í"), + ("í", "í"), + ("Ó", "Ó"), + ("ó", "ó"), + ("Ú", "Ú"), + ("ú", "ú"), + ("Ý", "Ý"), + ("ý", "ý"), + ]; + + let tr1 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M11 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + static RX2: Lazy = Lazy::new(|| { + let rx: &str = r"([Jj][Oo][Ww]|[Ss][Hh][Cc][Hh]|[CcGgKkSsZzUuOo][Hh]|[Yy][Ww]|[Ee][Hh][Ww]|[Jj][EeIiUuAa]|[Hh][Jj]|[A-GIK-PR-VYZa-gik-pr-vyz])|(?<=[Ss][Hh][Cc][Hh])([Jj]\u0027(?=[AaEeIiUu])|[Jj])|(?<=[CcGgKkSsZz][Hh])([Jj]\u0027(?=[AaEeIiUu])|[Jj])|(?<=[BCDFGKLMNPRSTVZbcdfgklmnprstvzv])([Jj]\u0027(?=[AaEeIiUu])|[Jj])|(\u0027[Jj](?![AaEeIiUu])|\u0027(?=[Jj])|[Jj])"; + Regex::new(rx).unwrap() + }); + static M21: &[(&str, &str); 126] = &[ + ("A", "А"), + ("a", "а"), + ("B", "Б"), + ("b", "б"), + ("V", "В"), + ("v", "в"), + ("GH", "Г"), + ("Gh", "Г"), + ("gH", "г"), + ("gh", "г"), + ("G", "Ґ"), + ("g", "ґ"), + ("D", "Д"), + ("d", "д"), + ("E", "Е"), + ("e", "е"), + ("JE", "Є"), + ("Je", "Є"), + ("jE", "є"), + ("je", "є"), + ("ZH", "Ж"), + ("Zh", "Ж"), + ("zH", "ж"), + ("zh", "ж"), + ("Z", "З"), + ("z", "з"), + ("Y", "И"), + ("y", "и"), + ("I", "І"), + ("i", "і"), + ("JI", "Ї"), + ("Ji", "Ї"), + ("jI", "ї"), + ("ji", "ї"), + ("KH", "Х"), + ("Kh", "Х"), + ("kH", "х"), + ("kh", "х"), + ("K", "К"), + ("k", "к"), + ("L", "Л"), + ("l", "л"), + ("M", "М"), + ("m", "м"), + ("N", "Н"), + ("n", "н"), + ("O", "О"), + ("o", "о"), + ("P", "П"), + ("p", "п"), + ("R", "Р"), + ("r", "р"), + ("SHCH", "Щ"), + ("SHCh", "Щ"), + ("SHcH", "Щ"), + ("SHch", "Щ"), + ("ShCH", "Щ"), + ("ShCh", "Щ"), + ("ShcH", "Щ"), + ("Shch", "Щ"), + ("sHCH", "щ"), + ("sHCh", "щ"), + ("sHcH", "щ"), + ("sHch", "щ"), + ("shCH", "щ"), + ("shCh", "щ"), + 
("shcH", "щ"), + ("shch", "щ"), + ("SH", "Ш"), + ("Sh", "Ш"), + ("sH", "ш"), + ("sh", "ш"), + ("S", "С"), + ("s", "с"), + ("T", "Т"), + ("t", "т"), + ("U", "У"), + ("u", "у"), + ("F", "Ф"), + ("f", "ф"), + ("CH", "Ч"), + ("Ch", "Ч"), + ("cH", "ч"), + ("ch", "ч"), + ("C", "Ц"), + ("c", "ц"), + ("JU", "Ю"), + ("Ju", "Ю"), + ("jU", "ю"), + ("ju", "ю"), + ("JA", "Я"), + ("Ja", "Я"), + ("jA", "я"), + ("ja", "я"), + ("HJ", "Ь"), + ("Hj", "Ь"), + ("hJ", "ь"), + ("hj", "ь"), + ("JOW", "Ё"), + ("JOw", "Ё"), + ("JoW", "Ё"), + ("Jow", "Ё"), + ("jOW", "ё"), + ("jOw", "ё"), + ("joW", "ё"), + ("jow", "ё"), + ("UH", "Ў"), + ("Uh", "Ў"), + ("uH", "ў"), + ("uh", "ў"), + ("OH", "Ъ"), + ("Oh", "Ъ"), + ("oH", "ъ"), + ("oh", "ъ"), + ("YW", "Ы"), + ("Yw", "Ы"), + ("yW", "ы"), + ("yw", "ы"), + ("EHW", "Э"), + ("EHw", "Э"), + ("EhW", "Э"), + ("Ehw", "Э"), + ("eHW", "э"), + ("eHw", "э"), + ("ehW", "э"), + ("ehw", "э"), + ]; + static M22: &[(&str, &str); 4] = &[("J", "Ь"), ("j", "ь"), ("J'", "Ь"), ("j'", "ь")]; + static M23: &[(&str, &str); 4] = &[("J", "Ь"), ("j", "ь"), ("J'", "Ь"), ("j'", "ь")]; + static M24: &[(&str, &str); 4] = &[("J", "Ь"), ("j", "ь"), ("J'", "Ь"), ("j'", "ь")]; + static M25: &[(&str, &str); 5] = + &[("'J", "Й"), ("'j", "й"), ("'", "’"), ("J", "Й"), ("j", "й")]; + + let tr2 = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M21 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(2) { + let s = m.as_str(); + for p in M22 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(3) { + let s = m.as_str(); + for p in M23 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(4) { + let s = m.as_str(); + for p in M24 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else if let Some(m) = caps.get(5) { + let s = m.as_str(); + for p 
in M25 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + } else { + caps[0].to_string() + } + }; + let text = text.nfc().collect::(); + let text = RX1.replace_all(&text, tr1); + let text = RX2.replace_all(&text, tr2); + let text = text.nfc().collect::(); + text +} diff --git a/rust/tests/uklatn_test.rs b/rust/tests/uklatn_test.rs new file mode 100644 index 0000000..603e7e9 --- /dev/null +++ b/rust/tests/uklatn_test.rs @@ -0,0 +1,809 @@ +// Generated by gentests.py, do not edit. + +use {uklatn::decode, uklatn::encode, uklatn::Table}; + +#[test] +fn dstu9112a_t1() { + let cyr = "Україна, Хмельницький"; + let lat = "Ukraïna, Xmeljnycjkyj"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t1_default() { + let cyr = "Україна, Хмельницький"; + let lat = "Ukraïna, Xmeljnycjkyj"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t2() { + let cyr = "Щастям б’єш жук їх глицю в фон й ґедзь пріч."; + let lat = "Ŝastjam b'ješ žuk ïx ğlycju v fon j gedzj prič."; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t2_default() { + let cyr = "Щастям б’єш жук їх глицю в фон й ґедзь пріч."; + let lat = "Ŝastjam b'ješ žuk ïx ğlycju v fon j gedzj prič."; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t3() { + let cyr = "ь Ь ль льє льї лью лья лье льі льу льа льйо льо"; + let lat = "ĵ Ĵ lj ljje ljï ljju ljja lj'e lji lj'u lj'a ljjo ljo"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t3_default() { + let cyr = "ь Ь ль льє льї лью лья лье льі 
льу льа льйо льо"; + let lat = "ĵ Ĵ lj ljje ljï ljju ljja lj'e lji lj'u lj'a ljjo ljo"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t4() { + let cyr = "Єл Їл Юл Ял"; + let lat = "Jel Ïl Jul Jal"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t4_default() { + let cyr = "Єл Їл Юл Ял"; + let lat = "Jel Ïl Jul Jal"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t5() { + let cyr = "бь вь гь ґь дь жь зь кь ль мь нь пь рь сь ть фь хь ць чь шь щь"; + let lat = "bj vj ğj gj dj žj zj kj lj mj nj pj rj sj tj fj xj cj čj šj ŝj"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t5_default() { + let cyr = "бь вь гь ґь дь жь зь кь ль мь нь пь рь сь ть фь хь ць чь шь щь"; + let lat = "bj vj ğj gj dj žj zj kj lj mj nj pj rj sj tj fj xj cj čj šj ŝj"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t6() { + let cyr = "бя вя гя ґя дя жя зя кя ля мя ня пя ря ся тя фя хя ця чя шя щя"; + let lat = "bja vja ğja gja dja žja zja kja lja mja nja pja rja sja tja fja xja cja čja šja ŝja"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t6_default() { + let cyr = "бя вя гя ґя дя жя зя кя ля мя ня пя ря ся тя фя хя ця чя шя щя"; + let lat = "bja vja ğja gja dja žja zja kja lja mja nja pja rja sja tja fja xja cja čja šja ŝja"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] 
+fn dstu9112a_t7() { + let cyr = "б’я в’я г’я ґ’я д’я ж’я з’я к’я л’я м’я н’я п’я р’я с’я т’я ф’я х’я ц’я ч’я ш’я щ’я"; + let lat = "b'ja v'ja ğ'ja g'ja d'ja ž'ja z'ja k'ja l'ja m'ja n'ja p'ja r'ja s'ja t'ja f'ja x'ja c'ja č'ja š'ja ŝ'ja"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t7_default() { + let cyr = "б’я в’я г’я ґ’я д’я ж’я з’я к’я л’я м’я н’я п’я р’я с’я т’я ф’я х’я ц’я ч’я ш’я щ’я"; + let lat = "b'ja v'ja ğ'ja g'ja d'ja ž'ja z'ja k'ja l'ja m'ja n'ja p'ja r'ja s'ja t'ja f'ja x'ja c'ja č'ja š'ja ŝ'ja"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t8() { + let cyr = "бй бйо вй гй ґй дй жй зй кй лй мй нй пй рй сй тй фй хй цй чй шй щй"; + let lat = + "b'j b'jo v'j ğ'j g'j d'j ž'j z'j k'j l'j m'j n'j p'j r'j s'j t'j f'j x'j c'j č'j š'j ŝ'j"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t8_default() { + let cyr = "бй бйо вй гй ґй дй жй зй кй лй мй нй пй рй сй тй фй хй цй чй шй щй"; + let lat = + "b'j b'jo v'j ğ'j g'j d'j ž'j z'j k'j l'j m'j n'j p'j r'j s'j t'j f'j x'j c'j č'j š'j ŝ'j"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t9() { + let cyr = "ня ньа н’я нь'н ньн"; + let lat = "nja nj'a n'ja nj'n njn"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t9_default() { + let cyr = "ня ньа н’я нь'н ньн"; + let lat = "nja nj'a n'ja nj'n njn"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t10() { + let cyr = "рос дыня 
эзёдынъ. бр кроў."; + let lat = "ros dȳnja ēzödȳnǒ. br kroŭ."; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t10_default() { + let cyr = "рос дыня эзёдынъ. бр кроў."; + let lat = "ros dȳnja ēzödȳnǒ. br kroŭ."; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t11() { + let cyr = "А́ а́ Е́ е́ Є́ є́ И́ и́ І́ і́ Ї́ ї́ О́ о́ У́ у́ Ю́ ю́ Я́ я́"; + let lat = "Á á É é JÉ jé Ý ý Í í Ḯ ḯ Ó ó Ú ú JÚ jú JÁ já"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t11_default() { + let cyr = "А́ а́ Е́ е́ Є́ є́ И́ и́ І́ і́ Ї́ ї́ О́ о́ У́ у́ Ю́ ю́ Я́ я́"; + let lat = "Á á É é JÉ jé Ý ý Í í Ḯ ḯ Ó ó Ú ú JÚ jú JÁ já"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t12() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Jés sJés jés sjés Ḯs sḮs ḯs sḯs Jús sJús jús sjús Jás sJás jás sjás"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t12_default() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Jés sJés jés sjés Ḯs sḮs ḯs sḯs Jús sJús jús sjús Jás sJás jás sjás"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t13() { + let cyr = "' ім’я 'жук' \"жук\" ' '"; + let lat = "' im'ja 'žuk' \"žuk\" ' '"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn 
dstu9112a_t13_default() { + let cyr = "' ім’я 'жук' \"жук\" ' '"; + let lat = "' im'ja 'žuk' \"žuk\" ' '"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t14() { + let cyr = "Сонце світить майже білим світлом, однак через сильніше розсіювання і поглинання короткохвильової частини спектра атмосферою Землі пряме світло Сонця біля поверхні нашої планети набуває певного жовтого відтінку. Якщо небо ясне, то блакитний відтінок розсіяного світла складається з жовтуватим прямим сонячним світлом і загальне освітлення об’єктів на Землі стає білим."; + let lat = "Sonce svitytj majže bilym svitlom, odnak čerez syljniše rozsijuvannja i poğlynannja korotkoxvyljovoï častyny spektra atmosferoju Zemli prjame svitlo Soncja bilja poverxni našoï planety nabuvaje pevnoğo žovtoğo vidtinku. Jakŝo nebo jasne, to blakytnyj vidtinok rozsijanoğo svitla skladajetjsja z žovtuvatym prjamym sonjačnym svitlom i zağaljne osvitlennja ob'jektiv na Zemli staje bilym."; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t14_default() { + let cyr = "Сонце світить майже білим світлом, однак через сильніше розсіювання і поглинання короткохвильової частини спектра атмосферою Землі пряме світло Сонця біля поверхні нашої планети набуває певного жовтого відтінку. Якщо небо ясне, то блакитний відтінок розсіяного світла складається з жовтуватим прямим сонячним світлом і загальне освітлення об’єктів на Землі стає білим."; + let lat = "Sonce svitytj majže bilym svitlom, odnak čerez syljniše rozsijuvannja i poğlynannja korotkoxvyljovoï častyny spektra atmosferoju Zemli prjame svitlo Soncja bilja poverxni našoï planety nabuvaje pevnoğo žovtoğo vidtinku. 
Jakŝo nebo jasne, to blakytnyj vidtinok rozsijanoğo svitla skladajetjsja z žovtuvatym prjamym sonjačnym svitlom i zağaljne osvitlennja ob'jektiv na Zemli staje bilym."; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t22() { + let cyr = "дуб!дуб\"дуб#дуб$дуб%дуб&дуб'дуб(дуб)дуб*дуб+дуб,дуб-дуб.дуб/дуб:дуб;дуб<дуб=дуб>дуб?дуб@дуб[дуб\\дуб]дуб^дуб_дуб`дуб{дуб|дуб}дуб~дуб"; + let lat = "dub!dub\"dub#dub$dub%dub&dub'dub(dub)dub*dub+dub,dub-dub.dub/dub:dub;dub<dub=dub>dub?dub@dub[dub\\dub]dub^dub_dub`dub{dub|dub}dub~dub"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t22_default() { + let cyr = "дуб!дуб\"дуб#дуб$дуб%дуб&дуб'дуб(дуб)дуб*дуб+дуб,дуб-дуб.дуб/дуб:дуб;дуб<дуб=дуб>дуб?дуб@дуб[дуб\\дуб]дуб^дуб_дуб`дуб{дуб|дуб}дуб~дуб"; + let lat = "dub!dub\"dub#dub$dub%dub&dub'dub(dub)dub*dub+dub,dub-dub.dub/dub:dub;dub<dub=dub>dub?dub@dub[dub\\dub]dub^dub_dub`dub{dub|dub}dub~dub"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t23() { + let cyr = "бод бод\tбод\nбод\rбод"; + let lat = "bod bod\tbod\nbod\rbod"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t23_default() { + let cyr = "бод бод\tбод\nбод\rбод"; + let lat = "bod bod\tbod\nbod\rbod"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t24() { + let cyr = "об😎нап😘неп😭нєп🧐нїп😍нюп😀няп"; + let lat = "ob😎nap😘nep😭njep🧐nïp😍njup😀njap"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112A); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t24_default() { + let cyr = 
"об😎нап😘неп😭нєп🧐нїп😍нюп😀няп"; + let lat = "ob😎nap😘nep😭njep🧐nïp😍njup😀njap"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); + let t = decode(lat, Table::default()); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112a_t15() { + let cyr = "в’я в'я"; + let lat = "v'ja v'ja"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); +} + +#[test] +fn dstu9112a_t15_default() { + let cyr = "в’я в'я"; + let lat = "v'ja v'ja"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); +} + +#[test] +fn dstu9112a_t16() { + let cyr = "Ї ї Й й Ё ё Ў ў"; + let lat = "Ï ï J j Ö ö Ŭ ŭ"; + let q = encode(cyr, Table::Dstu9112A); + assert_eq!(q, lat); +} + +#[test] +fn dstu9112a_t16_default() { + let cyr = "Ї ї Й й Ё ё Ў ў"; + let lat = "Ï ï J j Ö ö Ŭ ŭ"; + let q = encode(cyr, Table::default()); + assert_eq!(q, lat); +} + +#[test] +fn dstu9112a_t17() { + let cyr = "я є ю"; + let lat = "jA jE jU"; + let q = decode(lat, Table::Dstu9112A); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t17_default() { + let cyr = "я є ю"; + let lat = "jA jE jU"; + let q = decode(lat, Table::default()); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t18() { + let cyr = "Ї ї Ь ь Ч ч Г г Щ щ Ш ш Ж ж"; + let lat = "Ï ï Ĵ ĵ Č č Ğ ğ Ŝ ŝ Š š Ž ž"; + let q = decode(lat, Table::Dstu9112A); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t18_default() { + let cyr = "Ї ї Ь ь Ч ч Г г Щ щ Ш ш Ж ж"; + let lat = "Ï ï Ĵ ĵ Č č Ğ ğ Ŝ ŝ Š š Ž ž"; + let q = decode(lat, Table::default()); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t19() { + let cyr = "Ё ё Ў ў Ъ ъ Ы ы Э э"; + let lat = "Ö ö Ŭ ŭ Ǒ ǒ Ȳ ȳ Ē ē"; + let q = decode(lat, Table::Dstu9112A); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t19_default() { + let cyr = "Ё ё Ў ў Ъ ъ Ы ы Э э"; + let lat = "Ö ö Ŭ ŭ Ǒ ǒ Ȳ ȳ Ē ē"; + let q = decode(lat, Table::default()); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t20() { + let cyr = "А́ а́ Е́ е́ Є́ Є́ є́ є́ И́ и́ І́ і́ Ї́ ї́ О́ о́ У́ у́ Ю́ Ю́ ю́ ю́ Я́ Я́ я́ я́"; + let lat = "Á á 
É é JÉ Jé jÉ jé Ý ý Í í Ḯ ḯ Ó ó Ú ú JÚ Jú jÚ jú JÁ Já jÁ já"; + let q = decode(lat, Table::Dstu9112A); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t20_default() { + let cyr = "А́ а́ Е́ е́ Є́ Є́ є́ є́ И́ и́ І́ і́ Ї́ ї́ О́ о́ У́ у́ Ю́ Ю́ ю́ ю́ Я́ Я́ я́ я́"; + let lat = "Á á É é JÉ Jé jÉ jé Ý ý Í í Ḯ ḯ Ó ó Ú ú JÚ Jú jÚ jú JÁ Já jÁ já"; + let q = decode(lat, Table::default()); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t21() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Jés sJés jés sjés Ḯs sḮs ḯs sḯs Jús sJús jús sjús Jás sJás jás sjás"; + let q = decode(lat, Table::Dstu9112A); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112a_t21_default() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Jés sJés jés sjés Ḯs sḮs ḯs sḯs Jús sJús jús sjús Jás sJás jás sjás"; + let q = decode(lat, Table::default()); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112b_t1() { + let cyr = "Україна, Хмельницький"; + let lat = "Ukrajina, Khmeljnycjkyj"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t2() { + let cyr = "Щастям б’єш жук їх глицю в фон й ґедзь пріч."; + let lat = "Shchastjam b'jesh zhuk jikh ghlycju v fon j gedzj prich."; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t3() { + let cyr = "ь Ь ль льє льї лью лья лье льі льу льа льйо льо"; + let lat = "hj Hj lj ljje ljji ljju ljja lj'e lj'i lj'u lj'a ljjo ljo"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t4() { + let cyr = "Єл Їл Юл Ял"; + let lat = "Jel Jil Jul Jal"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} 
+ +#[test] +fn dstu9112b_t5() { + let cyr = "бь вь гь ґь дь жь зь кь ль мь нь пь рь сь ть фь хь ць чь шь щь"; + let lat = "bj vj ghj gj dj zhj zj kj lj mj nj pj rj sj tj fj khj cj chj shj shchj"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t6() { + let cyr = "бя вя гя ґя дя жя зя кя ля мя ня пя ря ся тя фя хя ця чя шя щя"; + let lat = "bja vja ghja gja dja zhja zja kja lja mja nja pja rja sja tja fja khja cja chja shja shchja"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t7() { + let cyr = "б’я в’я г’я ґ’я д’я ж’я з’я к’я л’я м’я н’я п’я р’я с’я т’я ф’я х’я ц’я ч’я ш’я щ’я"; + let lat = "b'ja v'ja gh'ja g'ja d'ja zh'ja z'ja k'ja l'ja m'ja n'ja p'ja r'ja s'ja t'ja f'ja kh'ja c'ja ch'ja sh'ja shch'ja"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t8() { + let cyr = "бй бйо вй гй ґй дй жй зй кй лй мй нй пй рй сй тй фй хй цй чй шй щй"; + let lat = "b'j b'jo v'j gh'j g'j d'j zh'j z'j k'j l'j m'j n'j p'j r'j s'j t'j f'j kh'j c'j ch'j sh'j shch'j"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t9() { + let cyr = "ня ньа н’я нь'н ньн"; + let lat = "nja nj'a n'ja nj'n njn"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t10() { + let cyr = "рос дыня эзёдынъ. бр кроў."; + let lat = "ros dywnja ehwzjowdywnoh. 
br krouh."; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t11() { + let cyr = "А́ а́ Е́ е́ Є́ є́ И́ и́ І́ і́ Ї́ ї́ О́ о́ У́ у́ Ю́ ю́ Я́ я́"; + let lat = "Á á É é JÉ jé Ý ý Í í JÍ jí Ó ó Ú ú JÚ jú JÁ já"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t12() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Jés sJés jés sjés Jís sJís jís sjís Jús sJús jús sjús Jás sJás jás sjás"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t13() { + let cyr = "' ім’я 'жук' \"жук\" ' '"; + let lat = "' im'ja 'zhuk' \"zhuk\" ' '"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t14() { + let cyr = "Сонце світить майже білим світлом, однак через сильніше розсіювання і поглинання короткохвильової частини спектра атмосферою Землі пряме світло Сонця біля поверхні нашої планети набуває певного жовтого відтінку. Якщо небо ясне, то блакитний відтінок розсіяного світла складається з жовтуватим прямим сонячним світлом і загальне освітлення об’єктів на Землі стає білим."; + let lat = "Sonce svitytj majzhe bilym svitlom, odnak cherez syljnishe rozsijuvannja i poghlynannja korotkokhvyljovoji chastyny spektra atmosferoju Zemli prjame svitlo Soncja bilja poverkhni nashoji planety nabuvaje pevnogho zhovtogho vidtinku. 
Jakshcho nebo jasne, to blakytnyj vidtinok rozsijanogho svitla skladajetjsja z zhovtuvatym prjamym sonjachnym svitlom i zaghaljne osvitlennja ob'jektiv na Zemli staje bilym."; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t20() { + let cyr = "дуб!дуб\"дуб#дуб$дуб%дуб&дуб'дуб(дуб)дуб*дуб+дуб,дуб-дуб.дуб/дуб:дуб;дуб<дуб=дуб>дуб?дуб@дуб[дуб\\дуб]дуб^дуб_дуб`дуб{дуб|дуб}дуб~дуб"; + let lat = "dub!dub\"dub#dub$dub%dub&dub'dub(dub)dub*dub+dub,dub-dub.dub/dub:dub;dub<dub=dub>dub?dub@dub[dub\\dub]dub^dub_dub`dub{dub|dub}dub~dub"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t21() { + let cyr = "бод бод\tбод\nбод\rбод"; + let lat = "bod bod\tbod\nbod\rbod"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t22() { + let cyr = "об😎нап😘неп😭нєп🧐нїп😍нюп😀няп"; + let lat = "ob😎nap😘nep😭njep🧐njip😍njup😀njap"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); + let t = decode(lat, Table::Dstu9112B); + assert_eq!(t, cyr); +} + +#[test] +fn dstu9112b_t15() { + let cyr = "в’я в'я"; + let lat = "v'ja v'ja"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); +} + +#[test] +fn dstu9112b_t16() { + let cyr = "Ї ї Й й Ё ё Ў ў"; + let lat = "JI ji J j JOW jow UH uh"; + let q = encode(cyr, Table::Dstu9112B); + assert_eq!(q, lat); +} + +#[test] +fn dstu9112b_t17() { + let cyr = "я ї є ю г ж х щ ш ч ь"; + let lat = "jA jI jE jU gH zH kH sHcH sH cH hJ"; + let q = decode(lat, Table::Dstu9112B); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112b_t18() { + let cyr = "А́ а́ Е́ е́ Є́ Є́ є́ є́ И́ и́ І́ і́ Ї́ Ї́ ї́ ї́ О́ о́ У́ у́ Ю́ Ю́ ю́ ю́ Я́ Я́ я́ я́"; + let lat = "Á á É é JÉ Jé jÉ jé Ý ý Í í JÍ Jí jÍ jí Ó ó Ú ú JÚ Jú jÚ jú JÁ Já jÁ já"; + let q = decode(lat, 
Table::Dstu9112B); + assert_eq!(q, cyr); +} + +#[test] +fn dstu9112b_t19() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Jés sJés jés sjés Jís sJís jís sjís Jús sJús jús sjús Jás sJás jás sjás"; + let q = decode(lat, Table::Dstu9112B); + assert_eq!(q, cyr); +} + +#[test] +fn kmu55_t1() { + let cyr = "Україна, Хмельницький"; + let lat = "Ukraina, Khmelnytskyi"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t2() { + let cyr = "Щастям б’єш жук їх глицю в фон й ґедзь пріч."; + let lat = "Shchastiam biesh zhuk yikh hlytsiu v fon y gedz prich."; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t3() { + let cyr = "згин зГ зГин Згин Зг ЗГ ЗГИН"; + let lat = "zghyn zGH zGhyn Zghyn Zgh ZGH ZGHYN"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t4() { + let cyr = "ь Ь ль льє льї лью лья лье льі льу льа льйо льо"; + let lat = " l lie li liu lia le li lu la lio lo"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t5() { + let cyr = "Єл Їл Юл Ял"; + let lat = "Yel Yil Yul Yal"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t6() { + let cyr = "бь вь гь ґь дь жь зь кь ль мь нь пь рь сь ть фь хь ць чь шь щь"; + let lat = "b v h g d zh z k l m n p r s t f kh ts ch sh shch"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t7() { + let cyr = "бя вя гя ґя дя жя зя кя ля мя ня пя ря ся тя фя хя ця чя шя щя"; + let lat = "bia via hia gia dia zhia zia kia lia mia nia pia ria sia tia fia khia tsia chia shia shchia"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t8() { + let cyr = "б’я в’я г’я ґ’я д’я ж’я з’я к’я л’я м’я н’я п’я р’я с’я т’я ф’я х’я ц’я ч’я ш’я щ’я"; + let lat = "bia via hia gia dia zhia zia kia lia mia nia pia ria sia tia fia khia tsia chia shia shchia"; + let q = 
encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t9() { + let cyr = "бй бйо вй гй ґй дй жй зй кй лй мй нй пй рй сй тй фй хй цй чй шй щй"; + let lat = "bi bio vi hi gi di zhi zi ki li mi ni pi ri si ti fi khi tsi chi shi shchi"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t10() { + let cyr = "А́ а́ Е́ е́ Є́ є́ И́ и́ І́ і́ Ї́ ї́ О́ о́ У́ у́ Ю́ ю́ Я́ я́"; + let lat = "Á á É é YÉ yé Ý ý Í í YÍ yí Ó ó Ú ú YÚ yú YÁ yá"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t11() { + let cyr = "Є́с сЄ́с є́с сє́с Ї́с сЇ́с ї́с сї́с Ю́с сЮ́с ю́с сю́с Я́с сЯ́с я́с ся́с"; + let lat = "Yés sIés yés siés Yís sÍs yís sís Yús sIús yús siús Yás sIás yás siás"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t12() { + let cyr = "' ім’я 'жук' \"жук\" ' '"; + let lat = "' imia 'zhuk' \"zhuk\" ' '"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t13() { + let cyr = "Сонце світить майже білим світлом, однак через сильніше розсіювання і поглинання короткохвильової частини спектра атмосферою Землі пряме світло Сонця біля поверхні нашої планети набуває певного жовтого відтінку. Якщо небо ясне, то блакитний відтінок розсіяного світла складається з жовтуватим прямим сонячним світлом і загальне освітлення об’єктів на Землі стає білим."; + let lat = "Sontse svityt maizhe bilym svitlom, odnak cherez sylnishe rozsiiuvannia i pohlynannia korotkokhvylovoi chastyny spektra atmosferoiu Zemli priame svitlo Sontsia bilia poverkhni nashoi planety nabuvaie pevnoho zhovtoho vidtinku. 
Yakshcho nebo yasne, to blakytnyi vidtinok rozsiianoho svitla skladaietsia z zhovtuvatym priamym soniachnym svitlom i zahalne osvitlennia obiektiv na Zemli staie bilym."; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t14() { + let cyr = "в’я в'я"; + let lat = "via via"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t15() { + let cyr = "дуб!дуб\"дуб#дуб$дуб%дуб&дуб'дуб(дуб)дуб*дуб+дуб,дуб-дуб.дуб/дуб:дуб;дуб<дуб=дуб>дуб?дуб@дуб[дуб\\дуб]дуб^дуб_дуб`дуб{дуб|дуб}дуб~дуб"; + let lat = "dub!dub\"dub#dub$dub%dub&dubdub(dub)dub*dub+dub,dub-dub.dub/dub:dub;dub<dub=dub>dub?dub@dub[dub\\dub]dub^dub_dub`dub{dub|dub}dub~dub"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t16() { + let cyr = "бод бод\tбод\nбод\rбод"; + let lat = "bod bod\tbod\nbod\rbod"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +fn kmu55_t17() { + let cyr = "об😎нап😘неп😭нєп🧐нїп😍нюп😀няп"; + let lat = "ob😎nap😘nep😭niep🧐nip😍niup😀niap"; + let q = encode(cyr, Table::Kmu55); + assert_eq!(q, lat); +} + +#[test] +#[should_panic] +fn kmu55_decode_panic() { + decode(" ", Table::Kmu55); +} diff --git a/tools/Makefile b/tools/Makefile index 6f0f28b..28402f3 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,19 +1,10 @@ -.PHONY: all -.PHONY: c -.PHONY: csharp -.PHONY: elixir -.PHONY: go -.PHONY: java -.PHONY: js -.PHONY: php -.PHONY: py -.PHONY: ruby -.PHONY: swift +.PHONY: all c csharp elixir go java js php py ruby rust swift PYTHON ?= python PROJECT_ROOT ?= .. +FORMAT ?= 1 -all: c csharp elixir go js php py java ruby swift +all: c csharp elixir go java js php py ruby rust swift c: @echo "/* Generated by gentables.py, do not edit. 
*/\n" > "$(PROJECT_ROOT)/c/_tables.c" @@ -32,14 +23,18 @@ elixir: @$(PYTHON) gentables.py elixir >> "$(PROJECT_ROOT)/elixir/lib/paiv/ukrainian_latin.ex" @echo "# Generated by gentests.py, do not edit.\n" > "$(PROJECT_ROOT)/elixir/test/paiv/ukrainian_latin_test.exs" @$(PYTHON) gentests.py elixir >> "$(PROJECT_ROOT)/elixir/test/paiv/ukrainian_latin_test.exs" +ifneq ($(FORMAT),0) @(cd $(PROJECT_ROOT)/elixir && mix format) +endif go: @echo "/* Generated by gentables.py, do not edit. */\n" > "$(PROJECT_ROOT)/go/uklatn/uklatn.go" @$(PYTHON) gentables.py go >> "$(PROJECT_ROOT)/go/uklatn/uklatn.go" @echo "/* Generated by gentests.py, do not edit. */\n" > "$(PROJECT_ROOT)/go/uklatn/uklatn_test.go" @$(PYTHON) gentests.py go >> "$(PROJECT_ROOT)/go/uklatn/uklatn_test.go" +ifneq ($(FORMAT),0) @(cd $(PROJECT_ROOT)/go/uklatn && go fmt -x) +endif java: @echo "/* Generated by gentables.py, do not edit. */\n" > "$(PROJECT_ROOT)/java/src/main/java/io/github/paiv/uklatn/UkrainianLatin.java" @@ -69,6 +64,15 @@ ruby: @echo "# Generated by gentests.py, do not edit.\n" > "$(PROJECT_ROOT)/ruby/test/uklatn/test_uklatn.rb" @$(PYTHON) gentests.py ruby >> "$(PROJECT_ROOT)/ruby/test/uklatn/test_uklatn.rb" +rust: + @echo "// Generated by gentables.py, do not edit.\n" > "$(PROJECT_ROOT)/rust/src/lib.rs" + @$(PYTHON) gentables.py rust >> "$(PROJECT_ROOT)/rust/src/lib.rs" + @echo "// Generated by gentests.py, do not edit.\n" > "$(PROJECT_ROOT)/rust/tests/uklatn_test.rs" + @$(PYTHON) gentests.py rust >> "$(PROJECT_ROOT)/rust/tests/uklatn_test.rs" +ifneq ($(FORMAT),0) + @(cd $(PROJECT_ROOT)/rust && cargo fmt) +endif + swift: @echo "/* Generated by gentables.py, do not edit. 
*/\n" > "$(PROJECT_ROOT)/swift/Sources/UkrainianLatin/UKLatn.swift" @$(PYTHON) gentables.py swift >> "$(PROJECT_ROOT)/swift/Sources/UkrainianLatin/UKLatn.swift" diff --git a/tools/gen/gen_rust.py b/tools/gen/gen_rust.py new file mode 100755 index 0000000..f4a2f07 --- /dev/null +++ b/tools/gen/gen_rust.py @@ -0,0 +1,338 @@ +import json +import logging +import re +from pathlib import Path +import template + + +logger = logging.getLogger(Path(__file__).stem) + + +def gen_tests(fns, default_table): + def _parse_tests(fn): + def parse_kind(s): + match s.lower().split(): + case ['cyr', '<>', 'lat']: return 'c2lr' + case ['lat', '<>', 'cyr']: return 'l2cr' + case ['cyr', '>', 'lat']: return 'c2l' + case ['lat', '>', 'cyr']: return 'l2c' + case _: + raise Exception(f'unknown test kind: {s!r}') + with fn.open() as fp: + data = json.load(fp) + return [[parse_kind(obj['test']), obj['cyr'], obj['lat']] for obj in data] + + def table_name(s): + return re.sub(r'test_', '', s) + def _j(s): + return json.dumps(s, ensure_ascii=False) + def camel(s): + return ''.join(s.title() for s in re.findall(r'[A-Za-z]+|[0-9]+', s)) + + def _emit_testdata(kind, data, table): + spl = '''\ + [ + &cyr, + &lat + ], + ''' + for cyr, lat in data: + yield template.format(spl, cyr=_j(cyr), lat=_j(lat)+'\n') + + def _emit_tests(kind, table): + if kind[0] == 'c': + yield f'let q = encode(cyr, Table::{table});\n' + yield 'assert_eq!(q, lat);\n' + else: + yield f'let q = decode(lat, Table::{table});\n' + yield 'assert_eq!(q, cyr);\n' + if kind[-1] == 'r': + if kind[0] == 'c': + yield f'let t = decode(lat, Table::{table});\n' + yield 'assert_eq!(t, cyr);\n' + else: + yield f'let t = encode(cyr, Table::{table});\n' + yield 'assert_eq!(t, lat);\n' + + def _emit_tests_default(kind): + if kind[0] == 'c': + yield 'let q = encode(cyr, Table::default());\n' + yield 'assert_eq!(q, lat);\n' + else: + yield 'let q = decode(lat, Table::default());\n' + yield 'assert_eq!(q, cyr);\n' + if kind[-1] == 'r': + if kind[0] 
== 'c': + yield 'let t = decode(lat, Table::default());\n' + yield 'assert_eq!(t, cyr);\n' + else: + yield 'let t = encode(cyr, Table::default());\n' + yield 'assert_eq!(t, lat);\n' + + def _emit_testset(data, table): + tpl = ''' + #[test] + fn &{table}_t&{tid}&ex() { + let cyr = &cyr; + let lat = &lat; + &tests + } + ''' + cname = camel(table) + lname = cname.lower() + for kind in ('c2lr', 'l2cr', 'c2l', 'l2c'): + xs = [(i,cyr,lat) for i,(k,cyr,lat) in enumerate(data, 1) if k == kind] + if not xs: continue + ctx = dict(table=lname, kind=kind) + for tid, cyr, lat in xs: + ctx['tests'] = _emit_tests(kind, cname) + yield template.format(tpl, ctx, tid=tid, ex='', cyr=_j(cyr), lat=_j(lat)) + if table == default_table: + ctx['tests'] = _emit_tests_default(kind) + yield template.format(tpl, ctx, tid=tid, ex='_default', cyr=_j(cyr), lat=_j(lat)) + + if data and all(k == 'c2l' for k,_,_ in data): + tpl = ''' + #[test] + #[should_panic] + fn &{lname}_decode_panic() { + decode(" ", Table::&cname); + } + ''' + yield template.format(tpl, cname=cname, lname=lname) + + def _test_cases(): + for fn in fns: + logger.info(f'processing {fn!s}') + name = fn.stem + table = table_name(name) + data = _parse_tests(fn) + yield from _emit_testset(data, table) + + context = dict() + context['test_cases'] = _test_cases + + tpl = '''\ + use {uklatn::decode, uklatn::encode, uklatn::Table}; + &{test_cases} + ''' + text = template.format(tpl, context) + return text + + +def gen_transforms(fns, default_table=None): + def table_name(s): + s, = re.findall(r'uk_Latn_(.*?)(?:-uk)?\s*$', s, flags=re.I) + return s.replace('-', '_') + def _isdec(s): + return s.startswith('uk_Latn_') + def _j(s): + return json.dumps(s, ensure_ascii=False) + def camel(s): + return ''.join(s.title() for s in re.findall(r'[A-Za-z]+|[0-9]+', s)) + + def _load_rules(data): + return [s if isinstance(s, str) else [ + '|'.join(r['regex'] for r in s), + [r['map'] for r in s] + ] for s in data] + + def _emit_trrules(rules): + tpl = 
'''\ + static RX&sid: Lazy<Regex> = Lazy::new(|| { + let rx: &str = r"&rx"; + Regex::new(rx).unwrap() + }); + ''' + mpl = '''\ + static M&sid&mid: &[(&str, &str); &mn] = &[ + &mappi + ]; + ''' + kvl = '(&k, &v),\n' + qpl = ''' + let tr&sid = |caps: &Captures| -> String { + if let Some(m) = caps.get(1) { + let s = m.as_str(); + for p in M&{sid}1 { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + &mappi + } else { + caps[0].to_string() + } + }; + ''' + rtl = '''\ + } else if let Some(m) = caps.get(&mid) { + let s = m.as_str(); + for p in M&sid&mid { + if p.0 == s { + return p.1.to_string(); + } + } + return s.to_string(); + ''' + def _trs(maps): + for mid in range(2, len(maps)+1): + yield template.format(rtl, sid=sid, mid=mid) + for sid, section in enumerate(rules): + if not isinstance(section, str): + rx, maps = section + yield template.format(tpl, sid=sid, rx=rx) + for mid, d in enumerate(maps, 1): + ms = (template.format(kvl, k=_j(k), v=_j(v)) for k,v in d.items()) + yield template.format(mpl, sid=sid, mid=mid, mn=len(d), mappi=ms) + yield template.format(qpl, sid=sid, mappi=_trs(maps)) + + def _emit_trbody(rules): + for sid, section in enumerate(rules): + if isinstance(section, str): + if section not in ('NFC', 'NFD', 'NFKC', 'NFKD'): + raise Exception(f'invalid transform: {section!r}') + yield f'let text = text.{section.lower()}().collect::<String>();\n' + else: + yield f'let text = RX{sid}.replace_all(&text, tr{sid});' + + def _emit_tr(cname, verb, rules): + ctx = dict(cname=cname, verb=verb, lname=cname.lower()) + ctx['trrules'] = _emit_trrules(rules) + ctx['trbody'] = _emit_trbody(rules) + tpl = ''' + fn &{verb}_&{lname}(text: &str) -> String { + &trrules + &trbody + text + } + ''' + return template.format(tpl, ctx) + + tables = dict() + for fn in fns: + logger.info(f'processing {fn!s}') + with fn.open() as fp: + rules = json.load(fp) + rules = _load_rules(rules) + name = fn.stem + table = table_name(name) + cname = camel(table) + if table not in 
tables: + tables[table] = [None, None] + tables[table][_isdec(name)] = (cname, rules) + + def _emit_tables(): + for ar in [0,1]: + for table, codec in tables.items(): + if codec[ar] is not None: + cname, rules = codec[ar] + verb = ['encode', 'decode'][ar] + yield _emit_tr(cname, verb, rules) + + def _emit_match_tables(verb): + ar = ['encode', 'decode'].index(verb) + for table, codec in tables.items(): + if codec[ar] is not None: + cname, _ = codec[ar] + lname = cname.lower() + yield f'Table::{cname} => {verb}_{lname}(text),' + else: + cname = camel(table) + yield f'Table::{cname} => panic!("invalid table {{:?}}", table),' + + tdoc = { + 'DSTU_9112_A': 'DSTU 9112:2021 System A', + 'DSTU_9112_B': 'DSTU 9112:2021 System B', + 'KMU_55': 'KMU 55:2010, not reversible', + } + def _emit_tenum(): + for i, t in enumerate(tables, 1): + if (doc := tdoc.get(t, '')): + yield f'/// {doc}\n' + if t == default_table: + yield '#[default]' + n = camel(t) + yield f'{n} = {i},\n' + + context = dict() + context['tables_enum'] = _emit_tenum + context['global_tables'] = _emit_tables + context['match_encode'] = _emit_match_tables('encode') + context['match_decode'] = _emit_match_tables('decode') + + tpl = '''\ +//! Ukrainian Cyrillic transliteration to and from Latin script. +//! +//! Tables: +//! - DSTU 9112:2021 System A +//! - DSTU 9112:2021 System B +//! - KMU 55:2010, not reversible +//! +//! # Examples +//! ``` +//! let s = uklatn::encode("Доброго вечора!", uklatn::Table::default()); +//! assert_eq!(s, "Dobroğo večora!"); +//! ``` +//! ``` +//! let s = uklatn::decode("Paljanycja", uklatn::Table::default()); +//! assert_eq!(s, "Паляниця"); +//! ``` +//! +//! Select a transliteration scheme: +//! ``` +//! let s = uklatn::encode("Борщ", uklatn::Table::Dstu9112B); +//! assert_eq!(s, "Borshch"); +//! ``` +//! 
+use { + once_cell::sync::Lazy, fancy_regex::Captures, fancy_regex::Regex, + unicode_normalization::UnicodeNormalization, +}; + +#[derive(Default, Debug)] +pub enum Table { + &{tables_enum} +} + + +/// Transliterates a string of Ukrainian Cyrillic to Latin script. +/// +/// # Examples +/// ``` +/// let s = uklatn::encode("Доброго вечора!", uklatn::Table::default()); +/// assert_eq!(s, "Dobroğo večora!"); +/// ``` +/// ``` +/// let s = uklatn::encode("Шевченко", uklatn::Table::Kmu55); +/// assert_eq!(s, "Shevchenko"); +/// ``` +pub fn encode(text: &str, table: Table) -> String { + match table { + &{match_encode} + } +} + +/// Re-transliterates a string of Ukrainian Latin to Cyrillic script. +/// +/// # Examples +/// ``` +/// let s = uklatn::decode("Paljanycja", uklatn::Table::default()); +/// assert_eq!(s, "Паляниця"); +/// ``` +/// ``` +/// let s = uklatn::decode("Shevchenko", uklatn::Table::Dstu9112B); +/// assert_eq!(s, "Шевченко"); +/// ``` +/// +pub fn decode(text: &str, table: Table) -> String { + match table { + &{match_decode} + } +} +&{global_tables} +''' + text = template.format(tpl, context) + return text + diff --git a/tools/gentables.py b/tools/gentables.py index 2c45bb7..85621cb 100755 --- a/tools/gentables.py +++ b/tools/gentables.py @@ -128,6 +128,17 @@ def gen_ruby(src): logger.info('Ruby generator end') +def gen_rust(src): + logger.info('Rust generator start') + from gen import gen_rust + + source = _basegen(args, 'src/regex', 'uk*.json', gen_rust.gen_transforms) + for text in source: + print(text, end='') + + logger.info('Rust generator end') + + def gen_swift(src): logger.info('Swift generator start') from gen import gen_swift @@ -181,6 +192,10 @@ def gen_swift(src): parse_ruby.add_argument('source', nargs='*', help='source directory') parse_ruby.set_defaults(func=gen_ruby) + parse_rust = subpar.add_parser('rust', help='Rust code generator') + parse_rust.add_argument('source', nargs='*', help='source directory') + 
parse_rust.set_defaults(func=gen_rust) + parse_swift = subpar.add_parser('swift', help='Swift code generator') parse_swift.add_argument('source', nargs='*', help='source directory') parse_swift.set_defaults(func=gen_swift) diff --git a/tools/gentests.py b/tools/gentests.py index d679db6..1c3db49 100755 --- a/tools/gentests.py +++ b/tools/gentests.py @@ -128,6 +128,17 @@ def gen_ruby(args): logger.info('Ruby generator end') +def gen_rust(args): + logger.info('Rust generator start') + from gen import gen_rust + + source = _basegen(args, 'src/tests', 'test*.json', gen_rust.gen_tests) + for text in source: + print(text, end='') + + logger.info('Rust generator end') + + def gen_swift(args): logger.info('Swift generator start') from gen import gen_swift @@ -181,6 +192,10 @@ def gen_swift(args): parse_ruby.add_argument('source', nargs='*', help='source directory') parse_ruby.set_defaults(func=gen_ruby) + parse_rust = subpar.add_parser('rust', help='Rust code generator') + parse_rust.add_argument('source', nargs='*', help='source directory') + parse_rust.set_defaults(func=gen_rust) + parse_swift = subpar.add_parser('swift', help='Swfit code generator') parse_swift.add_argument('source', nargs='*', help='source directory') parse_swift.set_defaults(func=gen_swift)