diff --git a/.github/workflows/test-main-lib.yml b/.github/workflows/test-main-lib.yml
index 553ea21..fef1d83 100644
--- a/.github/workflows/test-main-lib.yml
+++ b/.github/workflows/test-main-lib.yml
@@ -1,4 +1,4 @@
-name: Test main lib (Rust crate)
+name: Test main lib
 on:
   push:
     branches:
@@ -19,8 +19,6 @@ on:
 
 jobs:
   test:
-    runs-on: ${{ matrix.os }}
-
     strategy:
       fail-fast: false
       matrix:
@@ -30,6 +28,8 @@ jobs:
          - os: windows-latest
            bitness: 32
 
+    runs-on: ${{ matrix.os }}
+
     steps:
       - name: Checkout source code
        uses: actions/checkout@v4
diff --git a/.github/workflows/test-nlpo3-cli.yml b/.github/workflows/test-nlpo3-cli.yml
index ccd78ed..2c33c2d 100644
--- a/.github/workflows/test-nlpo3-cli.yml
+++ b/.github/workflows/test-nlpo3-cli.yml
@@ -1,4 +1,4 @@
-name: Test nlpo3-cli (command line)
+name: Test nlpo3-cli
 on:
   push:
     branches:
@@ -17,8 +17,6 @@ defaults:
 
 jobs:
   test:
-    runs-on: ${{ matrix.os }}
-
     strategy:
       fail-fast: false
       matrix:
@@ -28,9 +26,11 @@ jobs:
          - os: windows-latest
            bitness: 32
 
+    runs-on: ${{ matrix.os }}
+
     steps:
       - name: Checkout source code
-        uses: actions/checkout@master
+        uses: actions/checkout@v4
 
       - name: Setup Rust toolchain - non-win32
        uses: actions-rs/toolchain@v1
diff --git a/.github/workflows/test-nlpo3-python.yml b/.github/workflows/test-nlpo3-python.yml
new file mode 100644
index 0000000..aa0c557
--- /dev/null
+++ b/.github/workflows/test-nlpo3-python.yml
@@ -0,0 +1,69 @@
+name: Test nlpo3-python
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'nlpo3-python/**'
+      - '!notebooks/'
+      - '!LICENSE'
+      - '!*.md'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'nlpo3-python/**'
+      - '!notebooks/'
+      - '!LICENSE'
+      - '!*.md'
+
+defaults:
+  run:
+    working-directory: nlpo3-python
+
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest, ubuntu-latest, windows-latest]
+        python-version: ["3.13", "3.12", "3.11", "3.10", "3.9", "3.8", "3.7"]
+        bitness: [64]  # 32, 64
+        include:
+          - os: windows-latest
+            python-version: "3.9"
+            bitness: 32
+        exclude:
+          - os: macos-latest
+            python-version: "3.7"
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout source code
+        uses: actions/checkout@v4
+
+      - name: Setup Rust toolchain
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+
+      - name: Build wheel
+        run: |
+          pip install -U pip
+          pip install -U build setuptools setuptools-rust wheel
+          python -m build --wheel
+
+      - name: Install wheel
+        run: pip install --no-index --find-links=dist nlpo3
+        # Since we don't know the exact name of the wheel from previous step,
+        # use --find-links instead.
+
+      - name: Test
+        run: |
+          cd tests
+          python -m unittest
diff --git a/CITATION.cff b/CITATION.cff
index 4d0be53..804d84d 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -7,6 +7,12 @@ type: software
 authors:
   - family-names: Suntorntip
     given-names: Thanathip
+  - family-names: Suriyawongkul
+    given-names: Arthit
+    orcid: "https://orcid.org/0000-0002-9698-1899"
+  - family-names: Phatthiyaphaibun
+    given-names: Wannaphong
+    orcid: "https://orcid.org/0000-0002-4153-4354"
 repository-code: "https://github.com/PyThaiNLP/nlpo3/"
 repository: "https://github.com/PyThaiNLP/nlpo3/"
 url: "https://github.com/PyThaiNLP/nlpo3/"
diff --git a/Cargo.toml b/Cargo.toml
index 9e04e29..1205841 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,14 +3,14 @@ name = "nlpo3"
 version = "1.4.0"
 edition = "2018"
 license = "Apache-2.0"
-authors = ["Thanathip Suntorntip Gorlph"]
+authors = ["Thanathip Suntorntip Gorlph", "Arthit Suriyawongkul"]
 description = "Thai natural language processing library, with Python and Node bindings"
+categories = ["text-processing"]
+keywords = ["thai", "tokenizer", "nlp", "word-segmentation"]
 homepage = "https://github.com/PyThaiNLP/nlpo3/"
-documentation = "https://github.com/PyThaiNLP/nlpo3/blob/main/README.md"
 repository = "https://github.com/PyThaiNLP/nlpo3/"
+documentation = "https://github.com/PyThaiNLP/nlpo3/blob/main/README.md"
 readme = "README.md"
-keywords = ["thai", "tokenizer", "nlp", "word-segmentation"]
-categories = ["text-processing"]
 exclude = [
     ".gitignore",
     ".github/*",
diff --git a/README.md b/README.md
index d10ff94..bae6f95 100644
--- a/README.md
+++ b/README.md
@@ -191,5 +191,6 @@ Issues:
 
 ## License
 
-nlpO3 is copyrighted by its authors and licensed under terms of the Apache
-Software License 2.0 (Apache-2.0) - see file [LICENSE](./LICENSE) for details.
+nlpO3 is copyrighted by its authors
+and licensed under terms of the Apache Software License 2.0 (Apache-2.0).
+See file [LICENSE](./LICENSE) for details.
diff --git a/nlpo3-cli/Cargo.toml b/nlpo3-cli/Cargo.toml
index 2ae7ea3..333b3aa 100644
--- a/nlpo3-cli/Cargo.toml
+++ b/nlpo3-cli/Cargo.toml
@@ -5,12 +5,12 @@ edition = "2018"
 license = "Apache-2.0"
 authors = ["Vee Satayamas <5ssgdxltv@relay.firefox.com>"]
 description = "Command line interface for nlpO3, a Thai natural language processing library"
-documentation = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-cli/README.md"
+categories = ["text-processing", "command-line-utilities"]
+keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "cli"]
 homepage = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-cli/"
 repository = "https://github.com/PyThaiNLP/nlpo3/"
+documentation = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-cli/README.md"
 readme = "README.md"
-keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "cli"]
-categories = ["text-processing", "command-line-utilities"]
 
 [[bin]]
 name = "nlpo3"
diff --git a/nlpo3-cli/README.md b/nlpo3-cli/README.md
index 044ade3..c2af967 100644
--- a/nlpo3-cli/README.md
+++ b/nlpo3-cli/README.md
@@ -27,3 +27,9 @@ nlpo3 help
 ```bash
 echo "ฉันกินข้าว" | nlpo3 segment
 ```
+
+## License
+
+nlpo3-cli is copyrighted by its authors
+and licensed under terms of the Apache Software License 2.0 (Apache-2.0).
+See file [LICENSE](./LICENSE) for details.
diff --git a/nlpo3-nodejs/Cargo.toml b/nlpo3-nodejs/Cargo.toml
index 1e86fae..e51753d 100644
--- a/nlpo3-nodejs/Cargo.toml
+++ b/nlpo3-nodejs/Cargo.toml
@@ -5,12 +5,12 @@ edition = "2018"
 license = "Apache-2.0"
 authors = ["Thanathip Suntorntip Gorlph"]
 description = "Node binding for nlpO3 Thai language processing library"
-documentation = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-nodejs/README.md"
+categories = ["text-processing"]
+keywords = ["thai", "tokenizer", "nlp", "word-segmentation"]
 homepage = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-nodejs/"
 repository = "https://github.com/PyThaiNLP/nlpo3/"
+documentation = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-nodejs/README.md"
 readme = "README.md"
-keywords = ["thai", "tokenizer", "nlp", "word-segmentation"]
-categories = ["text-processing"]
 exclude = ["index.node"]
 
 [lib]
diff --git a/nlpo3-nodejs/README.md b/nlpo3-nodejs/README.md
index 6611086..7d7756c 100644
--- a/nlpo3-nodejs/README.md
+++ b/nlpo3-nodejs/README.md
@@ -92,3 +92,9 @@ Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues>
 ## TODO
 
 - Find a way to build binaries and publish on npm.
+
+## License
+
+nlpO3 Node binding is copyrighted by its authors
+and licensed under terms of the Apache Software License 2.0 (Apache-2.0).
+See file [LICENSE](./LICENSE) for details.
diff --git a/nlpo3-nodejs/src/lib.rs b/nlpo3-nodejs/src/lib.rs
index dfd164e..2660b0e 100644
--- a/nlpo3-nodejs/src/lib.rs
+++ b/nlpo3-nodejs/src/lib.rs
@@ -9,22 +9,28 @@ use neon::prelude::*;
 use nlpo3::tokenizer::{newmm::NewmmTokenizer, tokenizer_trait::Tokenizer};
 
 lazy_static! {
-    static ref DICT_COLLECTION: Mutex<HashMap<String, Box<NewmmTokenizer>>> =
+    static ref TOKENIZER_COLLECTION: Mutex<HashMap<String, Box<NewmmTokenizer>>> =
         Mutex::new(HashMap::new());
 }
 
+// Load a dictionary file to a tokenizer,
+// and add that tokenizer to the tokenizer collection.
+//
+// Dictionary file must have one word per line.
+// If successful, will insert a NewmmTokenizer into TOKENIZER_COLLECTION.
+// Returns a string describing the loading result.
 fn load_dict(mut cx: FunctionContext) -> JsResult<JsString> {
-    let mut dict_col_lock = DICT_COLLECTION.lock().unwrap();
+    let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap();
     let file_path = cx.argument::<JsString>(0)?.value(&mut cx);
     let dict_name = cx.argument::<JsString>(1)?.value(&mut cx);
-    if let Some(_) = dict_col_lock.get(&dict_name) {
+    if let Some(_) = tokenizer_col_lock.get(&dict_name) {
         Ok(cx.string(format!(
             "Failed: dictionary {} exists, please use another name.",
             dict_name
         )))
     } else {
         let tokenizer = NewmmTokenizer::new(&file_path);
-        dict_col_lock.insert(dict_name.to_owned(), Box::new(tokenizer));
+        tokenizer_col_lock.insert(dict_name.to_owned(), Box::new(tokenizer));
 
         Ok(cx.string(format!(
             "Successful: dictionary name {} from file {} has been successfully loaded",
@@ -33,13 +39,17 @@ fn load_dict(mut cx: FunctionContext) -> JsResult<JsString> {
     }
 }
 
+// Break text into tokens.
+// Use newmm algorithm.
+// Can use multithreading, but takes a lot of memory.
+// Returns an array of strings.
 fn segment(mut cx: FunctionContext) -> JsResult<JsArray> {
     let text = cx.argument::<JsString>(0)?.value(&mut cx);
     let dict_name = cx.argument::<JsString>(1)?.value(&mut cx);
     let safe = cx.argument::<JsBoolean>(2)?.value(&mut cx);
     let parallel = cx.argument::<JsBoolean>(3)?.value(&mut cx);
-    if let Some(loaded_dict) = DICT_COLLECTION.lock().unwrap().get(&dict_name) {
-        let result = loaded_dict.segment_to_string(&text, safe, parallel);
+    if let Some(loaded_tokenizer) = TOKENIZER_COLLECTION.lock().unwrap().get(&dict_name) {
+        let result = loaded_tokenizer.segment_to_string(&text, safe, parallel);
         let js_result_array = JsArray::new(&mut cx, result.len() as u32);
         for (i, obj) in result.iter().enumerate() {
             let js_string = cx.string(obj);
diff --git a/nlpo3-python/Cargo.lock b/nlpo3-python/Cargo.lock
index f58fbe6..9d9d273 100644
--- a/nlpo3-python/Cargo.lock
+++ b/nlpo3-python/Cargo.lock
@@ -199,7 +199,7 @@ dependencies = [
 
 [[package]]
 name = "nlpo3-python"
-version = "1.3.1-dev"
+version = "1.3.1"
 dependencies = [
  "ahash",
  "lazy_static",
diff --git a/nlpo3-python/Cargo.toml b/nlpo3-python/Cargo.toml
index 6b26879..130feab 100644
--- a/nlpo3-python/Cargo.toml
+++ b/nlpo3-python/Cargo.toml
@@ -1,13 +1,30 @@
 [package]
 name = "nlpo3-python"
-version = "1.3.1-dev"
+version = "1.3.1"
 edition = "2018"
 license = "Apache-2.0"
-authors = ["Thanathip Suntorntip Gorlph"]
+authors = [
+    "Thanathip Suntorntip Gorlph",
+    "Arthit Suriyawongkul",
+    "Wannaphong Phatthiyaphaibun",
+]
 description = "Python binding for nlpO3 Thai language processing library"
-exclude = ["notebooks"]
-keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "python"]
 categories = ["text-processing"]
+keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "python"]
+homepage = "https://github.com/PyThaiNLP/nlpo3/tree/main/nlpo3-python"
+repository = "https://github.com/PyThaiNLP/nlpo3/"
+documentation = "https://github.com/PyThaiNLP/nlpo3/blob/main/nlpo3-python/README.md"
+readme = "README.md"
+exclude = [
+    ".gitignore",
+    ".github/",
+    "build/",
+    "dist/",
+    "notebooks/",
+    "target",
+    "tests/",
+    "*.sh",
+]
 
 [lib]
 name = "_nlpo3_python_backend"
diff --git a/nlpo3-python/README.md b/nlpo3-python/README.md
index 4652b7e..21619e8 100644
--- a/nlpo3-python/README.md
+++ b/nlpo3-python/README.md
@@ -6,7 +6,7 @@ SPDX-License-Identifier: Apache-2.0
 # nlpO3 Python binding
 
 [![PyPI](https://img.shields.io/pypi/v/nlpo3.svg "PyPI")](https://pypi.python.org/pypi/nlpo3)
-[![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg "Python 3.6")](https://www.python.org/downloads/)
+[![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg "Python 3.7")](https://www.python.org/downloads/)
 [![Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg "Apache-2.0")](https://opensource.org/license/apache-2-0)
 
 Python binding for nlpO3, a Thai natural language processing library in Rust.
@@ -86,7 +86,7 @@ segment("สวัสดีครับ", dict_name="dict_name", safe=True)
 ### Requirements
 
 - [Rust 2018 Edition](https://www.rust-lang.org/tools/install)
-- Python 3.6 or newer
+- Python 3.7 or newer (PyO3's minimum supported version)
 - Python Development Headers
   - Ubuntu: `sudo apt-get install python3-dev`
   - macOS: No action needed
@@ -103,6 +103,27 @@ python -m build
 
 This should generate a wheel file, in `dist/` directory, which can be installed by pip.
+To install a wheel from a local directory:
+
+```bash
+pip install dist/nlpo3-1.3.1-cp311-cp311-macosx_12_0_x86_64.whl
+```
+
+## Test
+
+To run the Python unit test:
+
+```bash
+cd tests
+python -m unittest
+```
+
 ## Issues
 
 Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues>
+
+## License
+
+nlpO3 Python binding is copyrighted by its authors
+and licensed under terms of the Apache Software License 2.0 (Apache-2.0).
+See file [LICENSE](./LICENSE) for details.
diff --git a/nlpo3-python/pyproject.toml b/nlpo3-python/pyproject.toml
index 2c22692..e9d386c 100644
--- a/nlpo3-python/pyproject.toml
+++ b/nlpo3-python/pyproject.toml
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "nlpo3"
-version = "1.3.1-dev"
+version = "1.3.1"
 description = "Python binding for nlpO3 Thai language processing library in Rust"
 readme = "README.md"
-requires-python = ">=3.6"
+requires-python = ">=3.7"
 license = { text = "Apache-2.0" }
 keywords = ["thai", "tokenizer", "nlp", "word-segmentation", "pythainlp"]
 authors = [
@@ -18,7 +18,6 @@ authors = [
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.6",
     "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
@@ -38,7 +37,7 @@ homepage = "https://github.com/PyThaiNLP/nlpo3/"
 repository = "https://github.com/PyThaiNLP/nlpo3/"
 
 [tool.poetry.dependencies]
-python = "^3.6"
+python = "^3.7"
 
 [tool.poetry.dev-dependencies]
 pytest = "*"
diff --git a/nlpo3-python/setup.cfg b/nlpo3-python/setup.cfg
index 5014268..2a5b3e7 100644
--- a/nlpo3-python/setup.cfg
+++ b/nlpo3-python/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = nlpo3
-version = 1.3.1-dev
+version = 1.3.1
 description = Python binding for nlpO3 Thai language processing library
 long_description =
     Python binding for nlpO3, a Thai natural language processing library in Rust.
@@ -67,7 +67,7 @@
 #obsoletes = pythainlp-rust-modules
 
 [options]
-python_requires = >=3.6
+python_requires = >=3.7
 include_package_data = True
 packages = nlpo3
 zip_safe = False
diff --git a/nlpo3-python/src/lib.rs b/nlpo3-python/src/lib.rs
index 6401743..9dcea49 100644
--- a/nlpo3-python/src/lib.rs
+++ b/nlpo3-python/src/lib.rs
@@ -21,38 +21,23 @@ use pyo3::types::PyString;
 use pyo3::{exceptions, wrap_pyfunction};
 
 lazy_static! {
-    static ref DICT_COLLECTION: Mutex<HashMap<String, Box<NewmmTokenizer>>> =
+    static ref TOKENIZER_COLLECTION: Mutex<HashMap<String, Box<NewmmTokenizer>>> =
         Mutex::new(HashMap::new());
 }
 
-/// Break text into tokens.
-/// Use newmm algorithm.
-/// Can use multithreading, but takes a lot of memory.
-/// returns list of valid utf-8 bytes list
-/// signature: (text: str, dict_name: str, safe: boolean = false, parallel: boolean = false) -> List[List[u8]]
-#[pyfunction]
-#[pyo3(signature = (text, dict_name, safe=false, parallel=false))]
-fn segment(text: &PyString, dict_name: &str, safe: bool, parallel: bool) -> PyResult<Vec<String>> {
-    if let Some(loaded_dict) = DICT_COLLECTION.lock().unwrap().get(dict_name) {
-        let result = loaded_dict.segment_to_string(text.to_str()?, safe, parallel);
-        Ok(result)
-    } else {
-        Err(exceptions::PyRuntimeError::new_err(format!(
-            "Dictionary name {} does not exist.",
-            dict_name
-        )))
-    }
-}
-
-/// Load a dictionary file to the dict collection.
+/// Load a dictionary file to a tokenizer,
+/// and add that tokenizer to the tokenizer collection.
+///
 /// Dictionary file must one word per line.
+/// If successful, will insert a NewmmTokenizer into TOKENIZER_COLLECTION.
 /// returns a tuple of string of loading result and a boolean
+///
 /// signature: (file_path: str, dict_name: str) -> (str, boolean)
 #[pyfunction]
 #[pyo3(signature = (file_path, dict_name))]
 fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
-    let mut dict_col_lock = DICT_COLLECTION.lock().unwrap();
-    if dict_col_lock.get(dict_name).is_some() {
+    let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap();
+    if tokenizer_col_lock.get(dict_name).is_some() {
         Ok((
             format!(
                 "Failed: dictionary name {} already exists, please use another name.",
@@ -62,7 +47,7 @@ fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
         ))
     } else {
         let tokenizer = NewmmTokenizer::new(file_path);
-        dict_col_lock.insert(dict_name.to_owned(), Box::new(tokenizer));
+        tokenizer_col_lock.insert(dict_name.to_owned(), Box::new(tokenizer));
 
         Ok((
             format!(
@@ -74,12 +59,38 @@ fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
     }
 }
 
+/// Break text into tokens.
+/// Use newmm algorithm.
+/// Can use multithreading, but takes a lot of memory.
+/// returns a list of strings
+///
+/// signature: (text: str, dict_name: str, safe: boolean = false, parallel: boolean = false) -> List[str]
+///
+#[pyfunction]
+#[pyo3(signature = (text, dict_name, safe=false, parallel=false))]
+fn segment(
+    text: &Bound<'_, PyString>,
+    dict_name: &str,
+    safe: bool,
+    parallel: bool,
+) -> PyResult<Vec<String>> {
+    if let Some(loaded_tokenizer) = TOKENIZER_COLLECTION.lock().unwrap().get(dict_name) {
+        let result = loaded_tokenizer.segment_to_string(text.to_str()?, safe, parallel);
+        Ok(result)
+    } else {
+        Err(exceptions::PyRuntimeError::new_err(format!(
+            "Dictionary name {} does not exist.",
+            dict_name
+        )))
+    }
+}
+
 /*
 /// Add words to existing dictionary
 #[pyfunction]
 fn add_word(dict_name: &str, words: Vec<&str>) -> PyResult<(String, bool)> {
-    let mut dict_col_lock = DICT_COLLECTION.lock().unwrap();
-    if let Some(newmm_dict) = dict_col_lock.get(dict_name) {
+    let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap();
+    if let Some(newmm_dict) = tokenizer_col_lock.get(dict_name) {
         newmm_dict.add_word(&words);
         Ok((format!("Add new word(s) successfully."), true))
     } else {
@@ -96,8 +107,8 @@ fn add_word(dict_name: &str, words: Vec<&str>) -> PyResult<(String, bool)> {
 /// Remove words from existing dictionary
 #[pyfunction]
 fn remove_word(dict_name: &str, words: Vec<&str>) -> PyResult<(String, bool)> {
-    let mut dict_col_lock = DICT_COLLECTION.lock().unwrap();
-    if let Some(newmm_dict) = dict_col_lock.get(dict_name) {
+    let mut tokenizer_col_lock = TOKENIZER_COLLECTION.lock().unwrap();
+    if let Some(newmm_dict) = tokenizer_col_lock.get(dict_name) {
         newmm_dict.remove_word(&words);
         Ok((format!("Remove word(s) successfully."), true))
     } else {
@@ -113,7 +124,7 @@ fn remove_word(dict_name: &str, words: Vec<&str>) -> PyResult<(String, bool)> {
 */
 
 #[pymodule]
-fn _nlpo3_python_backend(_py: Python, m: &PyModule) -> PyResult<()> {
+fn _nlpo3_python_backend(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(load_dict, m)?)?;
     m.add_function(wrap_pyfunction!(segment, m)?)?;
     Ok(())
diff --git a/nlpo3-python/tests/test_tokenize.py b/nlpo3-python/tests/test_tokenize.py
index 4cf1566..705db7b 100644
--- a/nlpo3-python/tests/test_tokenize.py
+++ b/nlpo3-python/tests/test_tokenize.py
@@ -175,7 +175,7 @@ def setUp(self):
         )
 
     def test_segment(self):
-        DICT_FILENAME = "tests/data/test_dict.txt"
+        DICT_FILENAME = "data/test_dict.txt"
         DICT_NAME = "test_dict"
         load_dict(DICT_FILENAME, DICT_NAME)
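
For reference, here is a minimal usage sketch of the Python binding after these changes. It assumes the public `nlpo3` package re-exports `load_dict` and `segment`, as the README snippet (`segment("สวัสดีครับ", dict_name=..., safe=True)`) and the test above suggest; the dictionary path and the printed tokens are illustrative only.

```python
from nlpo3 import load_dict, segment

# load_dict returns (message, success), matching the
# PyResult<(String, bool)> signature in nlpo3-python/src/lib.rs.
message, ok = load_dict("data/test_dict.txt", "test_dict")

# segment returns a list of strings; safe mode on, single-threaded.
tokens = segment("สวัสดีครับ", dict_name="test_dict", safe=True, parallel=False)
print(tokens)  # e.g. ['สวัสดี', 'ครับ'], depending on the dictionary
```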