diff --git a/.github/workflows/build-python-wheels.yml b/.github/workflows/build-python-wheels.yml index 705ff65..5ad4449 100644 --- a/.github/workflows/build-python-wheels.yml +++ b/.github/workflows/build-python-wheels.yml @@ -1,7 +1,7 @@ -# Build wheels for many platforms +# Build wheels for many platforms, use cibuildwheel # see: https://github.com/pypa/cibuildwheel -name: Build Python wheels +name: Build wheels on: push: @@ -27,7 +27,7 @@ on: jobs: echo_github_env: - name: Echo GitHub environment variables + name: Echo env variables runs-on: ubuntu-latest steps: - run: | @@ -56,102 +56,41 @@ jobs: # To trigger the build steps, add "[cd build]" to commit message build_wheels: - name: Build ${{ matrix.python-version }}-${{ matrix.platform_id }} + name: Build ${{ matrix.os }} runs-on: ${{ matrix.os }} needs: check_build_trigger if: needs.check_build_trigger.outputs.build strategy: # Ensure that a wheel builder finishes even if another fails fail-fast: false - # For build identier, see: - # https://cibuildwheel.pypa.io/en/stable/options/#build-skip matrix: os: [macos-latest, ubuntu-latest, windows-latest] - python-version: ["3.13", "3.12", "3.11", "3.10", "3.9", "3.8", "3.7", "pypy3.10", "pypy3.9"] - bitness: [64, 32] - include: - - os: macos-latest - bitness: 64 - platform_id: macosx_x86_64 - - os: macos-latest - bitness: 64 - platform_id: macosx_arm64 - - os: ubuntu-latest - bitness: 64 - platform_id: manylinux_x86_64 - - os: ubuntu-latest - bitness: 32 - platform_id: manylinux_i686 - # - os: ubuntu-latest - # bitness: 64 - # platform_id: musllinux_x86_64 - # - os: ubuntu-latest - # bitness: 32 - # platform_id: musllinux_i686 - # - os: ubuntu-latest - # bitness: 64 - # platform_id: manylinux_aarch64 - # - os: ubuntu-latest - # bitness: 64 - # platform_id: musllinux_aarch64 - - os: windows-latest - bitness: 64 - platform_id: win_amd64 - - os: windows-latest - bitness: 32 - platform_id: win32 - exclude: - - os: macos-latest - bitness: 32 - - os: macos-latest - 
python-version: "3.7" - - python-version: "pypy3.10" - bitness: 32 - - python-version: "pypy3.9" - bitness: 32 + python-version: ["3.13"] env: - CP_VER: "" + CIBW_BUILD: "" # blank, let cibuildwheel build all supported platforms + steps: - # cibuildwheel needs a specific Python implementation ID - - name: Make Python implementation ID - id: convert-version-id - if: startsWith(matrix.os, 'windows-') == false - run: | - PYTHON_VERSION=${{ matrix.python-version }} - if [[ "$PYTHON_VERSION" == pypy* ]]; then - CP_VER="pp${PYTHON_VERSION:4}" - CP_VER="${CP_VER//./}" - else - CP_VER="cp${PYTHON_VERSION//./}" - fi - echo "Python version: $CP_VER" - echo "CP_VER=$CP_VER" >> $GITHUB_ENV - - name: Make Python implementation ID (Windows) - id: convert-version-id-win - if: startsWith(matrix.os, 'windows-') - shell: powershell - run: | - $PYTHON_VERSION = "${{ matrix.python-version }}" - if ($PYTHON_VERSION -like "pypy*") { - $CP_VER = "pp" + $PYTHON_VERSION.Substring(4).Replace(".", "") - } else { - $CP_VER = "cp" + $PYTHON_VERSION.Replace(".", "") - } - echo "Python version: $CP_VER" - echo "CP_VER=$CP_VER" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - name: Checkout source code uses: actions/checkout@v4 + - name: Setup Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" + - name: Install Python dependencies + run: python -m pip install --upgrade pip + - name: Setup Rust toolchain if: startsWith(matrix.os, 'ubuntu-') == false uses: actions-rust-lang/setup-rust-toolchain@v1 # For Linux, Rust will be installed inside a cibuildwheel container later - - name: Install Python dependencies - run: python -m pip install --upgrade pip + + - name: Setup rustup target + if: startsWith(matrix.os, 'macos-') + run: rustup target add x86_64-apple-darwin + # For cross-compile x86 on GitHub arm64 runner + - name: Build Python wheels uses: pypa/cibuildwheel@v2.21.3 with: @@ -159,14 +98,16 @@ jobs: output-dir: wheelhouse env: 
CIBW_BUILD_VERBOSITY: 1 - # See build selector name at: + # See CIBW_BUILD, CIBW_SKIP, CIBW_ARCHS and other build selectors at: # https://cibuildwheel.readthedocs.io/en/stable/options/#build-skip - CIBW_BUILD: ${{ env.CP_VER }}-${{ matrix.platform_id }} + CIBW_SKIP: "*-musllinux_i686" + CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_ENVIRONMENT_MACOS: | MACOSX_DEPLOYMENT_TARGET=10.9 PATH="$HOME/.cargo/bin:$PATH" CC=/usr/bin/clang CXX=/usr/bin/clang++ + CIBW_ARCHS_LINUX: "auto" CIBW_ENVIRONMENT_LINUX: PATH="$HOME/.cargo/bin:$PATH" CIBW_BEFORE_BUILD_LINUX: | pip install --upgrade setuptools-rust @@ -174,13 +115,15 @@ jobs: # It is needed to install Rust for Linux, # because cibuildwheel on Linux runs inside a container # and the container does not have Rust. + CIBW_ARCHS_WINDOWS: "AMD64 x86" + - name: Store artifacts uses: actions/upload-artifact@v3 with: path: ./wheelhouse/*.whl build_sdist: - name: Build Python source distribution + name: Build source distribution runs-on: ubuntu-latest needs: check_build_trigger if: needs.check_build_trigger.outputs.build @@ -202,13 +145,13 @@ jobs: path: nlpo3-python/dist/*.tar.gz publish_pypi: - name: Publish Python package to PyPI + name: Publish to PyPI runs-on: ubuntu-latest needs: [build_wheels, build_sdist] - # Upload to PyPI on every tag starting with 'v' - #if: github.event_name == 'push' && startsWith(github.event.ref, 'v') - # Alternatively, to publish when a GitHub Release is created, use the following rule: + # Publish when a GitHub Release is created: if: github.event_name == 'release' && github.event.action == 'published' + # Alternatively, upload to PyPI on every tag starting with 'v': + #if: github.event_name == 'push' && startsWith(github.event.ref, 'v') steps: - name: Retrieve artifacts uses: actions/download-artifact@v3 diff --git a/README.md b/README.md index bae6f95..aa45578 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,14 @@ pip install nlpo3 ## Table of contents - [Features](#features) -- [Dictionary 
file](#dictionary-file) -- [Usage](#usage) +- [Use](#use) - [Node.js binding](#nodejs-binding) - [Python binding](#python-binding) - [Rust library](#rust-library) - [Command-line interface](#command-line-interface) + - [Dictionary](#dictionary) - [Build](#build) -- [Development](#development) +- [Develop](#develop) - [License](#license) ## Features @@ -48,25 +48,7 @@ pip install nlpo3 [tcc]: https://dl.acm.org/doi/10.1145/355214.355225 [benchmark]: ./nlpo3-python/notebooks/nlpo3_segment_benchmarks.ipynb -## Dictionary file - -- For the interest of library size, nlpO3 does not assume what dictionary the - user would like to use, and it does not come with a dictionary. -- A dictionary is needed for the dictionary-based word tokenizer. -- For tokenization dictionary, try - - [words_th.tx][dict-pythainlp] from [PyThaiNLP][pythainlp] - - ~62,000 words - - CC0-1.0 - - [word break dictionary][dict-libthai] from [libthai][libthai] - - consists of dictionaries in different categories, with a make script - - LGPL-2.1 - -[pythainlp]: https://github.com/PyThaiNLP/pythainlp -[libthai]: https://github.com/tlwg/libthai/ -[dict-pythainlp]: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt -[dict-libthai]: https://github.com/tlwg/libthai/tree/master/data - -## Usage +## Use ### Node.js binding @@ -151,6 +133,24 @@ echo "ฉันกินข้าว" | nlpo3 segment See more at [nlpo3-cli](./nlpo3-cli/). +### Dictionary + +- For the interest of library size, nlpO3 does not assume what dictionary the + user would like to use, and it does not come with a dictionary. +- A dictionary is needed for the dictionary-based word tokenizer. 
+- For tokenization dictionary, try + - [words_th.txt][dict-pythainlp] from [PyThaiNLP][pythainlp] + - ~62,000 words + - CC0-1.0 + - [word break dictionary][dict-libthai] from [libthai][libthai] + - consists of dictionaries in different categories, with a make script + - LGPL-2.1 + +[pythainlp]: https://github.com/PyThaiNLP/pythainlp +[libthai]: https://github.com/tlwg/libthai/ +[dict-pythainlp]: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt +[dict-libthai]: https://github.com/tlwg/libthai/tree/master/data + ## Build ### Requirements @@ -179,13 +179,13 @@ cargo build --release Check `target/` for build artifacts. -## Development +## Develop -Development document: +### Development document - [Notes on custom string](src/NOTE_ON_STRING.md) -Issues: +### Issues - Please report issues at diff --git a/nlpo3-python/README.md b/nlpo3-python/README.md index 53f8032..00b0d98 100644 --- a/nlpo3-python/README.md +++ b/nlpo3-python/README.md @@ -11,6 +11,22 @@ SPDX-License-Identifier: Apache-2.0 Python binding for nlpO3, a Thai natural language processing library in Rust. +To install: + +```bash +pip install nlpo3 +``` + +## Table of Contents + +- [Features](#features) +- [Use](#use) + - [Dictionary](#dictionary) +- [Build](#build) +- [Issues](#issues) +- [License](#license) +- [Binary wheels](#binary-wheels) + ## Features - Thai word tokenizer @@ -24,31 +40,7 @@ Python binding for nlpO3, a Thai natural language processing library in Rust. [tcc]: https://dl.acm.org/doi/10.1145/355214.355225 [benchmark]: ./notebooks/nlpo3_segment_benchmarks.ipynb -## Dictionary file - -- For the interest of library size, nlpO3 does not assume what dictionary the - user would like to use, and it does not come with a dictionary. -- A dictionary is needed for the dictionary-based word tokenizer. 
-- For tokenization dictionary, try - - [words_th.txt][dict-pythainlp] from [PyThaiNLP][pythainlp] - - ~62,000 words - - CC0-1.0 - - [word break dictionary][dict-libthai] from [libthai][libthai] - - consists of dictionaries in different categories, with a make script - - LGPL-2.1 - -[pythainlp]: https://github.com/PyThaiNLP/pythainlp -[libthai]: https://github.com/tlwg/libthai/ -[dict-pythainlp]: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt -[dict-libthai]: https://github.com/tlwg/libthai/tree/master/data - -## Install - -```bash -pip install nlpo3 -``` - -## Usage +## Use Load file `path/to/dict.file` to memory and assign a name `dict_name` to it. @@ -83,6 +75,24 @@ for text with lots of ambiguous word boundaries: segment("สวัสดีครับ", dict_name="dict_name", safe=True) ``` +### Dictionary + +- For the interest of library size, nlpO3 does not assume what dictionary the + user would like to use, and it does not come with a dictionary. +- A dictionary is needed for the dictionary-based word tokenizer. 
+- For tokenization dictionary, try + - [words_th.txt][dict-pythainlp] from [PyThaiNLP][pythainlp] + - ~62,000 words + - CC0-1.0 + - [word break dictionary][dict-libthai] from [libthai][libthai] + - consists of dictionaries in different categories, with a make script + - LGPL-2.1 + +[pythainlp]: https://github.com/PyThaiNLP/pythainlp +[libthai]: https://github.com/tlwg/libthai/ +[dict-pythainlp]: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt +[dict-libthai]: https://github.com/tlwg/libthai/tree/master/data + ## Build ### Requirements @@ -111,9 +121,9 @@ To install a wheel from a local directory: pip install dist/nlpo3-1.3.1-cp311-cp311-macosx_12_0_x86_64.whl ``` -## Test +### Test -To run the Python unit test: +To run a Python unit test: ```bash cd tests @@ -129,3 +139,85 @@ Please report issues at nlpO3 Python binding is copyrighted by its authors and licensed under terms of the Apache Software License 2.0 (Apache-2.0). See file [LICENSE](./LICENSE) for details. 
+ +## Binary wheels + +A pre-built binary package is available from [PyPI][pypi] for these platforms: + +[pypi]: https://pypi.org/project/nlpo3/ + +|Python|OS|Architecture|Has binary wheel?| +|-|-|-|-| +|3.13|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|3.12|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|3.11|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|3.10|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|3.9|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|3.8|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|3.7|Windows|x86|✅| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|❌| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +||musllinux|x86_64|✅| +|PyPy 3.10|Windows|x86|❌| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +|PyPy 3.9|Windows|x86|❌| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +|PyPy 3.8|Windows|x86|❌| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|✅| +||manylinux|x86_64|✅| +||manylinux|i686|✅| +|PyPy 3.7|Windows|x86|❌| +||Windows|AMD64|✅| +||macOS|x86_64|✅| +||macOS|arm64|❌| +||manylinux|x86_64|✅| +||manylinux|i686|✅| diff --git a/nlpo3-python/pyproject.toml b/nlpo3-python/pyproject.toml index edb6711..e31f84b 100644 --- a/nlpo3-python/pyproject.toml +++ b/nlpo3-python/pyproject.toml @@ -25,6 +25,8 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: 
Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Natural Language :: Thai",