From 89f7e50e6d74f1a6dc784455c9cadaff60bf68f5 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 25 Nov 2024 07:25:14 +1100 Subject: [PATCH 1/7] add spacy-curated-tokenizers --- recipes/curated-tokenizers/meta.yaml | 48 ++++++++++++++++++ recipes/curated-transformers/meta.yaml | 46 ++++++++++++++++++ recipes/spacy-curated-transformers/meta.yaml | 51 ++++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 recipes/curated-tokenizers/meta.yaml create mode 100644 recipes/curated-transformers/meta.yaml create mode 100644 recipes/spacy-curated-transformers/meta.yaml diff --git a/recipes/curated-tokenizers/meta.yaml b/recipes/curated-tokenizers/meta.yaml new file mode 100644 index 0000000000000..e46386ebbd0f8 --- /dev/null +++ b/recipes/curated-tokenizers/meta.yaml @@ -0,0 +1,48 @@ +{% set version = "0.0.9" %} + +package: + name: curated-tokenizers + version: {{ version }} + +source: + url: https://pypi.org/packages/source/c/curated-tokenizers/curated-tokenizers-{{ version }}.tar.gz + sha256: c93d47e54ab3528a6db2796eeb4bdce5d44e8226c671e42c2f23522ab1d0ce25 + +build: + script: python -m pip install . 
-vv + number: 0 + +requirements: + build: + - {{ stdlib("c") }} + - {{ compiler("c") }} + - {{ compiler("cxx") }} + host: + - python + - pip + - setuptools + - cython + run: + - python + - regex + +test: + requires: + - pip + imports: + - curated_tokenizers + commands: + - pip check + +about: + home: https://github.com/explosion/curated-tokenizers + summary: 'Lightweight piece tokenization library' + license: MIT + license_family: MIT + license_file: LICENSE + dev_url: https://github.com/explosion/curated-tokenizers + +extra: + recipe-maintainers: + - h-vetinari + - conda-forge/spacy diff --git a/recipes/curated-transformers/meta.yaml b/recipes/curated-transformers/meta.yaml new file mode 100644 index 0000000000000..ee08d41abad2f --- /dev/null +++ b/recipes/curated-transformers/meta.yaml @@ -0,0 +1,46 @@ +{% set version = "0.1.1" %} +{% set python_min = python_min|default("3.9") %} + +package: + name: curated-transformers + version: {{ version }} + +source: + url: https://pypi.org/packages/source/c/curated-transformers/curated-transformers-{{ version }}.tar.gz + sha256: 4671f03314df30efda2ec2b59bc7692ea34fcea44cb65382342c16684e8a2119 + +build: + noarch: python + script: python -m pip install . 
-vv + number: 0 + +requirements: + host: + - python {{ python_min }}.* + - pip + - setuptools + run: + - python >={{ python_min }} + - pytorch + +test: + requires: + - python ={{ python_min }} + - pip + imports: + - curated_transformers + commands: + - pip check + +about: + home: https://github.com/explosion/curated-transformers + summary: '🤖 A PyTorch library of curated Transformer models and their composable components' + license: MIT + license_family: MIT + license_file: LICENSE + dev_url: https://github.com/explosion/curated-transformers + +extra: + recipe-maintainers: + - h-vetinari + - conda-forge/spacy diff --git a/recipes/spacy-curated-transformers/meta.yaml b/recipes/spacy-curated-transformers/meta.yaml new file mode 100644 index 0000000000000..634e5897aec69 --- /dev/null +++ b/recipes/spacy-curated-transformers/meta.yaml @@ -0,0 +1,51 @@ +{% set version = "0.3.0" %} +{% set python_min = python_min|default("3.9") %} + +package: + name: spacy-curated-transformers + version: {{ version }} + +source: + url: https://pypi.org/packages/source/s/spacy_curated_transformers/spacy_curated_transformers-{{ version }}.tar.gz + sha256: 989a6bf2aa7becd1ac8c3be5f245cd489223d4e16e7218f6b69479c7e2689937 + +build: + noarch: python + script: python -m pip install . 
-vv + number: 0 + +requirements: + host: + - python {{ python_min }}.* + - pip + - setuptools + run: + - python >={{ python_min }} + - curated-tokenizers >=0.0.9,<0.1.0 + - curated-transformers >=0.1.0,<0.2.0 + - pytorch >=1.12.0 + - spacy >=3.7.0,<4.0.0 + - srsly + - thinc >=8.1.6,<9.1.0 + +test: + requires: + - python ={{ python_min }} + - pip + imports: + - spacy_curated_transformers + commands: + - pip check + +about: + home: https://github.com/explosion/spacy-curated-transformers + summary: 'spaCy entry points for Curated Transformers' + license: MIT + license_family: MIT + license_file: LICENSE + dev_url: https://github.com/explosion/spacy-curated-transformers + +extra: + recipe-maintainers: + - h-vetinari + - conda-forge/spacy From 7c4f68fcb8127a02a8b189cfa67ac4da70d7a707 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 25 Nov 2024 08:08:07 +1100 Subject: [PATCH 2/7] use our own sentencepiece --- recipes/curated-tokenizers/meta.yaml | 4 + ...001-use-our-own-sentencepiece-abseil.patch | 99 +++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch diff --git a/recipes/curated-tokenizers/meta.yaml b/recipes/curated-tokenizers/meta.yaml index e46386ebbd0f8..b53d268fad629 100644 --- a/recipes/curated-tokenizers/meta.yaml +++ b/recipes/curated-tokenizers/meta.yaml @@ -7,6 +7,8 @@ package: source: url: https://pypi.org/packages/source/c/curated-tokenizers/curated-tokenizers-{{ version }}.tar.gz sha256: c93d47e54ab3528a6db2796eeb4bdce5d44e8226c671e42c2f23522ab1d0ce25 + patches: + - patches/0001-use-our-own-sentencepiece-abseil.patch build: script: python -m pip install . 
-vv @@ -22,6 +24,8 @@ requirements: - pip - setuptools - cython + - libprotobuf + - libsentencepiece run: - python - regex diff --git a/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch b/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch new file mode 100644 index 0000000000000..df4ee220a6e08 --- /dev/null +++ b/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch @@ -0,0 +1,99 @@ +From cc968a797278f4862db877a9449fadcc60f455b7 Mon Sep 17 00:00:00 2001 +From: "H. Vetinari" +Date: Mon, 25 Nov 2024 08:07:00 +1100 +Subject: [PATCH] use our own sentencepiece/abseil + +--- + curated_tokenizers/_spp.pxd | 2 +- + setup.py | 38 +++++++++++++++++++++++++------------ + 2 files changed, 27 insertions(+), 13 deletions(-) + +diff --git a/curated_tokenizers/_spp.pxd b/curated_tokenizers/_spp.pxd +index 29f5ec2..4630b45 100644 +--- a/curated_tokenizers/_spp.pxd ++++ b/curated_tokenizers/_spp.pxd +@@ -3,7 +3,7 @@ from libcpp.memory cimport shared_ptr + from libcpp.string cimport string + from libcpp.vector cimport vector + +-cdef extern from "builtin_pb/sentencepiece.pb.h" namespace "sentencepiece": ++cdef extern from "sentencepiece.pb.h" namespace "sentencepiece": + cdef cppclass SentencePieceText_SentencePiece: + uint32_t id() const + const string & piece() const +diff --git a/setup.py b/setup.py +index d0aa11b..0306041 100755 +--- a/setup.py ++++ b/setup.py +@@ -1,4 +1,6 @@ + #!/usr/bin/env python ++import pathlib ++import os + import sys + from setuptools.command.build_ext import build_ext + from setuptools import Extension, setup, find_packages +@@ -79,18 +81,17 @@ MOD_NAMES = [ + ] + COMPILE_OPTIONS = { + "msvc": [ ++ "/std:c++17", + "/Ox", + "/EHsc", +- "/D_USE_INTERNAL_STRING_VIEW", + "/DHAVE_PTHREAD", + "/wd4018", + "/wd4514", + ], + "other": [ +- "--std=c++14", ++ "--std=c++17", + "-Wno-sign-compare" "-Wno-strict-prototypes", + "-Wno-unused-function", +- "-D_USE_INTERNAL_STRING_VIEW", + 
"-pthread", + "-DHAVE_PTHREAD=1", + ], +@@ -139,21 +140,34 @@ def setup_package(): + if len(sys.argv) > 1 and sys.argv[1] == "clean": + return clean(root / "curated_tokenizers") + ++ library_path = pathlib.Path(os.environ['PREFIX']) ++ EXTENSION_LIBRARIES = ["sentencepiece"] ++ if "win32" in sys.platform: ++ absl_libs = ( ++ 'abseil_dll', 'absl_log_flags', 'absl_flags_commandlineflag', 'absl_flags_commandlineflag_internal', ++ 'absl_flags_config', 'absl_flags_internal', 'absl_flags_marshalling', 'absl_flags_parse', ++ 'absl_flags_private_handle_accessor', 'absl_flags_program_name', 'absl_flags_reflection', ++ 'absl_flags_usage', 'absl_flags_usage_internal', ++ ) ++ EXTENSION_LIBRARIES += ["libprotobuf"] ++ library_path = library_path / "Library" ++ else: ++ absl_glob = pathlib.Path(os.environ['PREFIX']).glob('lib/libabsl_*.so') ++ absl_libs = tuple(lib.stem[3:] for lib in absl_glob) ++ EXTENSION_LIBRARIES += ["protobuf"] ++ ++ EXTENSION_LIBRARIES += absl_libs ++ + ext_modules = [ + Extension( + "curated_tokenizers._spp", +- ["curated_tokenizers/_spp.pyx"] +- + ABSL_SRC +- + PROTOBUF_LIGHT_SRC +- + SENTENCEPIECE_SRC +- + SENTENCEPIECE_PROTOBUF_SRC, ++ ["curated_tokenizers/_spp.pyx"], + include_dirs=[ + "curated_tokenizers", +- "sentencepiece", +- "sentencepiece/src", +- "sentencepiece/src/builtin_pb", +- "sentencepiece/third_party/protobuf-lite", ++ str(library_path / "include") + ], ++ libraries=EXTENSION_LIBRARIES, ++ library_dirs=[str(library_path)], + language="c++", + ), + Extension( From 12415862bf1840892c6054ac99c694bf804a2ab0 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 25 Nov 2024 09:47:40 +1100 Subject: [PATCH 3/7] add spacy-pkuseg --- recipes/spacy-pkuseg/conda_build_config.yaml | 2 + recipes/spacy-pkuseg/meta.yaml | 53 ++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 recipes/spacy-pkuseg/conda_build_config.yaml create mode 100644 recipes/spacy-pkuseg/meta.yaml diff --git a/recipes/spacy-pkuseg/conda_build_config.yaml b/recipes/spacy-pkuseg/conda_build_config.yaml new file mode 100644 index 0000000000000..791696f391930 --- /dev/null +++ b/recipes/spacy-pkuseg/conda_build_config.yaml @@ -0,0 +1,2 @@ +numpy: + - "2.0" diff --git a/recipes/spacy-pkuseg/meta.yaml b/recipes/spacy-pkuseg/meta.yaml new file mode 100644 index 0000000000000..4e781b17047e3 --- /dev/null +++ b/recipes/spacy-pkuseg/meta.yaml @@ -0,0 +1,53 @@ +{% set version = "1.0.0" %} + +package: + name: spacy-pkuseg + version: {{ version }} + +source: + url: https://pypi.org/packages/source/s/spacy-pkuseg/spacy_pkuseg-{{ version }}.tar.gz + sha256: 33531ea8e13fc09ebe3b40bd97e84d07ccd5a1fe67fa8e84173769a25ac03158 + +build: + script: python -m pip install . -vv + number: 0 + +requirements: + build: + - {{ stdlib("c") }} + - {{ compiler("c") }} + - {{ compiler("cxx") }} + host: + - python + - pip + - setuptools + - cython + - numpy + run: + - python + - srsly >=2.3.0,<3.0.0 + +test: + requires: + - pip + imports: + - spacy_pkuseg + commands: + - pip check + +about: + home: https://github.com/explosion/spacy-pkuseg + summary: 'Chinese word segmentation toolkit for spaCy' + description: | + This package is a fork of [pkuseg-python](https://github.com/lancopku/pkuseg-python) + that simplifies installation and serialization for use with spaCy. + The underlying segmentation tools remain unmodified. 
+ license: MIT + license_family: MIT + license_file: LICENSE + dev_url: https://github.com/explosion/spacy-pkuseg + +extra: + recipe-maintainers: + - h-vetinari + - conda-forge/spacy From 40169ea8088837df48658f8a271fcafaabef08bb Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 25 Nov 2024 10:06:15 +1100 Subject: [PATCH 4/7] override the whole python-zip for numpy 2.0 --- recipes/spacy-pkuseg/conda_build_config.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/recipes/spacy-pkuseg/conda_build_config.yaml b/recipes/spacy-pkuseg/conda_build_config.yaml index 791696f391930..495c7b75589e7 100644 --- a/recipes/spacy-pkuseg/conda_build_config.yaml +++ b/recipes/spacy-pkuseg/conda_build_config.yaml @@ -1,2 +1,16 @@ +# we only want to override numpy, but need to override the whole zip numpy: - "2.0" + - "2.0" + - "2.0" + - "2.0" +python: + - 3.9.* *_cpython + - 3.10.* *_cpython + - 3.11.* *_cpython + - 3.12.* *_cpython +python_impl: + - cpython + - cpython + - cpython + - cpython From aa2910cfb58509bbed8e59cda0192ead9dee1f96 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 25 Nov 2024 10:29:04 +1100 Subject: [PATCH 5/7] shut up the linter --- recipes/curated-transformers/meta.yaml | 2 +- recipes/spacy-curated-transformers/meta.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/curated-transformers/meta.yaml b/recipes/curated-transformers/meta.yaml index ee08d41abad2f..cdc8c39ba4545 100644 --- a/recipes/curated-transformers/meta.yaml +++ b/recipes/curated-transformers/meta.yaml @@ -25,7 +25,7 @@ requirements: test: requires: - - python ={{ python_min }} + - python {{ python_min }} - pip imports: - curated_transformers diff --git a/recipes/spacy-curated-transformers/meta.yaml b/recipes/spacy-curated-transformers/meta.yaml index 634e5897aec69..35c80757c287b 100644 --- a/recipes/spacy-curated-transformers/meta.yaml +++ b/recipes/spacy-curated-transformers/meta.yaml @@ -30,7 +30,7 @@ requirements: test: requires: - - python ={{ python_min }} + - python {{ python_min }} - pip imports: - spacy_curated_transformers From 7079ebc4cd7b26f5c81f6eb1601d4a7b59154031 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 25 Nov 2024 23:25:15 +1100 Subject: [PATCH 6/7] ensure we pull in new libsentencepiece builds --- recipes/curated-tokenizers/conda_build_config.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 recipes/curated-tokenizers/conda_build_config.yaml diff --git a/recipes/curated-tokenizers/conda_build_config.yaml b/recipes/curated-tokenizers/conda_build_config.yaml new file mode 100644 index 0000000000000..8099f2c0da99c --- /dev/null +++ b/recipes/curated-tokenizers/conda_build_config.yaml @@ -0,0 +1,4 @@ +libgrpc: +- "1.67" +libprotobuf: +- 5.28.2 From 69bc088f1d4ebe36ff08737cc3da9719c559e4e1 Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Tue, 26 Nov 2024 08:14:27 +1100 Subject: [PATCH 7/7] Adapt to names of sentencepiece import library --- .../patches/0001-use-our-own-sentencepiece-abseil.patch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch b/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch index df4ee220a6e08..6358a4253155d 100644 --- a/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch +++ b/recipes/curated-tokenizers/patches/0001-use-our-own-sentencepiece-abseil.patch @@ -58,7 +58,7 @@ index d0aa11b..0306041 100755 return clean(root / "curated_tokenizers") + library_path = pathlib.Path(os.environ['PREFIX']) -+ EXTENSION_LIBRARIES = ["sentencepiece"] ++ EXTENSION_LIBRARIES = [] + if "win32" in sys.platform: + absl_libs = ( + 'abseil_dll', 'absl_log_flags', 'absl_flags_commandlineflag', 'absl_flags_commandlineflag_internal', @@ -66,12 +66,12 @@ index d0aa11b..0306041 100755 + 'absl_flags_private_handle_accessor', 'absl_flags_program_name', 'absl_flags_reflection', + 'absl_flags_usage', 'absl_flags_usage_internal', + ) -+ EXTENSION_LIBRARIES += ["libprotobuf"] ++ EXTENSION_LIBRARIES += ["sentencepiece_import", "libprotobuf"] + library_path = library_path / "Library" + else: 
+ absl_glob = pathlib.Path(os.environ['PREFIX']).glob('lib/libabsl_*.so') + absl_libs = tuple(lib.stem[3:] for lib in absl_glob) -+ EXTENSION_LIBRARIES += ["protobuf"] ++ EXTENSION_LIBRARIES += ["sentencepiece", "protobuf"] + + EXTENSION_LIBRARIES += absl_libs +