A native output module based on piper TTS.

brailcom · Mar 4, 2025 · 6511762 · 6511762
1 parent b3a0fc2
commit 6511762
Show file tree

Hide file tree

Showing 7 changed files with 1,416 additions and 9 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -75,22 +75,24 @@ jobs:
                         libao-dev libasound2-dev libaudio-dev libpulse-dev libpipewire-0.3-dev libxau-dev libsndfile1-dev
                         libdotconf-dev libglib2.0-dev libltdl-dev systemd
                         gettext help2man texinfo texlive texlive-plain-generic
+                        librubberband-dev
                         pulseaudio ${{ matrix.deps }}
     - name: install piper
       run: |
-        wget https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz
-        echo "698358c5362250f315f400468199c40bfff41b56b4e079d4b93bfa67019e2cd28a88d385dd4de79ea24b923272826cf4e4191b81a7b33d3164cb5615b8c66be9  libpiper_phonemize-amd64.tar.gz" | sha512sum --check
-        tar -C /opt -xvf libpiper_phonemize-amd64.tar.gz
-        wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz
-        echo "e255e804b7ba1f837ea76d5c3693abf32514dd4ca30239d80d2b5131b5886c76a00a741d405b762658ae4444b90a8a857f7d65f1c25da250d14d705cbc580cc1  piper_amd64.tar.gz" | sha512sum --check
-        tar -C /opt -xvf piper_amd64.tar.gz
+        wget https://github.com/rhasspy/piper-phonemize/releases/download/2023.11.14-4/piper-phonemize_linux_x86_64.tar.gz
+        echo "22ae85dba1c07e328b33e8311ae6e67ad1ea2a57068d08238c452c69b0b1761eb39b064bf7e51f7413317a8200f6f92c23426a5a575abbb4dbdde86c81825601  piper-phonemize_linux_x86_64.tar.gz" | sha512sum --check
+        tar -C /opt -xvf piper-phonemize_linux_x86_64.tar.gz
+        wget https://github.com/rhasspy/piper/archive/refs/tags/2023.11.14-2.tar.gz
+        echo "2156c8d384aafc977a41eea181855b5e3fddab1eca353528d919d421958e4c10dbcf6526d03433e4246ac847a2fe4a7c43a57d2cb5666993ad0f28ed9fb68fe6  2023.11.14-2.tar.gz" | sha512sum --check
+        tar -C /opt -xvf 2023.11.14-2.tar.gz
     - name: autoconf
       run: ./build.sh
     - name: configure
       run: ${{ matrix.configenv }}
            CFLAGS="${{ matrix.configflags }}"
-           CXXFLAGS="${{ matrix.configflags }}"
-           ./configure --prefix=/tmp/foobar ${{ matrix.config }}
+           CXXFLAGS="${{ matrix.configflags }} -I/opt/piper_phonemize/include"
+           LDFLAGS="-L/opt/piper_phonemize/lib"
+           ./configure --prefix=/tmp/foobar --with-piper=/opt/piper-2023.11.14-2 ${{ matrix.config }}
     - name: Store the config
       if: ${{ always() }}
       uses: actions/upload-artifact@v4

diff --git a/config/modules/Makefile.am b/config/modules/Makefile.am
@@ -53,6 +53,11 @@ dist_moduleconf_DATA += kali.conf
 dist_moduleconforig_DATA += kali.conf
 endif
 
+if piper_support
+dist_moduleconf_DATA += cxxpiper.conf
+dist_moduleconforig_DATA += cxxpiper.conf
+endif
+
 if ibmtts_support
 dist_moduleconf_DATA += ibmtts.conf
 dist_moduleconforig_DATA += ibmtts.conf

diff --git a/config/modules/cxxpiper.conf b/config/modules/cxxpiper.conf
@@ -0,0 +1,71 @@
+#
+# Configuration for cxxpiper speech dispatcher output module.
+#
+
+Debug 0
+
+# Piper doesn't have voices in the speech-dispatcher sense.  Piper has a
+# "model" and the model's "configuration".  A model/config may be
+# single speaker or multi-speaker.  Single speaker models produce a
+# single speech dispatcher voice.  The speech dispatcher voice name
+# can be listed with 'spd-say -o cxxpiper -L', but it is
+# not needed as the voice will be the default, and only, voice available.
+#
+# Piper multi-speaker models produce a discrete speech dispatcher
+# voice for each speaker the model supports.  The configuration
+# file for a multi-speaker model contains a "speaker_id_map" object
+# that lists an integer speaker id and string mnemonic for each
+# speaker supported by the model.  Since speech dispatcher has no
+# notion of speaker id, speaker selection details are hidden from the
+# user by instead exposing voices of the form
+# <model-name>~<speaker-id>~<mnemonic> with the output module
+# mapping between "voice names" and the current model's speakers.
+
+# NB:  Unsure if onnx models may allow different languages within
+# the same multi-speaker model.  Regardless, if there's sufficient memory, it might be possible to load multiple
+# models and have cxxpiper select between them, while presenting the union of
+# the speakers and languages of each model.
+# For now all directives with language fields require the language code, but ignore it.
+
+# ModelPath and ConfigPath are required.  There should be exactly one of each of them.
+ModelPath "/usr/share/piper/voices/clean100.onnx"
+ConfigPath "/usr/share/piper/voices/clean100.onnx.json"
+
+# For single-speaker models, DefaultVoice is ignored, and logged as such, with
+# a warning.  For multi-speaker models, DefaultVoice is optional.  If it is
+# not specified, the first speaker of the multi-speaker model becomes
+# the default speaker for the lifetime of the cxxpiper output module
+# and also for future runtimes unless this configuration is changed.
+# When specified along with a multi-speaker model, the argument is a string that matches one of the
+# "voices" listed by spd-say -o cxxpiper -L .  Note that piper's
+# notion of "speaker" appears to the user as the "voice" concept of
+# speech dispatcher.  This is pretty much invisible to the user, but
+# note that it means that the strings listed in the .json
+# configuration file in the speaker_id_map object are not the same as
+# the voices listed by spd-say (i.e. the speaker ids are substrings of
+# the listed voices).  We could also match the substrings, but we
+# don't, at least for now; only the full "voice" string is
+# recognized.  If the voice string can't be matched against the voices
+# found when the model is loaded, then the first speaker becomes the
+# default for the lifetime of the output module.  A warning is logged
+# if the string can't be matched and the voice name of the first
+# speaker (index 0) is included in the warning message.
+DefaultVoice "clean100~2~5393"
+
+# AddVoice (optional) is reused from the generic output module.  It maps voice types to voice names within a language code.
+# It does not do anything useful for single speaker models and is ignored.  For
+# multi-speaker models the language code is required, but ignored, at least for now.
+AddVoice "en_US" "MALE1" "clean100~33~8419"
+AddVoice "en_US" "FEMALE1" "clean100~25~4137"
+
+# Sound Icons are configured and work like espeak.
+SoundIconFolder "/usr/share/sounds/sound-icons/"
+SoundIconVolume 0
+
+# Piper uses ESpeak NG sometimes, depending on the model.  Piper distributes this and distros
+# may provide it.  Default is "/usr/share/espeak-ng-data/".
+# It should probably be considered required, but if a model doesn't use espeak
+# it might work to omit it.
+ESpeakNGDataDirPath "/usr/share/espeak-ng-data/"
+
+# End of cxxpiper.conf
diff --git a/configure.ac b/configure.ac
@@ -377,6 +377,75 @@ AM_CONDITIONAL([kali_support], [test $with_kali != no])
 AM_CONDITIONAL([kali_shim], [test $with_kali = shim])
 AS_IF([test $with_kali != no], [output_modules="${output_modules} kali"])
 
+# check for piper support
+AC_ARG_WITH([piper],
+	[AS_HELP_STRING([--with-piper=/path], [include Piper support])],
+	[],
+	[with_piper=no])
+AS_IF([test $with_espeak = yes -a $with_piper != no], [
+	# We currently need various headers of piper
+	PIPER_SRC_DIR="$with_piper/src/cpp"
+	OLDCXXFLAGS="$CXXFLAGS"
+	OLDLDFLAGS="$LDFLAGS"
+	CXXFLAGS="$CXXFLAGS -I$PIPER_SRC_DIR"
+
+	AS_IF([test $with_piper != no],
+		[PKG_CHECK_MODULES([ONNXRUNTIME], [libonnxruntime], [
+				CXXFLAGS="$CXXFLAGS $ONNXRUNTIME_CFLAGS"
+				LDFLAGS="$LDFLAGS $ONNXRUNTIME_LIBS"
+			], [AC_CHECK_LIB([onnxruntime], [OrtGetApiBase],
+				[ONNXRUNTIME_LIBS="-lonnxruntime"],
+				[with_piper=no])
+			])])
+
+	AS_IF([test $with_piper != no],
+		[PKG_CHECK_MODULES([RUBBERBAND], [rubberband], [
+				CXXFLAGS="$CXXFLAGS $RUBBERBAND_CFLAGS"
+				LDFLAGS="$LDFLAGS $RUBBERBAND_LIBS"
+			], [with_piper=no])])
+
+	AC_LANG_PUSH(C++)
+	AS_IF([test $with_piper != no],
+		[AC_CHECK_LIB([piper_phonemize], [_ZN5piper19DEFAULT_PHONEME_MAPB5cxx11E],
+			[:],
+			[with_piper=no])])
+
+	AS_IF([test $with_piper != no],
+		[AC_CHECK_HEADER([json.hpp],
+			[],
+			[with_piper=no])])
+	AS_IF([test $with_piper != no],
+		[AC_CHECK_HEADER([piper.hpp],
+			[],
+			[with_piper=no])])
+	AS_IF([test $with_piper != no],
+		[AC_CHECK_HEADER([utf8.h],
+			[],
+			[with_piper=no])])
+	AS_IF([test $with_piper != no],
+		[AC_CHECK_HEADER([wavfile.hpp],
+			[],
+			[with_piper=no])])
+
+	AS_IF([test $with_piper != no],
+		# We need espeak with https://github.com/espeak-ng/espeak-ng/pull/2127 applied
+		[AC_CHECK_LIB([espeak-ng], [espeak_TextToPhonemesWithTerminator],
+			[:],
+			[with_piper=no],
+			[$ESPEAK_NG_LIBS])])
+
+	AC_LANG_POP(C++)
+	CXXFLAGS="$OLDCXXFLAGS"
+	LDFLAGS="$OLDLDFLAGS"
+], [with_piper=no])
+AM_CONDITIONAL([piper_support], [test $with_piper != no])
+AS_IF([test $with_piper != no], [output_modules="${output_modules} cxxpiper"])
+AC_SUBST([PIPER_SRC_DIR])
+AC_SUBST([ONNXRUNTIME_CFLAGS])
+AC_SUBST([ONNXRUNTIME_LIBS])
+AC_SUBST([RUBBERBAND_CFLAGS])
+AC_SUBST([RUBBERBAND_LIBS])
+
 audio_dlopen='-dlopen force'
 audio_dlopen_modules=
 

diff --git a/src/modules/.gitignore b/src/modules/.gitignore
@@ -26,6 +26,7 @@
 /libvoxin.so
 /sd_baratinoo
 /sd_cicero
+/sd_cxxpiper
 /sd_dummy
 /sd_espeak
 /sd_espeak-ng

diff --git a/src/modules/Makefile.am b/src/modules/Makefile.am
@@ -27,7 +27,6 @@ EXTRA_DIST += dummy-message.txt
 CLEANFILES = dummy-message.wav
 
 inc_local = -I$(top_srcdir)/include -I$(top_srcdir)/src/common
-LDFLAGS =
 
 if DARWIN_HOST
 LDFLAGS += -Wl,-U,_module_speak -Wl,-U,_module_speak_sync
@@ -287,6 +286,22 @@ CLEANFILES += $(EXTRA_sd_kali_DEPENDENCIES)
 endif
 endif
 
+#
+#  cxxpiper module (native Piper TTS output, compiled against the headers in $(PIPER_SRC_DIR))
+#
+if piper_support
+modulebin_PROGRAMS += sd_cxxpiper
+sd_cxxpiper_SOURCES = cxxpiper.cpp module_utils_addvoice.c module_utils_play.c $(common_SOURCES)
+sd_cxxpiper_CPPFLAGS = -I$(PIPER_SRC_DIR) $(ONNXRUNTIME_CFLAGS) $(RUBBERBAND_CFLAGS) $(AM_CPPFLAGS)
+sd_cxxpiper_LDADD = $(top_builddir)/src/common/libcommon.la \
+	-lpiper_phonemize \
+	$(ONNXRUNTIME_LIBS) \
+	-lespeak-ng \
+	$(RUBBERBAND_LIBS) \
+	$(SNDFILE_LIBS) \
+	$(common_LDADD)
+endif
+
 #
 # voxin module
 #