From d42510068d9b56f7cc6e86185f25b69cb4a0ba85 Mon Sep 17 00:00:00 2001
From: Ke WANG <wake@microsoft.com>
Date: Fri, 20 Dec 2024 17:21:36 +0800
Subject: [PATCH] feat(pronscore): REST API for prosody

---
 PronunciationAssessment/Python/sample.py | 159 +++++++++++++++++------
 1 file changed, 118 insertions(+), 41 deletions(-)

diff --git a/PronunciationAssessment/Python/sample.py b/PronunciationAssessment/Python/sample.py
index 1c030add..28b8eb8d 100644
--- a/PronunciationAssessment/Python/sample.py
+++ b/PronunciationAssessment/Python/sample.py
@@ -28,56 +28,133 @@
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #
 
-import requests
 import base64
-import json
+import requests
 import time
+import uuid
+
+subscription_key = "{SubscriptionKey}"  # replace this with your subscription key
+region = "{Region}"  # replace this with the region corresponding to your subscription key, e.g. westus, eastasia
 
-subscriptionKey = "{SubscriptionKey}" # replace this with your subscription key
-region = "{Region}" # replace this with the region corresponding to your subscription key, e.g. westus, eastasia
+# A common wave header, with zero audio length
+# Since stream data doesn't contain header, but the API requires header to fetch format information,
+# so you need post this header as first chunk for each query
+WaveHeader16K16BitMono = bytes(
+    [
+        82,
+        73,
+        70,
+        70,
+        78,
+        128,
+        0,
+        0,
+        87,
+        65,
+        86,
+        69,
+        102,
+        109,
+        116,
+        32,
+        18,
+        0,
+        0,
+        0,
+        1,
+        0,
+        1,
+        0,
+        128,
+        62,
+        0,
+        0,
+        0,
+        125,
+        0,
+        0,
+        2,
+        0,
+        16,
+        0,
+        0,
+        0,
+        100,
+        97,
+        116,
+        97,
+        0,
+        0,
+        0,
+        0,
+    ]
+)
 
-# a common wave header, with zero audio length
-# since stream data doesn't contain header, but the API requires header to fetch format information, so you need post this header as first chunk for each query
-WaveHeader16K16BitMono = bytes([ 82, 73, 70, 70, 78, 128, 0, 0, 87, 65, 86, 69, 102, 109, 116, 32, 18, 0, 0, 0, 1, 0, 1, 0, 128, 62, 0, 0, 0, 125, 0, 0, 2, 0, 16, 0, 0, 0, 100, 97, 116, 97, 0, 0, 0, 0 ])
 
-# a generator which reads audio data chunk by chunk
-# the audio_source can be any audio input stream which provides read() method, e.g. audio file, microphone, memory stream, etc.
+# A generator which reads audio data chunk by chunk.
+# The audio_source can be any audio input stream which provides read() method,
+# e.g. audio file, microphone, memory stream, etc.
 def get_chunk(audio_source, chunk_size=1024):
-  yield WaveHeader16K16BitMono
-  while True:
-    time.sleep(chunk_size / 32000) # to simulate human speaking rate
-    chunk = audio_source.read(chunk_size)
-    if not chunk:
-      global uploadFinishTime
-      uploadFinishTime = time.time()
-      break
-    yield chunk
+    yield WaveHeader16K16BitMono
+    while True:
+        time.sleep(chunk_size / 32000)  # to simulate human speaking rate
+        chunk = audio_source.read(chunk_size)
+        if not chunk:
+            global upload_finish_time
+            upload_finish_time = time.time()
+            break
+        yield chunk
+
+
+# Build pronunciation assessment parameters
+locale = "en-US"
+audio_file = open("../goodmorning.pcm", "rb")
+reference_text = "Good morning."
+enable_prosody_assessment = True
+phoneme_alphabet = "SAPI"  # IPA or SAPI
+enable_miscue = True
+nbest_phoneme_count = 5
+pron_assessment_params_json = (
+    '{"GradingSystem":"HundredMark","Dimension":"Comprehensive","ReferenceText":"%s","EnableProsodyAssessment":"%s",'
+    '"PhonemeAlphabet":"%s","EnableMiscue":"%s","NBestPhonemeCount":"%s"}'
+    % (reference_text, enable_prosody_assessment, phoneme_alphabet, enable_miscue, nbest_phoneme_count)
+)
+pron_assessment_params_base64 = base64.b64encode(bytes(pron_assessment_params_json, "utf-8"))
+pron_assessment_params = str(pron_assessment_params_base64, "utf-8")
+
+# https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-get-speech-session-id#provide-session-id-using-rest-api-for-short-audio
+session_id = uuid.uuid4().hex
 
-# build pronunciation assessment parameters
-referenceText = "Good morning."
-pronAssessmentParamsJson = "{\"ReferenceText\":\"%s\",\"GradingSystem\":\"HundredMark\",\"Dimension\":\"Comprehensive\"}" % referenceText
-pronAssessmentParamsBase64 = base64.b64encode(bytes(pronAssessmentParamsJson, 'utf-8'))
-pronAssessmentParams = str(pronAssessmentParamsBase64, "utf-8")
+# Build request
+url = f"https://{region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1"
+url = f"{url}?format=detailed&language={locale}&X-ConnectionId={session_id}"
+headers = {
+    "Accept": "application/json;text/xml",
+    "Connection": "Keep-Alive",
+    "Content-Type": "audio/wav; codecs=audio/pcm; samplerate=16000",
+    "Ocp-Apim-Subscription-Key": subscription_key,
+    "Pronunciation-Assessment": pron_assessment_params,
+    "Transfer-Encoding": "chunked",
+    "Expect": "100-continue",
+}
 
-# build request
-url = "https://%s.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language=en-us" % region
-headers = { 'Accept': 'application/json;text/xml',
-            'Connection': 'Keep-Alive',
-            'Content-Type': 'audio/wav; codecs=audio/pcm; samplerate=16000',
-            'Ocp-Apim-Subscription-Key': subscriptionKey,
-            'Pronunciation-Assessment': pronAssessmentParams,
-            'Transfer-Encoding': 'chunked',
-            'Expect': '100-continue' }
+print(f"II URL: {url}")
+print(f"II Config: {pron_assessment_params_json}")
 
-audioFile = open('../goodmorning.pcm', 'rb')
+# Send request with chunked data
+response = requests.post(url=url, data=get_chunk(audio_file), headers=headers)
+get_response_time = time.time()
+audio_file.close()
 
-# send request with chunked data
-response = requests.post(url=url, data=get_chunk(audioFile), headers=headers)
-getResponseTime = time.time()
-audioFile.close()
+# Show Session ID
+print(f"II Session ID: {session_id}")
 
-resultJson = json.loads(response.text)
-print(json.dumps(resultJson, indent=4))
+if response.status_code != 200:
+    print(f"EE Error code: {response.status_code}")
+    print(f"EE Error message: {response.text}")
+    exit()
+else:
+    print(f"II Response: {response.json()}")
 
-latency = getResponseTime - uploadFinishTime
-print("Latency = %sms" % int(latency * 1000))
+latency = get_response_time - upload_finish_time
+print(f"II Latency: {int(latency * 1000)}ms")