# Edge text to speech

from json import dumps as json_stringify, loads as json_prase
from ws4py.client.threadedclient import WebSocketClient
from cryptography.fernet import Fernet
from xml.etree import ElementTree
from datetime import datetime
import ws4py
import time
import os


def get_file(path):
    """Return the full contents of the UTF-8 encoded text file at *path*."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def decode_token(key, enc_token):
    """Decrypt *enc_token* with the Fernet *key* and return it as text.

    The decrypted bytes are decoded with ``bytes.decode()`` defaults.
    """
    decrypted = Fernet(key).decrypt(enc_token)
    return decrypted.decode()


# Current date and time, as a stringified POSIX timestamp.
now = str(datetime.now().timestamp())

# Encryption key used to decrypt the token below.
key = b'f5mFTzPJ7g-5ZyzmbM5wWVeJpIG5GtPkLxwizeKKCZ4='

# Encrypted token (adjacent bytes literals are concatenated).
enc_token = (
    b'gAAAAABglyvGOx1ZHrms20ckxjAgVqhC'
    b'eyLqOFXy3uQ0HyfVpkqfc3Yo18vgXY'
    b'Ids2jp2bYyVR2JheA9b3jAZmK2Pxhm'
    b'm7hfwGa_xfsgqjHQLq32LYIM4eRI4v'
    b'kaskhiOb2apeadLaOZ'
)

# Decrypt the token.
token = decode_token(key, enc_token)

# WebSocket endpoint; the decrypted token is passed as a query parameter.
host = "wss://speech.platform.bing.com/"
path = "consumer/speech/synthesize/readaloud/edge/v1"
endpoint_url = "".join((host, path, "?", "TrustedClientToken", "=", token))

# Output file name, stamped with the local time at import (minute resolution).
timestr = time.strftime("%Y%m%d-%H%M")
filename = "".join(('sample-', timestr, '.mp3'))

# Audio chunks collected by Microsofts_tts.received_message.
file = []


class Microsofts_tts(WebSocketClient):
    """WebSocket client that requests speech for one text and saves the MP3.

    On connect it sends a ``speech.config`` message followed by an SSML
    request.  Audio chunks received from the server are appended to the
    module-level ``file`` list; when the ``turn.end`` message arrives the
    chunks are written to the module-level ``filename`` and the connection
    is closed.
    """

    # Default locale, used when an instance did not go through __init__.
    lang = 'he-IL'

    def __init__(self, url, text, voice_name='AvriNeural', lang='he-IL'):
        """Store the request parameters and initialise the WebSocket client.

        Args:
            url: WebSocket endpoint URL.
            text: Text to synthesize.
            voice_name: Voice identifier, e.g. ``'AvriNeural'`` or
                ``'XiaoxiaoNeural'``.
            lang: Locale of the voice and of the SSML document, e.g.
                ``'he-IL'`` or ``'zh-CN'``.
        """
        self.text = text
        self.voice_name = voice_name
        self.lang = lang

        super(Microsofts_tts, self).__init__(url)

    def opened(self):
        """Send the synthesis configuration and then the SSML request."""
        conf_txt = self.set_configs()
        text = self.set_text(self.text)

        self.send(conf_txt)
        self.send(text)

    def received_message(self, m):
        """Collect audio chunks and finish the request on ``turn.end``.

        Audio bodies are appended to the module-level ``file`` list.  When
        the ``turn.end`` JSON message arrives, the collected chunks are
        written to ``filename``, the socket is closed and, where
        ``os.startfile`` exists (Windows), the file is opened.
        """
        r = self.make_response(m)
        headers = r["headers"]
        content_type = headers.get("Content-Type")

        if content_type == "audio/mpeg":
            if r["body"]:
                file.append(r["body"])
            return

        is_turn_end = (
            content_type == "application/json; charset=utf-8"
            and headers.get("Path") == "turn.end"
        )
        if not is_turn_end:
            return

        with open(filename, "wb") as f:
            f.write(b"".join(file))
        self.close()

        # os.startfile only exists on Windows; elsewhere the file is still
        # written, it just is not opened automatically.
        if hasattr(os, "startfile"):
            os.startfile(os.path.join(os.getcwd(), filename), 'open')

    @staticmethod
    def _parse_headers(lines):
        """Turn ``Name:value`` text lines into a dict, skipping other lines.

        Only the first ``:`` separates name from value, so values that
        contain colons themselves are kept whole.
        """
        headers = {}
        for line in lines:
            name, sep, value = line.partition(":")
            if sep:
                headers[name.strip()] = value.strip()
        return headers

    def make_response(self, m):
        """Split a WebSocket message into headers and body.

        Args:
            m: A ws4py ``TextMessage`` or ``BinaryMessage``.

        Returns:
            A dict ``{"headers": dict, "body": ...}``.  For text messages
            the body is the parsed JSON payload (the raw text if it is not
            valid JSON, ``None`` if there is no payload).  For binary
            messages the body is the bytes following the header block, set
            only when a ``Content-Type`` header is present.  Any other
            message type yields empty headers and a ``None`` body.
        """
        res = {
            "headers": {},
            "body": None
        }

        if isinstance(m, ws4py.messaging.TextMessage):
            # Headers and payload are separated by the first blank line.
            head, _, payload = str(m).partition("\r\n\r\n")
            res["headers"] = self._parse_headers(head.split("\r\n"))

            if payload:
                try:
                    res["body"] = json_prase(payload)
                except ValueError:
                    res["body"] = payload
            return res

        if isinstance(m, ws4py.messaging.BinaryMessage):
            data = m.data

            # A binary frame starts with a 2-byte big-endian length of the
            # header block (b"\x00\x80" == 128 for the usual four headers).
            # Slicing by that length keeps the audio intact, whereas
            # stripping b"\x00\x80" everywhere or splitting the whole frame
            # on CRLF would also cut into the MP3 bytes.
            header_len = int.from_bytes(data[:2], "big")
            body_start = 2 + header_len

            head = data[2:body_start].decode("utf-8")
            headers = self._parse_headers(head.split("\r\n"))
            res["headers"] = headers

            if headers.get("Content-Type"):
                res["body"] = bytes(data[body_start:])
            return res

        return res

    def set_text(self, text):
        """Build the SSML request message for *text*.

        Returns:
            The request headers, a blank line and the serialized SSML
            document, using ``self.lang`` and ``self.voice_name`` for the
            voice (e.g. ``(he-IL, AvriNeural)``).
        """
        lang = self.lang
        engine = '(' + lang + ', ' + self.voice_name + ')'

        xml_body = ElementTree.Element('speak', version='1.0')
        # NOTE(review): the W3C SSML namespace is spelled with "http://";
        # this value uses "https://" - confirm the service accepts it.
        xml_body.set('xmlns', 'https://www.w3.org/2001/10/synthesis')
        xml_body.set('xml:lang', lang)

        voice = ElementTree.SubElement(xml_body, 'voice')
        voice.set(
            'name', 'Microsoft Server Speech Text to Speech Voice '
            + engine)

        # Neutral prosody: no change to rate, pitch or volume.
        prosody = ElementTree.SubElement(voice, 'prosody')
        prosody.set('rate', '+0%')
        prosody.set('pitch', '+0Hz')
        prosody.set('volume', '+0%')

        p = ElementTree.SubElement(prosody, 'p')
        p.text = text

        body = ElementTree.tostring(xml_body, 'unicode')

        return "X-RequestId:fe83fbefb15c7739fe674d9f3e81d38f" + "\r\n" \
            + "Content-Type:application/ssml+xml" + "\r\n" \
            + "Path:ssml" + "\r\n" + "\r\n" + body

    def set_configs(self, codec="audio-24khz-48kbitrate-mono-mp3"):
        """Build the ``speech.config`` message selecting the output format.

        Args:
            codec: Value sent as ``outputFormat``.

        Returns:
            The message headers, a blank line and the JSON configuration,
            with sentence and word boundary metadata disabled.
        """
        conf_obj = {
            "context": {
                "synthesis": {
                    "audio": {
                        "metadataoptions": {
                            "sentenceBoundaryEnabled": False,
                            "wordBoundaryEnabled": False
                        },
                        "outputFormat": codec
                    }
                }
            }
        }

        return "Content-Type:application/json; charset=utf-8" + "\r\n" \
            + "Path:speech.config" + "\r\n" + "\r\n" \
            + json_stringify(conf_obj)


if __name__ == '__main__':
    # Sample sentence to synthesize; the text could instead be read from a
    # file in the working directory with get_file().
    text = "שלום ילדים, לילה טוב"
    engine = 'AvriNeural'

    # Connect, then block while the client handles incoming messages.
    ws = Microsofts_tts(endpoint_url, text, voice_name=engine)
    ws.connect()
    ws.run_forever()
MusiCode1/edge-text-to-speech.md