I tried to use MeloTTS as an API in a hurry, but it didn't install properly on my Mac environment.
So I set up my environment with Docker and ran MeloTTS there, but that only gave me a local Gradio web UI with no API.
It looks like MeloTTS is still working on an official API, so I wrote my own local fetch code that mimics the Gradio client's calls to make it usable in the meantime.
This code calls the MeloTTS instance running inside the Docker container, so that an external server can use it like an API.
I used the docker image below because the official docker builds weirdly and doesn't work well for me, but the script below should work even if you use the official docker image.
https://hub.docker.com/r/darren2046/melotts
import { nanoid } from "nanoid";
// Base URL that the `/queue/join` and `/queue/data` requests in this file are sent to.
const BASE_URL = "http://localhost:8888";
/**
 * Runs one text-to-speech request end to end.
 *
 * A fresh session hash is generated with `nanoid()` for every call. The same
 * hash is used first to enqueue the job and then to listen for its result.
 *
 * @param request - Object carrying `text`, `language` and `speakerName`,
 *   all forwarded unchanged to `joinQueue`.
 * @returns Whatever `listenToTextToSpeechQueue` resolves with for this
 *   session (the audio URL of the completed job).
 */
export const textToSpeech = async (request: {
  text: string;
  language: string;
  speakerName: string;
}) => {
  const { text, language, speakerName } = request;
  const sessionHash = nanoid();

  // Enqueue first; the returned event id is not needed here.
  await joinQueue({ sessionHash, text, language, speakerName });

  // The resolved value is the audio's HTTP URL — this is the point where a
  // caller would play it.
  const audioUrl = await listenToTextToSpeechQueue(sessionHash);
  return audioUrl;
};
/**
 * Enqueues a text-to-speech job by POSTing a JSON payload to
 * `${BASE_URL}/queue/join`.
 *
 * @param props - `language`, `text` and `speakerName` are sent as positional
 *   entries of the payload's `data` array; `sessionHash` is sent as
 *   `session_hash`.
 * @returns The `event_id` field of the JSON response.
 * @throws Error when the server answers with a non-2xx HTTP status, so that a
 *   failed enqueue is reported instead of being parsed as a success.
 */
const joinQueue = async (props: {
  text: string;
  sessionHash: string;
  language: string;
  speakerName: string;
}): Promise<string> => {
  const payload = {
    // Positional inputs: language, text, the literal 1, speaker name.
    // NOTE(review): the literal 1 is presumably the speed setting of the
    // MeloTTS Gradio app — confirm against the UI definition.
    data: [props.language, props.text, 1, props.speakerName],
    event_data: null,
    // NOTE(review): fn_index / trigger_id look like values copied from the
    // Gradio client's own request; they may differ between app versions.
    fn_index: 1,
    trigger_id: 8,
    session_hash: props.sessionHash,
  };

  const response = await fetch(`${BASE_URL}/queue/join`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  // fetch() only rejects on network failure; HTTP error statuses must be
  // checked explicitly before trusting the body.
  if (!response.ok) {
    throw new Error(
      `Failed to join queue: ${response.status} ${response.statusText}`
    );
  }

  const json = (await response.json()) as { event_id: string };
  return json.event_id;
};
/**
 * Opens an `EventSource` on `${BASE_URL}/queue/data` for the given session
 * and waits for the job's completion message.
 *
 * Messages whose `msg` is not `"process_completed"` are ignored. On a
 * completion message the stream is closed and the promise settles:
 * - resolves with `output.data[0].url` when `success` is truthy and a URL is
 *   present;
 * - rejects with an `Error` when `success` is falsy or no URL is present;
 * - rejects with the thrown error when the message cannot be parsed.
 *
 * @param sessionHash - Session identifier sent as the `session_hash` query
 *   parameter; it is URL-encoded before being placed in the query string.
 * @returns A promise for the audio URL of the completed job.
 * @throws (as rejection) an `Error` for the cases above, or the raw event
 *   object passed to `EventSource.onerror` when the stream itself fails.
 */
export const listenToTextToSpeechQueue = (
  sessionHash: string
): Promise<string> => {
  return new Promise((resolve, reject) => {
    const eventSource = new EventSource(
      `${BASE_URL}/queue/data?session_hash=${encodeURIComponent(sessionHash)}`
    );

    eventSource.onmessage = (event) => {
      try {
        const data: QueueResponse = JSON.parse(event.data);

        // Anything other than the completion message is skipped.
        if (data.msg !== "process_completed") {
          return;
        }

        // A completed-but-failed job must settle the promise; otherwise the
        // caller would wait until the stream errors out on its own.
        if (!data.success) {
          throw new Error("Text-to-speech job completed unsuccessfully");
        }

        const audioUrl = data.output?.data?.[0]?.url;
        if (!audioUrl) {
          throw new Error("No audio URL in completed response");
        }

        eventSource.close();
        resolve(audioUrl);
      } catch (error) {
        eventSource.close();
        reject(error);
      }
    };

    eventSource.onerror = (error) => {
      // Close explicitly so the EventSource does not keep reconnecting.
      eventSource.close();
      reject(error);
    };
  });
};
/** One entry of `QueueResponse.output.data`. */
interface QueueOutputFile {
  path: string;
  /** Read by `listenToTextToSpeechQueue` as the resulting audio URL. */
  url: string;
  size: number | null;
  orig_name: string;
  mime_type: string | null;
  is_stream: boolean;
  meta: { _type: string };
}

/** The `output` section of a queue message. */
interface QueueOutput {
  data: QueueOutputFile[];
  is_generating: boolean;
  duration: number;
  average_duration: number;
}

/** One entry of `QueueResponse.progress_data`. */
interface QueueProgressEntry {
  index: number;
  length: number;
  unit: string;
  progress: number | null;
  desc: string | null;
}

/**
 * Shape of a JSON message parsed from the `/queue/data` event stream.
 *
 * The code in this file only reads `msg`, `success` and
 * `output.data[0].url`; the other fields are typed but unused here.
 * NOTE(review): their meanings come from the Gradio queue protocol — confirm
 * against the Gradio version the server runs.
 */
export interface QueueResponse {
  /** Message kind; `"process_completed"` marks the final message of a job. */
  msg: string;
  event_id: string;
  output?: QueueOutput;
  success?: boolean;
  rank?: number;
  queue_size?: number;
  rank_eta?: number;
  eta?: number;
  progress_data?: QueueProgressEntry[];
}
Feel free to adapt this into your own code. It is MIT-licensed, of course. Thanks to my company for permission to share. LLAMI