Created
February 6, 2020 21:53
-
-
Save lselden/cde51ca2debdd7c7c16ea4abf621f8d5 to your computer and use it in GitHub Desktop.
TTS using powershell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<# | |
.SYNOPSIS | |
Speak text using SSML | |
.DESCRIPTION | |
Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s. This uses the .NET System.Speech library, which uses the older SAPI5 synthesis system. | |
Based on code from https://github.com/marak/say.js/ | |
.PARAMETER Text | |
(default from pipeline) | |
The Text to speak. Text will automatically be wrapped in <speak> if necessary. | |
.PARAMETER Path | |
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le) | |
.PARAMETER listVoices | |
If -listVoices is passed then this function will just output a list of available | |
voices in the format {languageCode, id, name, ssmlGender} | |
.PARAMETER Voice | |
Name of voice to use. To get list of voices use -listVoices option | |
.PARAMETER Rate | |
Speech rate in range 0.33 - 3 -- 1 = 100% - 2 is twice as fast | |
.PARAMETER Volume | |
Volume in range 0-1, 1 (default) is full volume | |
.PARAMETER SampleRate | |
SampleRate of output WAV file. Default is 24000 | |
.PARAMETER Channels | |
Number of channels of output WAV file. Default is 1 | |
.PARAMETER Lang | |
Language to use. Default is "en-US". | |
.INPUTS | |
System.String. You can pipe the "Text" parameter into the script | |
.OUTPUTS | |
Will play sound to speakers by default, or write to disk if -Path is specified. | |
Output is metadata about result: | |
{voice: string, input: string, rate: double, volume: double, duration: int (duration in milliseconds), marks: Array<{time: int (milliseconds), value: string (mark name attribute)}>} | |
.EXAMPLE | |
PS> ./out-ssml.ps1 "hello world" | |
#> | |
param(
    # Text (plain or SSML) to speak; accepted from the pipeline.
    [Parameter(ValueFromPipeline = $true)]
    [string] $text,

    # Name of the installed voice to select.
    [Parameter(Mandatory = $false)]
    [string] $voice,

    # Speed multiplier, 1.0 = normal.
    [Parameter(Mandatory = $false)]
    [ValidateRange(0.33, 3.0)]
    [double] $rate = 1.0,

    # Volume fraction, 1.0 = full volume.
    [Parameter(Mandatory = $false)]
    [ValidateRange(0.0, 1.0)]
    [double] $volume = 1.0,

    # When set, audio is written to this WAV file instead of the speakers.
    [Parameter(Mandatory = $false)]
    [string] $path,

    # Sample rate and channel count of the WAV file written for -path.
    [Parameter(Mandatory = $false)]
    [int] $sampleRate = 24000,

    [Parameter(Mandatory = $false)]
    [int] $channels = 1,

    # Value used for xml:lang when the input does not carry one.
    [Parameter(Mandatory = $false)]
    [string] $lang = 'en-US',

    # List the enabled voices instead of speaking.
    [Switch] $listVoices
)
begin {
    # System.Speech provides the synthesizer used throughout this script.
    Add-Type -AssemblyName System.speech

    # Name of the <mark> appended to every document; the process block treats
    # the bookmark with this name as the "speech finished" signal.
    $script:finishedBookmarkName = '__psspeak_finished'
    # Duration of the <break> appended just before that final mark.
    $script:endPause = '50ms'

    # Normalises input into a complete SSML document string.
    #   $text - plain text or an SSML document whose root is <speak>.
    #   $lang - value for xml:lang when the root does not already carry one.
    # Returns the outer XML of the root element with version, xml:lang and
    # xmlns set, and a trailing <break> plus the finished <mark> appended.
    # Throws if $text looks like SSML but is not well-formed XML.
    function FixSSML($text, $lang) {
        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis'

        if (-not $text.Trim().StartsWith('<speak')) {
            # Plain text: escape XML special characters, then wrap it.
            $text = [System.Security.SecurityElement]::Escape($text)
            $text = "<speak version=`"1.0`" xml:lang=`"$lang`">$text</speak>"
        }

        # NOTE: the [xml] cast throws on invalid input.
        $dom = [xml]$text

        # Use DocumentElement rather than $dom.speak: PowerShell's XML adapter
        # returns a plain string for an element that has only text and no
        # attributes (e.g. "<speak>hi</speak>"), which has no SetAttribute.
        $root = $dom.DocumentElement

        $root.SetAttribute('version', '1.0')
        if (-not $root.GetAttribute('xml:lang')) {
            $root.SetAttribute('xml:lang', $lang)
        }
        $root.SetAttribute('xmlns', $ssmlNamespace)

        # Create the trailing nodes in the same namespace as the root so they
        # are not emitted in a different namespace than their parent when the
        # caller supplied an already-namespaced <speak>.
        $lastBreak = $dom.CreateElement('break', $root.NamespaceURI)
        $lastBreak.SetAttribute('time', $script:endPause) | Out-Null
        $root.AppendChild($lastBreak) | Out-Null

        $lastMark = $dom.CreateElement('mark', $root.NamespaceURI)
        $lastMark.SetAttribute('name', $script:finishedBookmarkName) | Out-Null
        $root.AppendChild($lastMark) | Out-Null

        return $root.OuterXml
    }

    # One synthesizer is shared by every pipeline item; it is disposed in end.
    $speak = [System.Speech.Synthesis.SpeechSynthesizer]::new()
}
process {
    # -listVoices short-circuits: emit one record per enabled voice and stop.
    if ($listVoices) {
        $voices = $speak.GetInstalledVoices()
        return $voices | ? { $_.Enabled -eq $true } | % {
            $info = $_.VoiceInfo
            [PSCustomObject]@{
                languageCode = $info.Culture.ToString()
                id           = $info.Id
                name         = $info.Name
                ssmlGender   = $info.Gender
            }
        }
    }

    if ($voice) {
        $speak.SelectVoice($voice)
        if ($speak.Voice.Culture.ToString() -ne $lang) {
            # The selected voice wins: use its culture for xml:lang.
            $lang = $speak.Voice.Culture.ToString()
        }
    }

    if ($rate -ne 1.0) {
        # Map the multiplier onto the synthesizer's integer rate, clamped to
        # the -10..10 interval.
        $speak.Rate = [math]::max(-10,
            [math]::Min(
                [math]::Round((9.0686 * [math]::Log($rate)) - 0.1806),
                10
            )
        )
    }

    if ($volume -ne 1.0) {
        # 0..1 fraction -> 0..100 integer.
        $speak.Volume = [int]($volume * 100)
    }

    if ($path) {
        # Relative paths are resolved against the current PowerShell location.
        $filepath = if ([System.IO.Path]::IsPathRooted($path)) {
            $path
        } else {
            [System.IO.Path]::GetFullPath((join-path $pwd $path))
        }
        $format = [System.Speech.AudioFormat.SpeechAudioFormatInfo]::new($sampleRate, 16, $channels)
        $speak.SetOutputToWaveFile($filepath, $format)
    }

    $rawtext = $text
    $text = FixSSML $text $lang

    # Result record for this input; the bookmark handler fills in duration
    # and marks while SpeakSsml runs. Only the first 2048 characters of the
    # original input are echoed back.
    $script:output = [PSCustomObject]@{
        voice    = $speak.Voice.Name
        input    = $rawtext.Substring(0, [math]::Min($rawtext.Length, 2048))
        rate     = $rate
        volume   = $volume
        duration = [int]0
        marks    = @()
    }

    # The synthesizer is shared across pipeline items, so the handler must be
    # attached only once; attaching it per item would record every mark once
    # per previously processed item. The handler reads $script:output when it
    # fires, so it always targets the record of the item being spoken.
    if (-not $script:bookmarkHandlerAttached) {
        $speak.Add_BookmarkReached({
            param(
                [object]$sender,
                [System.Speech.Synthesis.BookmarkReachedEventArgs]$evt
            )
            $name = $evt.Bookmark
            # Whole milliseconds, as documented in the script help.
            $time = [int]$evt.AudioPosition.TotalMilliseconds
            if ($name -eq $script:finishedBookmarkName) {
                # The mark appended by FixSSML: its position is the duration.
                $script:output.duration = $time
            } else {
                $script:output.marks += [pscustomobject]@{
                    time  = $time
                    value = $name
                }
            }
        })
        $script:bookmarkHandlerAttached = $true
    }

    try {
        $speak.SpeakSsml($text)
    } catch {
        Write-Error "Fail! $_"
        # Show the SSML that was rejected to help diagnose the failure.
        Write-Host $text
    }

    # Detach from the wave file (if any) after speaking.
    $speak.SetOutputToNull()

    $script:output
}
end {
    # Detach the shared synthesizer from its output, then release it.
    $speak.SetOutputToNull()
    $speak.Dispose()
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<# | |
.SYNOPSIS | |
Speak text using SSML. | |
.DESCRIPTION | |
Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s | |
It uses the newer WinRT (Universal Windows Runtime) to perform synthesis, rather than the older SAPI5 .Net engine. Therefore, it'll only work on Windows 10. | |
Based on code from https://github.com/marak/say.js/ | |
.PARAMETER Text | |
(default from pipeline) | |
The Text to speak. Text will automatically be wrapped in <speak> if necessary. | |
.PARAMETER Path | |
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le) | |
.PARAMETER listVoices | |
If -listVoices is passed then this function will just output a list of available | |
voices in the format {languageCode, id, name, ssmlGender} | |
.PARAMETER Voice | |
Name of voice to use. To get list of voices use -listVoices option | |
.PARAMETER Rate | |
Speech rate in range 0.33 - 3 -- 1 = 100% - 2 is twice as fast | |
.PARAMETER Volume | |
Volume in range 0-1, 1 (default) is full volume | |
.PARAMETER SampleRate | |
SampleRate of output WAV file. Default is 24000 | |
.PARAMETER Channels | |
Number of channels of output WAV file. Default is 1 | |
.PARAMETER Lang | |
Language to use. Default is the default voice's language. | |
.PARAMETER SpeechMarkTypes | |
Marks to include in output. Default is "sentence,words,ssml". Set to "" to not output any marks.
.INPUTS | |
System.String. You can pipe the "Text" parameter into the script | |
.OUTPUTS | |
Will play sound to speakers by default, or write to disk if -Path is specified. | |
Output is the list of speech marks found in the synthesized stream (nothing is output when -SpeechMarkTypes is ""):
Array<{type: string ("word" | "sentence" | "viseme" | "ssml" | "unknown"), time: int (milliseconds), value: string (cue text), start?: int, end?: int (start/end position in the input, when the cue reports one)}>
.EXAMPLE | |
PS> ./say-ssml.ps1 "hello world" | |
#> | |
param(
    # Text (plain or SSML) to speak; accepted from the pipeline.
    [Parameter(ValueFromPipeline = $true)]
    [string] $text,

    # Pattern matched against the voices' display names.
    [Parameter(Mandatory = $false)]
    [string] $voice,

    # Speed multiplier, 1.0 = normal.
    [Parameter(Mandatory = $false)]
    [ValidateRange(0.33, 3.0)]
    [double] $rate = 1.0,

    # Volume fraction, 1.0 = full volume.
    [Parameter(Mandatory = $false)]
    [ValidateRange(0.0, 1.0)]
    [double] $volume = 1.0,

    # When set, audio is written to this file instead of being played.
    [Parameter(Mandatory = $false)]
    [string] $path,

    [Parameter(Mandatory = $false)]
    [int] $sampleRate = 24000,

    [Parameter(Mandatory = $false)]
    [int] $channels = 1,

    [Parameter(Mandatory = $false)]
    [string] $lang,

    # Comma-separated mark kinds to request; "" suppresses mark output.
    [Parameter(Mandatory = $false)]
    [string] $speechMarkTypes = "sentence,words,ssml",

    # List the available voices instead of speaking.
    [Switch] $listVoices
)
begin { | |
# Interop assembly that provides the AsTask extension methods used below.
Add-Type -AssemblyName System.Runtime.WindowsRuntime

# Reference each WinRT type once and discard the result; presumably this makes
# the types resolvable before they are used later in the script.
[void][Windows.Foundation.IAsyncOperation`1, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Foundation.IAsyncOperationWithProgress`2, Windows.Foundation, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.VoiceInformation, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
[void][Windows.Media.Core.SpeechCue, Windows.Media.Core, ContentType=WindowsRuntime]
[void][Windows.Media.Core.TimedMetadataTrack, Windows.Media.Core, ContentType=WindowsRuntime]

# All single-parameter overloads of AsTask.
$singleArgAsTask = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
    $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1
}

# Open generic AsTask for IAsyncOperation<T> (used by Await).
$asTaskGeneric = ($singleArgAsTask | Where-Object {
    $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1'
})[0]

# Open generic AsTask for IAsyncOperationWithProgress<T, P> (used by AwaitWithProgress).
$asTaskGeneric2 = ($singleArgAsTask | Where-Object {
    $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2'
})[0]
# Blocks until a WinRT IAsyncOperation<T> completes and returns its result.
#   $WinRtTask  - the WinRT async operation to wait for.
#   $ResultType - the operation's result type, used to close the generic AsTask.
Function Await($WinRtTask, $ResultType) {
    $converter = $asTaskGeneric.MakeGenericMethod($ResultType)
    $pending = $converter.Invoke($null, @($WinRtTask))
    # -1 = wait without a timeout; Wait's return value is discarded so it
    # does not leak into the function output.
    [void]$pending.Wait(-1)
    return $pending.Result
}
# Blocks until a WinRT IAsyncOperationWithProgress<T, P> completes.
# Unlike Await, nothing is returned to the caller.
#   $WinRtTask   - the WinRT async operation to wait for.
#   $ResultType1 - the operation's result type.
#   $ResultType2 - the operation's progress type.
Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
    $converter = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2)
    $pending = $converter.Invoke($null, @($WinRtTask))
    # -1 = wait without a timeout; Wait's return value is discarded.
    [void]$pending.Wait(-1)
}
# Flattens timed-metadata tracks into a list of mark records.
#   $timedTextTracks - collection of tracks; each has an Id and a Cues list.
# Emits one object per cue that has non-empty Text:
#   type  - "word", "sentence", "viseme", "ssml", or "unknown" (from track Id)
#   time  - cue start time in whole milliseconds
#   value - cue text
#   start/end - present only when the cue reports StartPositionInInput
Function ParseMarkers($timedTextTracks) {
    $list = [System.Collections.Generic.List[object]]::new()

    foreach ($track in $timedTextTracks) {
        $markType = switch ($track.Id) {
            "SpeechWord" { "word" }
            "SpeechSentence" { "sentence" }
            "SpeechViseme" { "viseme" }
            "SpeechBookmark" { "ssml" }
            Default { "unknown" }
        }

        foreach ($cue in $track.Cues) {
            # Cues without text carry nothing worth reporting.
            if (-not $cue.Text) {
                continue
            }

            $fields = [ordered]@{
                type  = $markType
                time  = [int]$cue.StartTime.TotalMilliseconds
                value = $cue.Text
            }

            # Compare against $null explicitly: a position of 0 is valid and
            # must not be treated as "no position".
            if ($null -ne $cue.StartPositionInInput) {
                $fields.start = $cue.StartPositionInInput
                $fields.end = $cue.EndPositionInInput
            }

            $list.Add([PSCustomObject]$fields)
        }
    }

    $list
}
# Plays a complete WAV file held in memory and blocks until playback ends.
#   $bytes - the WAV file contents.
Function PlayWave([System.Byte[]]$bytes) {
    $memstream = [System.IO.MemoryStream]::new($bytes)
    try {
        $player = [System.Media.SoundPlayer]::new($memstream)
        try {
            $player.PlaySync()
        } finally {
            # Release the player even if playback throws.
            $player.Dispose()
        }
    } finally {
        $memstream.Dispose()
    }
}
# Writes the given bytes to a file, replacing any existing content.
#   $path  - destination; a relative path is resolved against the current
#            PowerShell location ($pwd).
#   $bytes - the file contents.
Function SaveWave($path, [System.Byte[]]$bytes) {
    if ([System.IO.Path]::IsPathRooted($path)) {
        $filepath = $path
    } else {
        $combined = join-path $pwd $path
        $filepath = [System.IO.Path]::GetFullPath($combined)
    }
    [System.IO.File]::WriteAllBytes($filepath, $bytes)
}
# Cache the voice list once for all pipeline items. The process block reads
# $script:voices, so the fallback below must write to the same scoped variable.
$script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices
if (-not $script:voices.Id) {
    # No usable entries came back: fall back to a one-element list holding
    # the default voice.
    Write-Debug "Unable to get installed voices list. Script will only use default voice"
    $script:voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice)
}
} | |
process {
    # -listVoices short-circuits: emit one record per known voice and stop.
    if ($listVoices) {
        return $script:voices | % {
            [PSCustomObject]@{
                languageCode = $_.Language
                id           = $_.DisplayName
                name         = $_.Description
                ssmlGender   = $_.Gender
            }
        }
    }

    if (-not $text) {
        Write-Error "No text specified"
        return
    }

    $speech = $null
    $stream = $null
    try {
        $speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
        $speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min

        if ($voice) {
            # First voice whose display name matches the pattern (case-insensitive).
            $voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
            if ($voiceInfo) {
                $speech.Voice = $voiceInfo
            } else {
                Write-Debug "No voice found matching $voice"
            }
        }

        if ($speechMarkTypes -match 'sentence') {
            $speech.Options.IncludeSentenceBoundaryMetadata = $true
        }
        # 'word' matches both the documented "word" and the default "words".
        if ($speechMarkTypes -match 'word') {
            $speech.Options.IncludeWordBoundaryMetadata = $true
        }

        # Min/Max instead of [math]::Clamp, which is not available on the
        # .NET Framework that Windows PowerShell runs on.
        if ($rate -ne 1.0) {
            $speech.Options.SpeakingRate = [math]::Max(0.5, [math]::Min($rate, 6.0))
        }
        if ($volume -ne 1.0) {
            # AudioVolume is a member of the synthesizer's Options object.
            $speech.Options.AudioVolume = [math]::Max(0.0, [math]::Min($volume, 1.0))
        }

        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis'
        if (-not $text.Trim().StartsWith('<speak')) {
            # Plain text: escape XML special characters, then wrap it.
            $text = [System.Security.SecurityElement]::Escape($text)
            $text = "<speak version=`"1.0`">$text</speak>"
        }

        # NOTE: the [xml] cast throws on invalid input.
        $dom = [xml]$text
        # DocumentElement rather than $dom.speak: PowerShell's XML adapter
        # returns a plain string for an attribute-less, text-only element
        # such as "<speak>hi</speak>", which has no SetAttribute method.
        $root = $dom.DocumentElement
        $root.SetAttribute('version', '1.0')
        # Honour -lang when given; otherwise use the selected voice's language.
        $ssmlLang = if ($lang) { $lang } else { $speech.Voice.Language }
        $root.SetAttribute('xml:lang', $ssmlLang)
        $root.SetAttribute('xmlns', $ssmlNamespace)
        $text = $root.OuterXml

        # Actually speak: synthesize the SSML into an in-memory stream.
        try {
            $stream = Await ($speech.SynthesizeSsmlToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream])
        } catch {
            $failure = $_
            Write-Error "Error creating stream $failure"
            # Report each wrapped exception; the underlying failure is nested
            # inside the exception that surfaced from the blocking wait.
            $cause = $failure.Exception.InnerException
            while ($cause) {
                Write-Error "$($cause.GetType().Name), $($cause.Message)"
                $cause = $cause.InnerException
            }
            return
        }

        if (-not $stream.Size) {
            Write-Error "Error Creating Synthesis Stream - no results"
            return
        }

        if ($speechMarkTypes -ne '') {
            # Emit the marks as the script's pipeline output.
            $markers = ParseMarkers $stream.TimedMetadataTracks
            $markers
        }

        # Copy the whole stream into a managed byte array through an IBuffer
        # view over that array.
        $bytes = [array]::CreateInstance([byte], $stream.Size)
        [Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes)
        AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])

        if ($path) {
            SaveWave $path $bytes
        } else {
            PlayWave $bytes
        }
    } finally {
        # A synthesizer and stream are created per pipeline item, so release
        # them per item. The variables are cleared so later cleanup that tests
        # them for truthiness does not dispose them again.
        if ($stream) {
            $stream.Dispose()
        }
        $stream = $null
        if ($speech) {
            $speech.Dispose()
        }
        $speech = $null
    }
}
end {
    # Release whatever the last processed item left behind: the stream
    # first, then the synthesizer. Unset variables are skipped.
    foreach ($resource in @($stream, $speech)) {
        if ($resource) {
            $resource.Dispose()
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment