New speech api.

This commit is contained in:
retoor 2025-01-18 11:56:40 +01:00
commit b6871a85c1
11 changed files with 549 additions and 0 deletions

166
.gitignore vendored Normal file
View File

@ -0,0 +1,166 @@
.vscode
.history
*.db*
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

8
Makefile Normal file
View File

@ -0,0 +1,8 @@
# Developer shortcuts. None of the targets produce a file with their own
# name, so declare them phony to keep make from skipping them when a file or
# directory called "install", "tts" or "stt" exists.
.PHONY: install tts stt

# Create a local virtual environment and install the package in editable mode.
install:
	python3 -m venv .venv
	./.venv/bin/pip install -e .

# Run the interactive text-to-speech prompt (console script rtts).
tts:
	./.venv/bin/rtts

# Run the speech-to-text listener (console script rstt).
stt:
	./.venv/bin/rstt

3
pyproject.toml Normal file
View File

@ -0,0 +1,3 @@
# PEP 517 build configuration: the package is built with setuptools.
# Project metadata itself lives in setup.cfg.
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

6
requirements.txt Normal file
View File

@ -0,0 +1,6 @@
# Mirrors install_requires in setup.cfg; keep the two lists in sync.
pyaudio
SpeechRecognition
google-cloud-speech
google-cloud-texttospeech
google-auth
pygame
# aiohttp is imported by rspeech.gcloud for the text-to-speech HTTP call.
aiohttp
packaging

32
setup.cfg Normal file
View File

@ -0,0 +1,32 @@
[metadata]
name = rspeech
version = 1.0.0
description = Library for speech processing
author = retoor
author_email = retoor@molodetz.nl
license = MIT
long_description = file: README.md
long_description_content_type = text/markdown

[options]
packages = find:
package_dir =
    = src
# rspeech.gcloud imports functools.cache, which was added in Python 3.9.
python_requires = >=3.9
install_requires =
    pyaudio
    SpeechRecognition
    google-cloud-speech
    google-cloud-texttospeech
    google-auth
    pygame
    aiohttp
    packaging

[options.packages.find]
where = src

[options.entry_points]
console_scripts =
    rtts = rspeech.tts:main
    rstt = rspeech.stt:main

0
src/rspeech/__init__.py Normal file
View File

0
src/rspeech/__main__.py Normal file
View File

137
src/rspeech/gcloud.py Normal file
View File

@ -0,0 +1,137 @@
# Written by retoor@molodetz.nl
# This script interfaces with Google's Text-to-Speech API to synthesize spoken audio from text.
# It also includes functionality to handle Google authentication tokens.
# External imports:
# - aiohttp: Asynchronous HTTP requests.
# - google-auth packages: For managing Google authentication tokens.
# - rspeech.play: Local module used to play the synthesized audio.
# MIT License
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import aiohttp
import asyncio
from urllib.parse import urlencode
import base64
import sys
from functools import cache
from google.oauth2 import id_token
from google.auth.transport import requests
import google.auth
from rspeech.play import play_audio
import google.oauth2.credentials
import uuid
import pathlib
from rspeech.play import play_audio
# Substrings removed from text before it is sent to speech synthesis
# (mostly Markdown markup). Entries need not be single characters: "---" is
# stripped as a whole.
IGNORE_CHARS = ["*", "#", "`","'",'"',"\\","/","---"]
# Last access token handed out and the moment it stops being usable.
# A plain ``functools.cache`` would keep returning the first token for the
# whole lifetime of the process, long after the token endpoint's stated
# expiry, so the expiry is tracked explicitly instead.
_GOOGLE_TOKEN_CACHE = {"token": None, "expiry": None}
# Refresh this many seconds before the reported expiry to absorb clock skew
# and request latency.
_GOOGLE_TOKEN_EXPIRY_MARGIN = 60


def google_token():
    """Return an OAuth2 access token for the Google Cloud APIs.

    The token is obtained by exchanging the refresh token of the application
    default credentials (``google.auth.default()``) at Google's token
    endpoint for the ``cloud-platform`` scope. The result is cached and
    reused until shortly before the expiry reported by the endpoint; after
    that a new token is requested transparently.

    NOTE(review): this reads ``refresh_token``, ``client_id`` and
    ``client_secret`` from the default credentials, so it presumably only
    works with user credentials (not service accounts) -- confirm.

    Returns:
        str: A bearer access token.
    """
    import datetime

    import google.auth.transport.urllib3 as google_auth_urllib3
    import urllib3
    from google.oauth2 import _client as google_auth_client

    now = datetime.datetime.now(datetime.timezone.utc)
    cached_token = _GOOGLE_TOKEN_CACHE["token"]
    cached_expiry = _GOOGLE_TOKEN_CACHE["expiry"]
    # An unknown expiry (None) keeps the previous cache-forever behaviour.
    if cached_token is not None and (cached_expiry is None or now < cached_expiry):
        return cached_token

    gcloud_default, _project = google.auth.default()
    request = google_auth_urllib3.Request(urllib3.PoolManager())
    access_token, _, expiry, _ = google_auth_client.refresh_grant(
        request,
        "https://oauth2.googleapis.com/token",
        gcloud_default.refresh_token,
        gcloud_default.client_id,
        gcloud_default.client_secret,
        ["https://www.googleapis.com/auth/cloud-platform"],
    )

    if expiry is not None:
        # Assumes a timezone-naive expiry is expressed in UTC -- TODO confirm
        # against the installed google-auth version.
        if expiry.tzinfo is None:
            expiry = expiry.replace(tzinfo=datetime.timezone.utc)
        expiry -= datetime.timedelta(seconds=_GOOGLE_TOKEN_EXPIRY_MARGIN)

    _GOOGLE_TOKEN_CACHE["token"] = access_token
    _GOOGLE_TOKEN_CACHE["expiry"] = expiry
    return access_token
async def tts(
    text: str,
    google_project: str = "lisa-448004",
    language_code: str = "nl-NL",
    ssml_gender: str = "FEMALE",
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    name: str = "nl-NL-Standard-D",
    ignore_chars=None,
):
    """Synthesize *text* with Google Cloud Text-to-Speech and play it.

    Every entry of ``ignore_chars`` is removed from the text first; if
    nothing but whitespace is left, the function returns without contacting
    the API. Otherwise the MP3 returned by the API is written to a uniquely
    named file in the current working directory, played with ``play_audio``
    and removed again.

    Args:
        text: The text to speak.
        google_project: Project sent in the ``X-Goog-User-Project`` header.
        language_code: Voice language code, e.g. "nl-NL".
        ssml_gender: Value for the voice's ``ssmlGender`` field.
        speaking_rate: Value for the ``speakingRate`` audio setting.
        pitch: Value for the ``pitch`` audio setting.
        name: Name of the voice to use.
        ignore_chars: Substrings to strip from the text. Defaults to
            ``IGNORE_CHARS``.

    Returns:
        None.

    Raises:
        RuntimeError: If the API response does not contain audio.
    """
    if ignore_chars is None:
        ignore_chars = IGNORE_CHARS

    # Remove markdown so it is not read out loud.
    for char in ignore_chars:
        text = text.replace(char, "")
    text = text.strip()
    if not text:
        return

    url = "https://texttospeech.googleapis.com/v1/text:synthesize"
    headers = {
        "Authorization": f"Bearer {google_token()}",
        "Content-Type": "application/json",
        "X-Goog-User-Project": google_project,
    }
    data = {
        "input": {"text": text},
        "voice": {
            "languageCode": language_code,
            "name": name,
            "ssmlGender": ssml_gender,
        },
        "audioConfig": {
            "audioEncoding": "MP3",
            "speakingRate": speaking_rate,
            "pitch": pitch,
        },
    }

    async with aiohttp.ClientSession() as session:
        # The inner context manager releases the connection once the body
        # has been read.
        async with session.post(url, headers=headers, json=data) as response:
            status = response.status
            response_json = await response.json()

    audio_content = response_json.get("audioContent")
    if not audio_content:
        # Without this check a failed request surfaces as an unrelated
        # AttributeError on None further down.
        raise RuntimeError(
            f"Text-to-Speech request failed with HTTP {status}: {response_json}"
        )

    file = pathlib.Path(str(uuid.uuid4()) + ".mp3")
    try:
        file.write_bytes(base64.b64decode(audio_content))
        # NOTE: play_audio is synchronous, so the event loop is blocked for
        # the duration of the playback.
        play_audio(file)
    finally:
        # Remove the temporary file even when decoding or playback fails.
        file.unlink(missing_ok=True)
    return
def oud():
    """Transcribe an audio file with a speech client and print the results.

    The file is read in full, submitted as 16 kHz LINEAR16 "en-US" audio and
    the first alternative of every result is printed.

    NOTE(review): neither ``speech`` nor ``file_path`` is defined or imported
    in the visible part of this module, so calling this presumably raises
    NameError. The name ("oud") suggests leftover legacy code -- confirm
    whether it can be removed.
    """
    speech_client = speech.SpeechClient()

    with open(file_path, "rb") as recording:
        raw_audio = recording.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    recognition = speech_client.recognize(
        config=recognition_config, audio=recognition_audio
    )
    for entry in recognition.results:
        best_alternative = entry.alternatives[0]
        print("Transcript:", best_alternative.transcript)
async def main_async():
    """Smoke-test the Google integration: fetch a token, then speak a sentence."""
    # Fetch the token first so authentication problems surface before the
    # synthesis request. The token is a bearer credential, so it is
    # deliberately not echoed to the terminal.
    google_token()
    print("Google access token acquired.")
    await tts("If you hear this sentence, the google part works fine. Congrats.")
def main():
    """Synchronous entry point: run the smoke test on a fresh event loop."""
    smoke_test = main_async()
    asyncio.run(smoke_test)


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()

76
src/rspeech/play.py Normal file
View File

@ -0,0 +1,76 @@
# Written by retoor@molodetz.nl
# This module plays audio files, either through pygame's mixer or by decoding them with ffmpeg and streaming the raw PCM output to PyAudio.
# Libraries imported: 'pyaudio', 'functools', 'os', 'subprocess', 'sys', 'pygame'
# The MIT License (MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import pyaudio
import functools
import os
import subprocess
import sys
import pygame
def play_audio(filename):
    """Play an audio file through pygame's mixer and block until it finishes.

    Args:
        filename: The file to play, passed as-is to
            ``pygame.mixer.music.load``.
    """
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    try:
        pygame.mixer.music.play()
        # One clock for the whole playback; tick(10) caps the polling loop
        # at roughly ten iterations per second.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    finally:
        # Release the file so callers can delete it afterwards. unload()
        # only exists in pygame 2.0.0 and later, hence the lookup.
        unload = getattr(pygame.mixer.music, "unload", None)
        if unload is not None:
            unload()
def play_audio2(filename):
    """Decode an audio file with ffmpeg and stream it to a PyAudio output.

    ffmpeg converts the file to raw signed 16-bit little-endian PCM on its
    standard output, which is forwarded chunk by chunk to a PyAudio output
    stream opened with the same sample rate and channel count. The call
    blocks until ffmpeg's output is exhausted.

    Args:
        filename: Path of the file to play (str, bytes or path-like).
    """
    sample_rate = 44100
    channels = 2
    sample_width = 2  # bytes per sample, matching the "s16le" output format
    chunk_size = 4096

    ffmpeg_cmd = [
        "ffmpeg",
        "-i", os.fspath(filename),
        "-f", "s16le",
        "-ar", str(sample_rate),
        "-ac", str(channels),
        "pipe:1",
    ]
    # stderr is discarded rather than piped: nothing reads it, and a full
    # pipe would stall ffmpeg and with it the playback loop below.
    process = subprocess.Popen(
        ffmpeg_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        bufsize=10**6,
    )

    p = None
    stream = None
    try:
        p = pyaudio.PyAudio()
        stream = p.open(
            format=p.get_format_from_width(sample_width),
            channels=channels,
            rate=sample_rate,
            output=True,
        )
        # read() returns b"" once ffmpeg closes its end of the pipe.
        read_chunk = functools.partial(process.stdout.read, chunk_size)
        for data in iter(read_chunk, b""):
            stream.write(data)
    finally:
        # Clean up whatever was created, including when opening the audio
        # device failed, so the ffmpeg process is always reaped.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        if p is not None:
            p.terminate()
        process.stdout.close()
        process.wait()

103
src/rspeech/stt.py Normal file
View File

@ -0,0 +1,103 @@
# Written by retoor@molodetz.nl
# This script listens to audio input via a microphone and recognizes speech using the Google API. When run as a command, it prints each recognized phrase.
# Imports:
# - speech_recognition: For microphone capture and speech recognition.
# - rspeech.gcloud: Local module for Google Cloud text-to-speech (imported here, but not used by the functions in this file).
# - logging, asyncio, time: Standard-library modules for logging, running the blocking listener from async code, and pausing between phrases.
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import speech_recognition as sr
from rspeech import gcloud
import logging
import asyncio
import time
# Module-level logger named after this module; this file does not configure
# handlers or levels itself.
logger = logging.getLogger(__name__)
def listen(timeout: int=0, phrase_time_limit: int=0, language: str="nl-NL", adjust_ambiance_seconds: int=1, save_to=None, recognize=True):
    """
    Listen on the microphone until a phrase has been captured and recognized.

    Speech is recognized with the Google API through ``speech_recognition``.
    For this function there are no credentials or gcloud account required.
    The function keeps listening until it has something to return: wait
    timeouts, unintelligible audio and failed recognition requests all lead
    to another attempt.

    Args:
        timeout (int): The maximum amount of time in seconds to wait for a phrase to start. 0 (the default) means no limit.
        phrase_time_limit (int): The maximum amount of time in seconds for a single phrase of speech. 0 (the default) means no limit.
        language (str): The language code for the speech recognition. Default is "nl-NL". For English use "en-US".
        adjust_ambiance_seconds (int): The number of seconds to adjust to ambient noise. Default is 1; a falsy value skips the adjustment.
        save_to (str): The path to save the captured audio to as WAV data. Default is None (do not save).
        recognize (bool): Whether to recognize speech or not. Default is True.

    Returns:
        str: The recognized speech as a string or True if recognize is set to False.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        if adjust_ambiance_seconds:
            logger.info("Adjusting to surroundings for %s seconds.", adjust_ambiance_seconds)
            recognizer.adjust_for_ambient_noise(source, duration=adjust_ambiance_seconds)

        while True:
            logger.info("Listening...")
            try:
                # The library documents None as "no limit", so the 0
                # defaults are translated explicitly.
                audio_data = recognizer.listen(
                    source,
                    timeout=timeout or None,
                    phrase_time_limit=phrase_time_limit or None,
                )
            except sr.WaitTimeoutError:
                # Nobody started speaking in time; keep waiting.
                continue

            if save_to:
                with open(save_to, "wb") as f:
                    f.write(audio_data.get_wav_data())
                # Logged after the write so the message is only emitted
                # once the data is actually on disk.
                logger.debug("Saved to %s", save_to)

            if not recognize:
                logger.info("Recognition is disabled so returning True.")
                return True

            try:
                text = recognizer.recognize_google(audio_data, language=language)
            except sr.UnknownValueError:
                logger.debug("Speech was not understood; listening again.")
                continue
            except sr.RequestError as error:
                # Keep retrying as before, but leave a trace: a permanently
                # unreachable service would otherwise loop in silence.
                logger.warning("Speech recognition request failed, retrying: %s", error)
                continue

            logger.info("Returning %s", text)
            return text
async def listen_async(timeout: int=0, phrase_time_limit: int=0, language: str="nl-NL", adjust_ambiance_seconds: int=1, save_to=None, recognize=True):
    """
    Awaitable wrapper around ``listen``.

    The blocking ``listen`` call is run in the event loop's default executor
    so the loop stays responsive while waiting for speech. All arguments are
    forwarded unchanged.

    Returns:
        Whatever ``listen`` returns for the given arguments.
    """
    def listen_sync():
        return listen(
            timeout=timeout,
            phrase_time_limit=phrase_time_limit,
            language=language,
            adjust_ambiance_seconds=adjust_ambiance_seconds,
            save_to=save_to,
            recognize=recognize,
        )

    # Inside a coroutine there is always a running loop; get_running_loop()
    # is the documented way to obtain it.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, listen_sync)
async def main_async():
    """Print recognized phrases in an endless loop, pausing 3 seconds between them."""
    while True:
        print("Listening...")
        print(await listen_async())
        # asyncio.sleep yields to the event loop; time.sleep would block it
        # for the whole pause.
        await asyncio.sleep(3)
def main():
    """Console entry point: run the listening loop until interrupted."""
    listener = main_async()
    try:
        asyncio.run(listener)
    except KeyboardInterrupt:
        # Ctrl+C is how the endless loop is stopped; exit without a traceback.
        return


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

18
src/rspeech/tts.py Normal file
View File

@ -0,0 +1,18 @@
import asyncio
from rspeech.gcloud import tts
async def main_async():
    """Speak every line typed at the prompt until an empty line or end of input."""
    await tts("Type a few times return to stop.")
    while True:
        try:
            text = input("> ").strip()
        except EOFError:
            # stdin was closed (Ctrl+D or end of piped input): stop cleanly
            # instead of ending with a traceback.
            break
        if not text:
            break
        await tts(text)
def main():
    """Console entry point: run the interactive text-to-speech prompt."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        # Ctrl+C at the prompt is a normal way to quit; exit without a
        # traceback, like the speech-to-text entry point does.
        pass


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()