commit b6871a85c18245bc323ab766c4721661ea7fd95f Author: retoor Date: Sat Jan 18 11:56:40 2025 +0100 New speech api. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8cb2598 --- /dev/null +++ b/.gitignore @@ -0,0 +1,166 @@ +.vscode +.history +*.db* + +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..230dc72 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +install: + python3 -m venv .venv + ./.venv/bin/pip install -e . 
+ +tts: + ./.venv/bin/rtts +stt: + ./.venv/bin/rstt diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..07de284 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f4d3b6f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +pyaudio +SpeechRecognition +google-cloud-speech +google-cloud-texttospeech +google-auth +pygame diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..e3ba3be --- /dev/null +++ b/setup.cfg @@ -0,0 +1,32 @@ +[metadata] +name = rspeech +version = 1.0.0 +description = Library for speech processing +author = retoor +author_email = retoor@molodetz.nl +license = MIT +long_description = file: README.md +long_description_content_type = text/markdown + +[options] +packages = find: +package_dir = + = src +python_requires = >=3.7 +install_requires = + pyaudio + SpeechRecognition + google-cloud-speech + google-cloud-texttospeech + google-auth + pygame + aiohttp + packaging + +[options.packages.find] +where = src + +[options.entry_points] +console_scripts = + rtts = rspeech.tts:main + rstt = rspeech.stt:main diff --git a/src/rspeech/__init__.py b/src/rspeech/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rspeech/__main__.py b/src/rspeech/__main__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rspeech/gcloud.py b/src/rspeech/gcloud.py new file mode 100644 index 0000000..6f1b453 --- /dev/null +++ b/src/rspeech/gcloud.py @@ -0,0 +1,137 @@ +# Written by retoor@molodetz.nl + +# This script interfaces with Google's Text-to-Speech API to synthesize spoken audio from text. +# It also includes functionality to handle Google authentication tokens. + +# External imports: +# - aiohttp: Asynchronous HTTP requests. 
+# - google-auth packages: For managing Google authentication tokens. +# - rspeech.play: Local module for playing audio. + +# MIT License +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
import asyncio
import base64
import pathlib
import sys
import tempfile
import uuid
from functools import cache
from urllib.parse import urlencode

import aiohttp
import google.auth
import google.oauth2.credentials
from google.auth.transport import requests
from google.oauth2 import id_token

from rspeech.play import play_audio

# Chars to be ignored in speech
IGNORE_CHARS = ["*", "#", "`", "'", '"', "\\", "/", "---"]

# OAuth scope requested when loading the default credentials.
_CLOUD_PLATFORM_SCOPE = "https://www.googleapis.com/auth/cloud-platform"

# REST endpoint used by tts() to synthesize speech.
_TTS_URL = "https://texttospeech.googleapis.com/v1/text:synthesize"


@cache
def _google_credentials():
    """Load the Application Default Credentials once per process."""
    credentials, _project = google.auth.default(scopes=[_CLOUD_PLATFORM_SCOPE])
    return credentials


def google_token():
    """Return a currently valid OAuth2 access token for Google Cloud.

    The credentials object is cached for the lifetime of the process, but the
    access token itself is refreshed whenever google-auth reports it as
    missing or expired. Caching the token string forever would make
    long-running sessions fail as soon as the first token expires.

    Returns:
        str: The bearer token to put in an ``Authorization`` header.

    Raises:
        google.auth.exceptions.DefaultCredentialsError: If no default
            credentials can be found.
        google.auth.exceptions.RefreshError: If the token cannot be refreshed.
    """
    # Imported lazily, as before, so importing this module stays cheap.
    import google.auth.transport.urllib3 as google_auth_urllib3
    import urllib3

    credentials = _google_credentials()
    if not credentials.valid:
        credentials.refresh(google_auth_urllib3.Request(urllib3.PoolManager()))
    return credentials.token


def _strip_ignored(text, ignore_chars):
    """Remove every entry of *ignore_chars* from *text* and trim whitespace."""
    for char in ignore_chars:
        text = text.replace(char, "")
    return text.strip()


async def tts(
    text: str,
    google_project: str = "lisa-448004",
    language_code: str = "nl-NL",
    ssml_gender: str = "FEMALE",
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    name: str = "nl-NL-Standard-D",
    ignore_chars=None,
):
    """Synthesize *text* with Google Cloud Text-to-Speech and play it.

    Args:
        text: The text to speak. Entries of *ignore_chars* are removed first;
            if nothing is left, the function returns without calling the API.
        google_project: Project sent in the ``X-Goog-User-Project`` header.
        language_code: Value of ``voice.languageCode`` in the request.
        ssml_gender: Value of ``voice.ssmlGender`` in the request.
        speaking_rate: Value of ``audioConfig.speakingRate`` in the request.
        pitch: Value of ``audioConfig.pitch`` in the request.
        name: Value of ``voice.name`` in the request.
        ignore_chars: Substrings to strip from *text*; defaults to
            ``IGNORE_CHARS``.

    Returns:
        None.

    Raises:
        RuntimeError: If the response contains no ``audioContent`` (for
            example because the API returned an error object).
    """
    if ignore_chars is None:
        ignore_chars = IGNORE_CHARS

    # Remove markdown
    text = _strip_ignored(text, ignore_chars)
    if not text:
        return

    headers = {
        # NOTE: google_token() may perform a blocking HTTP refresh.
        "Authorization": f"Bearer {google_token()}",
        "Content-Type": "application/json",
        "X-Goog-User-Project": google_project,
    }
    payload = {
        "input": {"text": text},
        "voice": {
            "languageCode": language_code,
            "name": name,
            "ssmlGender": ssml_gender,
        },
        "audioConfig": {
            "audioEncoding": "MP3",
            "speakingRate": speaking_rate,
            "pitch": pitch,
        },
    }

    async with aiohttp.ClientSession() as session:
        # The context manager releases the connection even when parsing fails.
        async with session.post(_TTS_URL, headers=headers, json=payload) as response:
            status = response.status
            body = await response.json()

    audio_content = body.get("audioContent")
    if not audio_content:
        raise RuntimeError(
            f"Text-to-Speech request failed (HTTP {status}): "
            f"{body.get('error', body)}"
        )

    # Use the system temp directory so an unwritable working directory does
    # not break playback, and always remove the file again.
    audio_path = pathlib.Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
    try:
        audio_path.write_bytes(base64.b64decode(audio_content))
        # NOTE: play_audio blocks until playback has finished.
        play_audio(audio_path)
    finally:
        audio_path.unlink(missing_ok=True)
    return


def oud(file_path=None):
    """Legacy speech-to-text sample ("oud" is Dutch for "old").

    Sends the file at *file_path* to the Google Cloud Speech client as
    16 kHz LINEAR16 audio with language ``en-US`` and prints the first
    alternative of every result.

    Args:
        file_path: Path of the audio file to transcribe. The original
            version read an undefined global of this name.

    Raises:
        ValueError: If *file_path* is not given.
    """
    if file_path is None:
        raise ValueError("file_path is required")

    # The original referenced `speech` without importing it anywhere.
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(file_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        print("Transcript:", result.alternatives[0].transcript)


async def main_async():
    """Smoke test: print the access token and speak a test sentence."""
    # NOTE(review): this writes a live bearer token to stdout; kept because
    # it is the existing diagnostic output. Confirm it is still wanted.
    print(google_token())
    await tts("If you hear this sentence, the google part works fine. Congrats.")


def main():
    """Synchronous entry point that runs main_async()."""
    asyncio.run(main_async())


if __name__ == '__main__':
    main()
+ +# Libraries imported: 'pyaudio', 'functools', 'os', 'subprocess', 'sys', 'pygame' + +# The MIT License (MIT) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
import functools
import os
import subprocess
import sys

import pyaudio
import pygame


def play_audio(filename):
    """Play an audio file with pygame's mixer and block until it is done.

    Args:
        filename: Whatever ``pygame.mixer.music.load`` accepts; it is passed
            through unchanged.
    """
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    try:
        pygame.mixer.music.play()
        # One clock for the whole wait; tick(10) polls about ten times a second.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    finally:
        # Release the file so the caller can delete it afterwards. `unload`
        # only exists in newer pygame versions, hence the lookup.
        pygame.mixer.music.stop()
        unload = getattr(pygame.mixer.music, "unload", None)
        if unload is not None:
            unload()


def play_audio2(filename, sample_rate=44100, channels=2, chunk_size=4096):
    """Decode *filename* with ffmpeg and play the raw PCM through PyAudio.

    ffmpeg converts the input to signed 16-bit little-endian PCM on its
    stdout, which is read in chunks and written to a PyAudio output stream.

    Args:
        filename: Input file handed to ``ffmpeg -i``.
        sample_rate: Output sample rate, used for both ffmpeg and the stream.
        channels: Number of output channels, used for both ffmpeg and the
            stream.
        chunk_size: Number of bytes read from ffmpeg per write.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be started.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_cmd = [
        "ffmpeg",
        "-nostdin",  # never let ffmpeg consume the parent's stdin
        "-loglevel", "error",
        "-i", filename,
        "-f", "s16le",
        "-ar", str(sample_rate),
        "-ac", str(channels),
        "pipe:1",
    ]

    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(
            format=audio.get_format_from_width(2),  # 2 bytes == s16le
            channels=channels,
            rate=sample_rate,
            output=True,
        )
        try:
            # stderr is inherited rather than piped: a piped stderr that is
            # never read can fill up and stall ffmpeg. The context manager
            # closes stdout and waits for the process on every exit path.
            with subprocess.Popen(
                ffmpeg_cmd,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                bufsize=10**6,
            ) as process:
                while chunk := process.stdout.read(chunk_size):
                    stream.write(chunk)
            if process.returncode:
                raise subprocess.CalledProcessError(process.returncode, ffmpeg_cmd)
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Microphone speech-to-text helpers built on the SpeechRecognition package."""

import asyncio
import functools
import logging
import time

import speech_recognition as sr

from rspeech import gcloud

logger = logging.getLogger(__name__)


def listen(
    timeout: int = 0,
    phrase_time_limit: int = 0,
    language: str = "nl-NL",
    adjust_ambiance_seconds: int = 1,
    save_to=None,
    recognize=True,
):
    """Listen on the microphone until a phrase has been recognized.

    Recognition uses ``Recognizer.recognize_google``. The function keeps
    listening after a wait timeout, unintelligible audio or a failed
    recognition request, so it only returns once a phrase was handled.

    Args:
        timeout (int): Maximum number of seconds to wait for speech to start.
            0 means no limit.
        phrase_time_limit (int): Maximum number of seconds for a single
            phrase. 0 means no limit.
        language (str): Language code for recognition. Default is "nl-NL".
            For English use "en-US".
        adjust_ambiance_seconds (int): Seconds spent calibrating for ambient
            noise before listening. Default is 1; a falsy value skips it.
        save_to (str): Path to write the captured audio to as WAV data.
            Default is None (do not save).
        recognize (bool): Whether to recognize speech. Default is True.

    Returns:
        str: The recognized text, or True if *recognize* is False.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        if adjust_ambiance_seconds:
            logger.info(
                "Adjusting to surroundings for %s seconds.", adjust_ambiance_seconds
            )
            recognizer.adjust_for_ambient_noise(source, duration=adjust_ambiance_seconds)

        while True:
            logger.info("Listening...")
            try:
                # 0 is this function's "no limit" value; pass None explicitly.
                audio_data = recognizer.listen(
                    source,
                    timeout=timeout or None,
                    phrase_time_limit=phrase_time_limit or None,
                )
            except sr.WaitTimeoutError:
                logger.debug("No speech started within %s seconds; retrying.", timeout)
                continue

            if save_to:
                with open(save_to, "wb") as wav_file:
                    wav_file.write(audio_data.get_wav_data())
                logger.debug("Saved to %s", save_to)

            if not recognize:
                logger.info("Recognition is disabled so returning True.")
                return True

            try:
                text = recognizer.recognize_google(audio_data, language=language)
            except sr.UnknownValueError:
                logger.debug("Speech was not understood; listening again.")
                continue
            except sr.RequestError as exc:
                # Still best-effort, but no longer silent.
                logger.warning("Speech recognition request failed: %s", exc)
                continue

            logger.info("Returning %s", text)
            return text


async def listen_async(
    timeout: int = 0,
    phrase_time_limit: int = 0,
    language: str = "nl-NL",
    adjust_ambiance_seconds: int = 1,
    save_to=None,
    recognize=True,
):
    """Run listen() in the default executor so the event loop stays free.

    Takes the same arguments as listen() and returns its result.
    """
    blocking_call = functools.partial(
        listen,
        timeout=timeout,
        phrase_time_limit=phrase_time_limit,
        language=language,
        adjust_ambiance_seconds=adjust_ambiance_seconds,
        save_to=save_to,
        recognize=recognize,
    )
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, blocking_call)


async def main_async():
    """Print recognized phrases forever, pausing three seconds in between."""
    while True:
        print("Listening...")
        print(await listen_async())
        # asyncio.sleep instead of time.sleep: do not block the event loop.
        await asyncio.sleep(3)


def main():
    """Console entry point; Ctrl-C ends the loop quietly."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    main()
"""Interactive console front end that speaks every line the user types."""

import asyncio

from rspeech.gcloud import tts


async def main_async():
    """Speak an introduction, then speak each entered line until an empty one.

    The loop also ends when standard input is closed (EOF).
    """
    await tts("Type a few times return to stop.")
    while True:
        try:
            # input() blocks the event loop; nothing else is scheduled here.
            text = input("> ").strip()
        except EOFError:
            # Ctrl-D or a closed pipe ends the session like an empty line.
            break
        if not text:
            break
        await tts(text)


def main():
    """Console entry point; Ctrl-C ends the session quietly."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        pass


if __name__ == '__main__':
    main()