New speech api.

2025-01-18 11:56:40 +01:00 · 2025-01-18 11:56:40 +01:00 · b6871a85c1
commit b6871a85c1
11 changed files with 549 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,166 @@
+.vscode
+.history
+*.db*
+
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
--- a/8
+++ b/8
@ -0,0 +1,8 @@
+# None of these targets produce a file with the same name, so declare them
+# phony; otherwise a stray file called "tts", "stt" or "install" would make
+# `make` consider the target up to date and skip its recipe.
+.PHONY: install tts stt
+
+# Create a local virtual environment and install the package in editable mode.
+install:
+	python3 -m venv .venv
+	./.venv/bin/pip install -e .
+
+# Run the text-to-speech console script from the virtual environment.
+tts:
+	./.venv/bin/rtts
+
+# Run the speech-to-text console script from the virtual environment.
+stt:
+	./.venv/bin/rstt
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,6 @@
+# Keep in sync with install_requires in setup.cfg.
+pyaudio
+SpeechRecognition
+google-cloud-speech
+google-cloud-texttospeech
+google-auth
+pygame
+aiohttp
+packaging
--- a/setup.cfg
+++ b/setup.cfg
@ -0,0 +1,32 @@
+[metadata]
+name = rspeech 
+version = 1.0.0
+description = Library for speech processing
+author = retoor
+author_email = retoor@molodetz.nl
+license = MIT
+long_description = file: README.md
+long_description_content_type = text/markdown
+
+[options]
+packages = find:
+package_dir =
+    = src
+# rspeech.gcloud imports functools.cache, which was added in Python 3.9.
+python_requires = >=3.9
+install_requires =
+    pyaudio
+    SpeechRecognition
+    google-cloud-speech
+    google-cloud-texttospeech
+    google-auth
+    pygame
+    aiohttp
+    packaging
+
+[options.packages.find]
+where = src
+
+[options.entry_points]
+console_scripts =
+    rtts = rspeech.tts:main 
+    rstt = rspeech.stt:main
--- a/src/rspeech/init.py
+++ b/src/rspeech/init.py
--- a/src/rspeech/main.py
+++ b/src/rspeech/main.py
--- a/src/rspeech/gcloud.py
+++ b/src/rspeech/gcloud.py
@ -0,0 +1,137 @@
+# Written by retoor@molodetz.nl
+
+# This script interfaces with Google's Text-to-Speech API to synthesize spoken audio from text. 
+# It also includes functionality to handle Google authentication tokens.
+
+# External imports:
+# - aiohttp: Asynchronous HTTP requests.
+# - google-auth packages: For managing Google authentication tokens.
+# - rspeech.play: Local module used to play the synthesized audio.
+
+# MIT License
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+
+import aiohttp
+import asyncio
+from urllib.parse import urlencode
+import base64
+import sys
+from functools import cache
+from google.oauth2 import id_token
+from google.auth.transport import requests
+import google.auth
+from rspeech.play import play_audio
+import google.oauth2.credentials
+import uuid
+import pathlib
+from rspeech.play import play_audio
+
+# Characters and character sequences (mostly Markdown markup) that tts()
+# removes from its input text before synthesis when the caller does not
+# pass its own ignore_chars.
+IGNORE_CHARS = ["*", "#", "`","'",'"',"\\","/","---"]
+
+@cache
+def google_token():
+    gcloud_default, project = google.auth.default()
+    from google.oauth2 import _client as google_auth_client
+    import google.auth.transport.urllib3 as google_auth_urllib3
+    import urllib3
+    http = urllib3.PoolManager()
+    request = google_auth_urllib3.Request(http)
+    token_uri = 'https://oauth2.googleapis.com/token'
+    refresh_token = gcloud_default.refresh_token
+    client_id = gcloud_default.client_id
+    client_secret = gcloud_default.client_secret
+
+    scopes = ['https://www.googleapis.com/auth/cloud-platform']
+
+    access_token, _, _, _ = google_auth_client.refresh_grant(
+        request, token_uri, refresh_token, client_id, client_secret, scopes)
+    return access_token
+
+
+async def tts(text:str ,google_project:str="lisa-448004", language_code:str="nl-NL",ssml_gender:str="FEMALE",speaking_rate:float=1.0,pitch:float=0.0,name:str="nl-NL-Standard-D",ignore_chars=None):
+    if ignore_chars is None:
+        ignore_chars = IGNORE_CHARS
+
+    url = "https://texttospeech.googleapis.com/v1/text:synthesize"
+
+    # Remove markdown
+    for char in ignore_chars:
+        text = text.replace(char, "")
+    text = text.strip()
+    if not text:
+        return
+
+    headers = {
+        "Authorization": f"Bearer {google_token()}",
+        "Content-Type": "application/json",
+        "X-Goog-User-Project": google_project
+    }
+    data = {
+        "input": {
+            "text": text
+        },
+        "voice": {
+            "languageCode": language_code,
+            "name": name,
+            "ssmlGender": ssml_gender
+        },
+        "audioConfig": {
+            "audioEncoding": "MP3",
+            "speakingRate": speaking_rate,
+            "pitch": pitch
+        }
+    }
+    async with aiohttp.ClientSession() as session:
+        response = await session.post(url, headers=headers, json=data)
+        response_json = await response.json()
+        audio_content = response_json.get("audioContent")
+        file = pathlib.Path(str(uuid.uuid4()) + ".mp3")
+        with file.open("wb") as audio_file:
+            audio_file.write(base64.b64decode(audio_content.encode('latin1')))
+        play_audio(file)
+        file.unlink()
+        return
+
+
+def oud():
+    """
+    Transcribe an audio file and print each transcript.
+
+    NOTE(review): this function cannot run as written. Neither `speech` nor
+    `file_path` is imported or defined anywhere in this module, so calling it
+    raises NameError. Presumably `speech` is meant to be `google.cloud.speech`
+    and `file_path` a parameter - confirm before using. ("oud" is Dutch for
+    "old"; this looks like leftover code - TODO confirm whether it can go.)
+    """
+    client = speech.SpeechClient()
+
+    # Read the whole audio file into memory.
+    with open(file_path, "rb") as audio_file:
+        content = audio_file.read()
+
+    audio = speech.RecognitionAudio(content=content)
+    # The request declares LINEAR16 audio at 16 kHz in US English.
+    config = speech.RecognitionConfig(
+        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=16000,
+        language_code="en-US",
+    )
+    response = client.recognize(config=config, audio=audio)
+    # Print only the first alternative of every result.
+    for result in response.results:
+        print("Transcript:", result.alternatives[0].transcript)
+
+
+async def main_async():
+    print(google_token())
+    await tts("If you hear this sentence, the google part works fine. Congrats.")
+
+def main():
+    asyncio.run(main_async())
+
+if __name__ == '__main__':
+    main()
--- a/src/rspeech/play.py
+++ b/src/rspeech/play.py
@ -0,0 +1,76 @@
+# Written by retoor@molodetz.nl
+
+# This source code plays audio files, either through pygame's mixer or by decoding them with ffmpeg and streaming the raw PCM data to PyAudio.
+
+# Libraries imported: 'pyaudio', 'functools', 'os', 'subprocess', 'sys', 'pygame'
+
+# The MIT License (MIT)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import pyaudio
+import functools
+import os
+import subprocess
+import sys
+import pygame 
+
+
+
+
+
+def play_audio(filename):
+    pygame.mixer.init()
+    pygame.mixer.music.load(filename)
+    pygame.mixer.music.play()
+    while pygame.mixer.music.get_busy():
+        pygame.time.Clock().tick(10)
+
+
+def play_audio2(filename):
+    ffmpeg_cmd = [
+        "ffmpeg",
+        "-i", filename,
+        "-f", "s16le",
+        "-ar", "44100",
+        "-ac", "2",
+        "pipe:1"
+    ]
+    process = subprocess.Popen(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=10**6)
+
+    p = pyaudio.PyAudio()
+    stream = p.open(
+        format=p.get_format_from_width(2),
+        channels=2,
+        rate=44100,
+        output=True
+    )
+    chunk_size = 4096
+    try:
+        while True:
+            data = process.stdout.read(chunk_size)
+            if not data:
+                break
+            stream.write(data)
+    finally:
+        stream.stop_stream()
+        stream.close()
+        p.terminate()
+        process.stdout.close()
+        process.wait()
--- a/src/rspeech/stt.py
+++ b/src/rspeech/stt.py
@ -0,0 +1,103 @@
+# Written by retoor@molodetz.nl
+
+# This script listens to audio input via a microphone and recognizes speech using the Google API. When run as a script it prints every recognized phrase.
+
+# Imports:
+# - speech_recognition: For speech recognition functionality.
+# - rspeech.gcloud: Local module for Google Cloud text-to-speech (imported here, but not used by the code in this file).
+# - logging, asyncio, time: Standard-library modules.
+
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import speech_recognition as sr
+from rspeech import gcloud
+import logging 
+import asyncio 
+import time
+logger = logging.getLogger(__name__)
+
+
+def listen(timeout: int=0, phrase_time_limit: int=0,language:str="nl-NL",adjust_ambiance_seconds:int=1,save_to=None,recognize=True):
+    """
+    Function for listening to audio input via a microphone and recognizing speech using the Google API. For this function there are no credentials or gcloud account required.
+
+    Args:
+        timeout (int): The maximum amount of time in seconds to listen for audio input.
+        phrase_time_limit (int): The maximum amount of time in seconds for a single phrase of speech.
+        language (str): The language code for the speech recognition. Default is "nl-NL". For English use "en-US".
+        adjust_ambiance_seconds (int): The number of seconds to adjust to ambient noise. Default is 5.
+        save_to (str): The path to save the audio data to. Default is None.
+        recognize (bool): Whether to recognize speech or not. Default is True.
+    Returns:
+        str: The recognized speech as a string or True if recognize is set to False.
+    """
+
+    recognizer = sr.Recognizer()
+    with sr.Microphone() as source:
+
+        if adjust_ambiance_seconds:
+            logger.info("Adjusting to surroundings for {adjust_ambiance_seconds} seconds.")
+            recognizer.adjust_for_ambient_noise(source, duration=adjust_ambiance_seconds)
+        while True:
+            logger.info("Listening...")
+            try:
+                audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+                if save_to:
+                    with open(save_to, "wb") as f:
+                        logger.debug(f"Saved to {save_to}")
+                        f.write(audio_data.get_wav_data())
+
+                if not recognize:
+                    logger.info(f"Recognition is disabled so returning True.")
+                    return True 
+                text = recognizer.recognize_google(audio_data, language=language)
+                source = None 
+                recognizer = None 
+                logger.info(f"Returning {text}")
+                return text
+            except sr.WaitTimeoutError:
+                continue
+            except sr.UnknownValueError:
+                continue
+            except sr.RequestError:
+                continue
+
+async def listen_async(timeout: int=0, phrase_time_limit: int=0,language:str="nl-NL",adjust_ambiance_seconds:int=1,save_to=None,recognize=True):
+    def listen_sync():
+        return listen(timeout=timeout, phrase_time_limit=phrase_time_limit,language=language,adjust_ambiance_seconds=adjust_ambiance_seconds,save_to=save_to,recognize=recognize)
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, listen_sync)
+
+
+async def main_async():
+    while True:
+        print("Listening...")
+        print(await listen_async())
+        time.sleep(3)
+
+def main():
+    try:
+        asyncio.run(main_async())
+    except KeyboardInterrupt:
+        pass
+
+if __name__ == "__main__":
+    main()
--- a/src/rspeech/tts.py
+++ b/src/rspeech/tts.py
@ -0,0 +1,18 @@
+import asyncio 
+from rspeech.gcloud import tts 
+
+
+
+async def main_async():
+    await tts("Type a few times return to stop.")
+    while True:
+        text = input("> ").strip()
+        if not text:
+            break 
+        await tts(text)
+
+def main():
+    asyncio.run(main_async())
+
+if __name__ == '__main__':
+    main()