New speech api.

2025-01-18 11:56:40 +01:00 · 2025-01-18 11:56:40 +01:00 · b6871a85c1
commit b6871a85c1
11 changed files with 549 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,166 @@
 .vscode
 .history
 *.db*
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
--- a/8
+++ b/8
@ -0,0 +1,8 @@
 install:
 	python3 -m venv .venv 
 	./.venv/bin/pip install -e .
 tts:
 	./.venv/bin/rtts 
 stt:
 	./.venv/bin/rstt
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,3 @@
 [build-system]
 requires = ["setuptools", "wheel"]
 build-backend = "setuptools.build_meta"
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,6 @@
 pyaudio
 SpeechRecognition
 google-cloud-speech
 google-cloud-texttospeech
 google-auth
 pygame
--- a/setup.cfg
+++ b/setup.cfg
@ -0,0 +1,32 @@
 [metadata]
 name = rspeech 
 version = 1.0.0
 description = Library for speech processing
 author = retoor
 author_email = retoor@molodetz.nl
 license = MIT
 long_description = file: README.md
 long_description_content_type = text/markdown
 [options]
 packages = find:
 package_dir =
    = src
 python_requires = >=3.7
 install_requires =
    pyaudio
    SpeechRecognition
    google-cloud-speech
    google-cloud-texttospeech
    google-auth
    pygame
    aiohttp
    packaging
 [options.packages.find]
 where = src
 [options.entry_points]
 console_scripts =
    rtts = rspeech.tts:main 
    rstt = rspeech.stt:main
--- a/src/rspeech/init.py
+++ b/src/rspeech/init.py
--- a/src/rspeech/main.py
+++ b/src/rspeech/main.py
--- a/src/rspeech/gcloud.py
+++ b/src/rspeech/gcloud.py
@ -0,0 +1,137 @@
 # Written by retoor@molodetz.nl
 # This script interfaces with Google's Text-to-Speech API to synthesize spoken audio from text. 
 # It also includes functionality to handle Google authentication tokens.
 # External imports:
 # - aiohttp: Asynchronous HTTP requests.
 # - google-auth packages: For managing Google authentication tokens.
 # - rspeech.play: Local module used to play the synthesized audio file.
 # MIT License
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 import aiohttp
 import asyncio
 from urllib.parse import urlencode
 import base64
 import sys
 from functools import cache
 from google.oauth2 import id_token
 from google.auth.transport import requests
 import google.auth
 from rspeech.play import play_audio
 import google.oauth2.credentials
 import uuid
 import pathlib
 from rspeech.play import play_audio
 # Substrings to be ignored in speech: tts() removes every entry from the text
 # before sending it for synthesis (mostly Markdown markup characters).
 IGNORE_CHARS = ["*", "#", "`","'",'"',"\\","/","---"]
@cache
 def google_token():
    gcloud_default, project = google.auth.default()
    from google.oauth2 import _client as google_auth_client
    import google.auth.transport.urllib3 as google_auth_urllib3
    import urllib3
    http = urllib3.PoolManager()
    request = google_auth_urllib3.Request(http)
    token_uri = 'https://oauth2.googleapis.com/token'
    refresh_token = gcloud_default.refresh_token
    client_id = gcloud_default.client_id
    client_secret = gcloud_default.client_secret
    scopes = ['https://www.googleapis.com/auth/cloud-platform']
    access_token, _, _, _ = google_auth_client.refresh_grant(
        request, token_uri, refresh_token, client_id, client_secret, scopes)
    return access_token
 async def tts(
    text: str,
    google_project: str = "lisa-448004",
    language_code: str = "nl-NL",
    ssml_gender: str = "FEMALE",
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    name: str = "nl-NL-Standard-D",
    ignore_chars=None,
 ):
    """Synthesize ``text`` with Google Cloud Text-to-Speech and play the result.
    Every entry of ``ignore_chars`` is removed from the text first. If nothing
    but whitespace is left, the function returns without contacting the API.
    Args:
        text: The text to speak.
        google_project: Project id sent in the ``X-Goog-User-Project`` header.
        language_code: ``languageCode`` of the requested voice.
        ssml_gender: ``ssmlGender`` of the requested voice.
        speaking_rate: ``speakingRate`` of the audio configuration.
        pitch: ``pitch`` of the audio configuration.
        name: ``name`` of the requested voice.
        ignore_chars: Substrings to strip from ``text``; defaults to IGNORE_CHARS.
    Returns:
        None.
    Raises:
        RuntimeError: If the API answers with a non-200 status or the response
            contains no ``audioContent``.
    """
    if ignore_chars is None:
        ignore_chars = IGNORE_CHARS
    # Remove markdown
    for char in ignore_chars:
        text = text.replace(char, "")
    text = text.strip()
    if not text:
        return
    # Only needed once there is audio to store, hence imported after the guard.
    import tempfile
    url = "https://texttospeech.googleapis.com/v1/text:synthesize"
    headers = {
        "Authorization": f"Bearer {google_token()}",
        "Content-Type": "application/json",
        "X-Goog-User-Project": google_project,
    }
    data = {
        "input": {"text": text},
        "voice": {
            "languageCode": language_code,
            "name": name,
            "ssmlGender": ssml_gender,
        },
        "audioConfig": {
            "audioEncoding": "MP3",
            "speakingRate": speaking_rate,
            "pitch": pitch,
        },
    }
    async with aiohttp.ClientSession() as session:
        # The context manager releases the connection even when we raise below.
        async with session.post(url, headers=headers, json=data) as response:
            if response.status != 200:
                detail = await response.text()
                raise RuntimeError(
                    f"Text-to-Speech request failed with HTTP {response.status}: {detail}"
                )
            response_json = await response.json()
    audio_content = response_json.get("audioContent")
    if not audio_content:
        raise RuntimeError("Text-to-Speech response did not contain any audioContent.")
    # Store the MP3 in the temp directory instead of the current working
    # directory, which may be read-only or get cluttered after a crash.
    file = pathlib.Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
    try:
        file.write_bytes(base64.b64decode(audio_content))
        play_audio(file)
    finally:
        # Always remove the temporary file, also when playback fails.
        file.unlink(missing_ok=True)
 def oud(file_path=None, language_code="en-US", sample_rate_hertz=16000):
    """Transcribe an audio file with Google Cloud Speech-to-Text and print it.
    Not called anywhere in this module; the name suggests it is an older
    code path kept for reference (assumption - confirm before removing).
    Args:
        file_path: Path of the audio file to transcribe. The request declares
            LINEAR16 encoding, so the file is expected to match that.
        language_code: Language code passed to the recognition config.
        sample_rate_hertz: Sample rate passed to the recognition config.
    Raises:
        ValueError: If ``file_path`` is not given.
    """
    if file_path is None:
        raise ValueError("oud() needs the path of the audio file to transcribe.")
    # Imported here because the module never imported `speech` at file level,
    # which made every call fail with a NameError.
    from google.cloud import speech
    client = speech.SpeechClient()
    with open(file_path, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        # Print only the first alternative of each result.
        print("Transcript:", result.alternatives[0].transcript)
 async def main_async():
    """Self-test: obtain a Google access token and speak a confirmation sentence."""
    # Fetch the token first so authentication problems surface before the
    # synthesis request. The token is a bearer credential, so only report that
    # it was obtained instead of echoing it to stdout (and shell logs).
    token = google_token()
    print(f"Google access token acquired ({len(token)} characters).")
    await tts("If you hear this sentence, the google part works fine. Congrats.")
 def main():
    """Synchronous entry point: drive the async self-test to completion."""
    self_test = main_async()
    asyncio.run(self_test)
 # Run the self-test only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
--- a/src/rspeech/play.py
+++ b/src/rspeech/play.py
@ -0,0 +1,76 @@
 # Written by retoor@molodetz.nl
 # This module plays audio files, either through pygame's mixer or by decoding them with ffmpeg and streaming the raw PCM data to PyAudio.
 # Libraries imported: 'pyaudio', 'functools', 'os', 'subprocess', 'sys', 'pygame'
 # The MIT License (MIT)
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 import pyaudio
 import functools
 import os
 import subprocess
 import sys
 import pygame 
 def play_audio(filename):
    """Play ``filename`` through pygame's mixer and return once playback ends.
    Args:
        filename: The audio file handed to ``pygame.mixer.music.load``.
    """
    mixer = pygame.mixer
    mixer.init()
    music = mixer.music
    music.load(filename)
    music.play()
    # Poll until the mixer reports that the music stopped; tick(10) throttles
    # the polling loop so it does not spin at full speed.
    while music.get_busy():
        pygame.time.Clock().tick(10)
 def play_audio2(filename, sample_rate=44100, channels=2, chunk_size=4096):
    """Decode ``filename`` with ffmpeg and stream the raw PCM to PyAudio.
    ffmpeg converts the input to signed 16-bit little-endian samples on its
    standard output; the data is forwarded chunk by chunk to a PyAudio output
    stream until ffmpeg has no more data.
    Args:
        filename: Path of the audio file to play.
        sample_rate: Output sample rate in Hz, used for ffmpeg and PyAudio.
        channels: Number of output channels, used for ffmpeg and PyAudio.
        chunk_size: Number of bytes read from ffmpeg per write to the stream.
    """
    ffmpeg_cmd = [
        "ffmpeg",
        "-i", str(filename),
        "-f", "s16le",
        "-ar", str(sample_rate),
        "-ac", str(channels),
        "pipe:1",
    ]
    # stderr used to be a pipe that nobody read: once ffmpeg filled it with log
    # output it would block and playback would hang. Discard it instead.
    # stdin is detached so ffmpeg cannot consume the caller's terminal input.
    process = subprocess.Popen(
        ffmpeg_cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        bufsize=10**6,
    )
    try:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(
                # Width 2 bytes matches the s16le samples requested from ffmpeg.
                format=p.get_format_from_width(2),
                channels=channels,
                rate=sample_rate,
                output=True,
            )
            try:
                # An empty read means ffmpeg closed its output: playback is done.
                for data in iter(lambda: process.stdout.read(chunk_size), b""):
                    stream.write(data)
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()
    finally:
        # Runs even when PyAudio could not be opened, so ffmpeg is never left behind.
        process.stdout.close()
        process.wait()
--- a/src/rspeech/stt.py
+++ b/src/rspeech/stt.py
@ -0,0 +1,103 @@
 # Written by retoor@molodetz.nl
 # This script listens to audio input via a microphone, recognizes speech using the Google API, and prints the recognized text.
 # Imports:
 # - speech_recognition: For speech recognition functionality.
 # - rspeech.gcloud: Local module for Google Cloud Text-to-Speech (imported, but not used in this file).
 # - logging, asyncio, time: Standard library modules.
 # MIT License
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 # 
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
 # 
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 import speech_recognition as sr
 from rspeech import gcloud
 import logging 
 import asyncio 
 import time
 # Module-level logger named after this module; listen() reports its progress through it.
 logger = logging.getLogger(__name__)
 def listen(timeout: int=0, phrase_time_limit: int=0,language:str="nl-NL",adjust_ambiance_seconds:int=1,save_to=None,recognize=True):
    """
    Listen on the microphone until a phrase has been captured and, unless disabled, recognized with ``Recognizer.recognize_google``.
    The function keeps listening after a wait timeout, after unintelligible audio and after a failed recognition request.
    Args:
        timeout (int): Passed unchanged to ``Recognizer.listen`` as ``timeout``. Default is 0 (presumably treated as "no timeout" by the library - confirm).
        phrase_time_limit (int): Passed unchanged to ``Recognizer.listen`` as ``phrase_time_limit``. Default is 0 (presumably "no limit" - confirm).
        language (str): The language code for the speech recognition. Default is "nl-NL". For English use "en-US".
        adjust_ambiance_seconds (int): The number of seconds to adjust to ambient noise; a falsy value skips the adjustment. Default is 1.
        save_to (str): Path to write the captured audio to as WAV data. Default is None (nothing is saved).
        recognize (bool): Whether to recognize speech or not. Default is True.
    Returns:
        str: The recognized speech as a string or True if recognize is set to False.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        if adjust_ambiance_seconds:
            logger.info("Adjusting to surroundings for %s seconds.", adjust_ambiance_seconds)
            recognizer.adjust_for_ambient_noise(source, duration=adjust_ambiance_seconds)
        while True:
            logger.info("Listening...")
            try:
                audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
            except sr.WaitTimeoutError:
                # Nobody spoke within the timeout: simply listen again.
                continue
            if save_to:
                with open(save_to, "wb") as f:
                    f.write(audio_data.get_wav_data())
                # Logged after the write so the message is only emitted on success.
                logger.debug("Saved to %s", save_to)
            if not recognize:
                logger.info("Recognition is disabled so returning True.")
                return True
            try:
                text = recognizer.recognize_google(audio_data, language=language)
            except sr.UnknownValueError:
                # The audio could not be understood: listen again.
                logger.debug("Speech was not understood, listening again.")
                continue
            except sr.RequestError as error:
                # Still retried (best effort), but no longer swallowed silently:
                # a permanent failure such as no network is now visible in the log.
                logger.warning("Speech recognition request failed, listening again: %s", error)
                continue
            logger.info("Returning %s", text)
            return text
 async def listen_async(timeout: int=0, phrase_time_limit: int=0,language:str="nl-NL",adjust_ambiance_seconds:int=1,save_to=None,recognize=True):
    """
    Awaitable variant of listen(): runs the blocking call in the event loop's default executor.
    All arguments are forwarded unchanged to listen().
    Returns:
        Whatever listen() returns for the given arguments.
    """
    def listen_sync():
        return listen(timeout=timeout, phrase_time_limit=phrase_time_limit,language=language,adjust_ambiance_seconds=adjust_ambiance_seconds,save_to=save_to,recognize=recognize)
    # Inside a coroutine the running loop is the one to use; get_running_loop()
    # is the documented preference over get_event_loop() here.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, listen_sync)
 async def main_async():
    """Endless demo loop: listen, print the recognized text, pause three seconds."""
    while True:
        print("Listening...")
        print(await listen_async())
        # time.sleep() would block the event loop; asyncio.sleep() yields to it.
        await asyncio.sleep(3)
 def main():
    """Console entry point: run the listen loop until it is interrupted."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        # Ctrl+C ends the program quietly, without a traceback.
        return
 # Start the listen loop only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/src/rspeech/tts.py
+++ b/src/rspeech/tts.py
@ -0,0 +1,18 @@
 import asyncio 
 from rspeech.gcloud import tts 
 async def main_async():
    """Interactive loop: speak every line typed at the prompt.
    The loop ends on an empty line or when standard input is closed.
    """
    await tts("Type a few times return to stop.")
    while True:
        try:
            text = input("> ").strip()
        except EOFError:
            # Ctrl-D or exhausted piped input: stop instead of crashing.
            break
        if not text:
            break
        await tts(text)
 def main():
    """Console entry point: run the interactive text-to-speech prompt."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        # Same behaviour as rspeech.stt.main: Ctrl+C exits without a traceback.
        pass
 # Start the prompt only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()