From b8a517cc146d3ff580d2078c0687171705f10cc9 Mon Sep 17 00:00:00 2001 From: retoor Date: Sat, 18 Jan 2025 08:58:13 +0100 Subject: [PATCH] Initial commit, hopely the last. --- .gitignore | 11 +++++ README.md | 107 ++++++++++++++++++++++++++++++++++++++++ gcloud.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++ play.py | 66 +++++++++++++++++++++++++ requirements.txt | 5 ++ tts.py | 61 +++++++++++++++++++++++ ttsstt.html | 73 +++++++++++++++++++++++++++ 7 files changed, 448 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 gcloud.py create mode 100644 play.py create mode 100644 requirements.txt create mode 100644 tts.py create mode 100644 ttsstt.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6771451 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +__pycache__ +build +dist +*.egg-info +*.egg +*.pyc +*.pyo +venv +.venv +output.wav +.backup* diff --git a/README.md b/README.md new file mode 100644 index 0000000..a07acc7 --- /dev/null +++ b/README.md @@ -0,0 +1,107 @@ +# Research Regarding STT/TTS + +This repository is a mess! It's my personal notepad — a pure collection of snippets and experiments that cost me blood, sweat, and many tears. + +**Special thanks to:** Google. *You know what you did.* +**To OpenAI:** You're amazing! Quality stuff. Sadly, I'm not rich enough to run a 24/7 service with your pricing regarding STT/TTS, so I use only `gpt4o-mini`. + +The end result of this repository is a working **STT/TTS system** that allows you to talk with ChatGPT. + +To save money, I use TTS/STT from Google Cloud (paid). It's surprisingly cheap! + +Do not take the way I communicate with the LLM too seriously — that wasn’t the main focus. The implementation in this project has no context, memory, or system messages. Every call is treated as a new session. + +If you're interested in this technology but get stuck due to lack of documentation, feel free to email me at **retoor@molodetz.nl**. 
+ +--- + +## How to Play Immediately (Without Configuration) +You can get started in just 5 minutes: +1. Create a virtual environment. +2. Install the requirements file: `pip install -r requirements.txt`. +3. Execute `tts.py`. + +With these steps, you'll have a working `gpt4o-mini` model listening to you and responding in text. + +--- + +## Application Output (`tts.py`) + +The output is speech, but here’s how a typical conversation looks: + +``` +Adjusting for ambient noise, please wait... +Listening... +Recognized Text: what is the name of the dog of ga +Response from gpt4o_mini: Please provide more context or details about what "GA" refers to, so I can assist you accurately. +Recognized Text: Garfield the gas has a dog friends what is his name +Response from gpt4o_mini: Garfield's dog friend is named Odie. +Recognized Text: is FTP still used +Response from gpt4o_mini: Yes, FTP (File Transfer Protocol) is still used for transferring files over a network, although more secure alternatives like SFTP (Secure File Transfer Protocol) and FTPS (FTP Secure) are often preferred due to security concerns. +Recognized Text: why is Linux better than +Response from gpt4o_mini: Please complete your question for a more specific comparison about why Linux might be considered better than another operating system or software. +``` + +--- + +## Repository Structure + +The repository contains: +- **`play.py`**: For playing audio with Python. +- **`gcloud.py`**: A wrapper around the Google Cloud SDK (this was the most time-consuming to build). +- **`tts.py`**: Execute this script to talk with GPT. + +--- + +## Requirements and Preparation + +- **A paid Google Cloud account** + - Google Cloud CLI + - You get $300 and 90 days for free, but you'll need to attach a credit card. I used it extensively and didn't spend a cent! + - The free credit barely depletes even with heavy usage. 
+ +- **Google Cloud SDK + CLI** installed + *Important:* These standalone applications affect the behavior of Python's Google library regarding authentication. + +- **Python 3** and the following: + - `python3-venv` + - `python3-pip` + +> I initially installed a lot using `apt-get`, but I can’t recall if it was all necessary in the end. + +--- + +## Installation Steps + +1. Activate the virtual environment: + ```bash + python3 -m venv venv && source venv/bin/activate + ``` +2. Install the requirements: + ```bash + pip install -r requirements.txt + ``` +## Testing the setup +1. Check Google Authentication & TTS + ```bash + python gcloud.py + ``` + - If successful, it will speak a sentence. + - If not, you'll likely encounter some authentication issues — brace yourself for Google-related configuration struggles. + +2. Check Speech Recognition (No API Needed) + ```bash + python tts.py + ``` + - This sends your text to the gpt4o-mini model and prints the response. + - Requires no configuration and works out of the box. + +## Conclusion +Play stupid games, win stupid prizes. Figuring this out was a nightmare. If OpenAI's services were financially viable, I would have chosen them — better quality and much easier to implement. + +Now, I have a fully operational project that communicates perfectly and even follows conversations. For example, I can: + - Assign numbers. + - Perform calculations (e.g., divide "the first number by the second"). + - Use the microphone full-time to ask or say anything I want. I have a wireless JBL GO speaker that's directly ready for the job when I turn it on. + +I hope some people appreciate the snippets! diff --git a/gcloud.py b/gcloud.py new file mode 100644 index 0000000..8390181 --- /dev/null +++ b/gcloud.py @@ -0,0 +1,125 @@ +# Written by retoor@molodetz.nl + +# This script interfaces with Google's Text-to-Speech API to synthesize spoken audio from text. +# It also includes functionality to handle Google authentication tokens. 
+ +# External imports: +# - aiohttp: Asynchronous HTTP requests. +# - google-auth packages: For managing Google authentication tokens. +# - urllib3: HTTP transport used for the token refresh request. +# - play: Local module for playing audio. + +# MIT License +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
import aiohttp
import asyncio
from urllib.parse import urlencode
import base64
import sys
from functools import cache
from google.oauth2 import id_token
from google.auth.transport import requests
import google.auth
from play import play_audio
import google.oauth2.credentials
import uuid
import pathlib


# Mint a new access token this long before the reported expiry, so a request
# is never sent with a token that lapses while it is in flight.
_TOKEN_REFRESH_MARGIN_SECONDS = 60

# Most recently minted access token and the expiry reported with it
# (expiry stays None when the token endpoint reported none).
_token_cache = {"access_token": None, "expiry": None}

# Markdown markers removed from the text before it is synthesized.
_MARKDOWN_CHARS = str.maketrans("", "", "*#`")


def _cached_token_is_usable():
    """Return True while the cached access token may still be sent."""
    import datetime

    if _token_cache["access_token"] is None:
        return False
    expiry = _token_cache["expiry"]
    if expiry is None:
        # No expiry was reported: keep the token for the process lifetime,
        # which is what the previous unconditional cache did.
        return True
    if expiry.tzinfo is None:
        # assumes a naive expiry is expressed in UTC -- TODO confirm
        expiry = expiry.replace(tzinfo=datetime.timezone.utc)
    margin = datetime.timedelta(seconds=_TOKEN_REFRESH_MARGIN_SECONDS)
    return datetime.datetime.now(datetime.timezone.utc) + margin < expiry


def google_token():
    """Return an OAuth2 access token for the Google Cloud APIs.

    The token is minted by exchanging the refresh token of the application
    default credentials at Google's token endpoint, requesting the
    ``cloud-platform`` scope.  It is cached and only re-minted once the
    expiry reported by the endpoint is about to pass; caching it forever
    would make a long-running process send an expired token.

    The default credentials must expose ``refresh_token``, ``client_id`` and
    ``client_secret``.
    NOTE(review): presumably that means user credentials created through the
    gcloud CLI rather than a service account -- confirm.

    Returns:
        The access token string.
    """
    if _cached_token_is_usable():
        return _token_cache["access_token"]

    from google.oauth2 import _client as google_auth_client
    import google.auth.transport.urllib3 as google_auth_urllib3
    import urllib3

    gcloud_default, _project = google.auth.default()
    request = google_auth_urllib3.Request(urllib3.PoolManager())
    access_token, _, expiry, _ = google_auth_client.refresh_grant(
        request,
        'https://oauth2.googleapis.com/token',
        gcloud_default.refresh_token,
        gcloud_default.client_id,
        gcloud_default.client_secret,
        ['https://www.googleapis.com/auth/cloud-platform'],
    )
    _token_cache["access_token"] = access_token
    _token_cache["expiry"] = expiry
    return access_token


async def tts(text, *, language_code="nl-NL", voice_name="nl-NL-Standard-D",
              quota_project="lisa-448004"):
    """Synthesize *text* with Google Text-to-Speech and play the result.

    ``*``, ``#`` and backtick characters are removed first so markdown
    markers are not read aloud.  Nothing is requested or played when no text
    remains.

    Args:
        text: The text to speak.
        language_code: Language code sent in the ``voice`` section.
        voice_name: Voice name sent in the ``voice`` section.
        quota_project: Project id sent as ``X-Goog-User-Project``.

    Raises:
        RuntimeError: If the response carries no ``audioContent``, e.g.
            because the API answered with an error.
    """
    url = "https://texttospeech.googleapis.com/v1/text:synthesize"
    text = text.translate(_MARKDOWN_CHARS).strip()
    if not text:
        return

    headers = {
        "Authorization": f"Bearer {google_token()}",
        "Content-Type": "application/json",
        "X-Goog-User-Project": quota_project,
    }
    data = {
        "input": {
            "text": text
        },
        "voice": {
            "languageCode": language_code,
            "name": voice_name,
            "ssmlGender": "FEMALE"
        },
        "audioConfig": {
            "audioEncoding": "MP3",
            "speakingRate": 1.0,
            "pitch": 0.0
        }
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=data) as response:
            status = response.status
            response_json = await response.json()

    audio_content = response_json.get("audioContent")
    if not audio_content:
        # Fail with the API's own message instead of an AttributeError on
        # None further down.
        detail = response_json.get("error", response_json)
        raise RuntimeError(
            f"Text-to-Speech request failed (HTTP {status}): {detail}"
        )

    file = pathlib.Path(str(uuid.uuid4()) + ".mp3")
    try:
        file.write_bytes(base64.b64decode(audio_content))
        # play_audio blocks until playback has finished.
        play_audio(file)
    finally:
        # Remove the temporary file even when decoding or playback fails.
        file.unlink(missing_ok=True)
    return
def oud(file_path="output.wav"):
    """Transcribe a LINEAR16 audio file with Google Cloud Speech-to-Text.

    Sends the whole file in one ``recognize`` request (16 kHz, ``en-US``)
    and prints the first alternative of every result.

    The function previously referenced the undefined names ``speech`` and
    ``file_path`` and therefore always raised ``NameError``.

    Args:
        file_path: Path of the audio file to transcribe.
            NOTE(review): the default is a guess based on the ``output.wav``
            entry in the project's .gitignore -- confirm.
    """
    # Imported here because nothing else in this module uses the Speech client.
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(file_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        print("Transcript:", result.alternatives[0].transcript)


async def main():
    """Smoke test: print the access token, then speak one test sentence."""
    # NOTE(review): this writes a live bearer token to stdout.
    print(google_token())
    await tts("If you hear this sentence, the google part works fine. Congrats.")


if __name__ == '__main__':
    asyncio.run(main())

# ---------------------------------------------------------------------------
# End of gcloud.py.  Patch metadata for the next file, kept for reference:
# \ No newline at end of file
# diff --git a/play.py b/play.py
# new file mode 100644
# index 0000000..b89f4a5
# --- /dev/null
# +++ b/play.py
# @@ -0,0 +1,66 @@
# ---------------------------------------------------------------------------
# Written by retoor@molodetz.nl

# This module plays an audio file by decoding it with ffmpeg and streaming
# the raw PCM output to a PyAudio output stream.

# Libraries imported: 'pyaudio', 'functools', 'os', 'subprocess', 'sys'

# The MIT License (MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import pyaudio
import functools
import os
import subprocess
import sys


@functools.cache
def get_py_audio():
    """Return the shared PyAudio instance, creating it on the first call."""
    return pyaudio.PyAudio()


def play_audio(filename, *, sample_rate=44100, channels=2, chunk_size=4096):
    """Play an audio file and block until playback has finished.

    ffmpeg decodes *filename* to raw signed 16-bit PCM on its stdout; the
    bytes are written chunk by chunk to a PyAudio output stream.

    Args:
        filename: Path (``str`` or ``os.PathLike``) of the file to play.
        sample_rate: Rate passed to ffmpeg (``-ar``) and to the output stream.
        channels: Channel count passed to ffmpeg (``-ac``) and to the stream.
        chunk_size: Number of bytes read from ffmpeg per write.
    """
    ffmpeg_cmd = [
        "ffmpeg",
        "-i", os.fspath(filename),
        "-f", "s16le",
        "-ar", str(sample_rate),
        "-ac", str(channels),
        "pipe:1",
    ]
    process = subprocess.Popen(
        ffmpeg_cmd,
        # Do not let ffmpeg read from the terminal this program runs in.
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        # stderr was piped but never read; a full pipe would stall ffmpeg.
        stderr=subprocess.DEVNULL,
        bufsize=10**6,
    )

    stream = None
    try:
        py_audio = get_py_audio()
        stream = py_audio.open(
            # A width of 2 bytes matches the s16le samples requested above.
            format=py_audio.get_format_from_width(2),
            channels=channels,
            rate=sample_rate,
            output=True,
        )
        while data := process.stdout.read(chunk_size):
            stream.write(data)
    except BaseException:
        # Playback failed or was interrupted: stop ffmpeg instead of waiting
        # for it to finish decoding.
        process.kill()
        raise
    finally:
        if stream is not None:
            stream.stop_stream()
            stream.close()
        process.stdout.close()
        process.wait()

# ---------------------------------------------------------------------------
# End of play.py.  Patch metadata and contents of requirements.txt, kept for
# reference:
# \ No newline at end of file
# diff --git a/requirements.txt b/requirements.txt
# new file mode 100644
# index 0000000..5967d96
# --- /dev/null
# +++ b/requirements.txt
# @@ -0,0 +1,5 @@
#   pyaudio
#   SpeechRecognition
#   google-cloud-speech
#   google-cloud-texttospeech
#   google-auth
# NOTE(review): gcloud.py imports aiohttp, which is not listed above.
#
# Patch metadata for the next file:
# diff --git a/tts.py b/tts.py
# new file mode 100644
# index 0000000..edb4175
# --- /dev/null
# +++ b/tts.py
# @@ -0,0 +1,61 @@
# ---------------------------------------------------------------------------
# Written by retoor@molodetz.nl

# This script listens to audio input via a microphone, recognizes speech using the Google API, sends the recognized text to a server for processing, and uses Google Cloud to convert the server response to speech.

# Imports:
# - speech_recognition: For speech recognition functionality.
# - xmlrpc.client: To communicate with a remote server using the XML-RPC protocol.
# - gcloud: Local module (gcloud.py) whose tts() speaks the server response.
+ +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import speech_recognition as sr
from xmlrpc.client import Fault, ProtocolError, ServerProxy
import gcloud

molodetz = ServerProxy("https://api.molodetz.nl/rpc")


async def main():
    """Run the voice loop: listen, recognize, ask the model, speak the answer.

    After a one-second ambient-noise calibration, the loop repeats forever:
    it records a phrase from the microphone, transcribes it with
    ``recognize_google`` (``en-US``), sends the text to the ``gpt4o_mini``
    XML-RPC method and passes the reply to ``gcloud.tts``.

    Listen timeouts and unintelligible audio restart the loop silently.
    Recognition-service and XML-RPC failures are printed and the loop
    continues.
    """
    recognizer = sr.Recognizer()

    with sr.Microphone() as source:
        print("Adjusting for ambient noise, please wait...")
        recognizer.adjust_for_ambient_noise(source, duration=1)
        print("Listening...")

        while True:
            try:
                audio_data = recognizer.listen(source, timeout=10)
                text = recognizer.recognize_google(audio_data, language="en-US")
            except (sr.WaitTimeoutError, sr.UnknownValueError):
                # Nobody spoke, or the audio was not intelligible.
                continue
            except sr.RequestError as error:
                # Report the failure instead of going quiet.
                print(f"Speech recognition request failed: {error}")
                continue

            print(f"Recognized Text: {text}")
            try:
                response_llm = molodetz.gpt4o_mini(text)
            except (Fault, ProtocolError, OSError) as error:
                # One failed RPC call should not end the conversation.
                print(f"Request to gpt4o_mini failed: {error}")
                continue

            print(f"Response from gpt4o_mini: {response_llm}")
            await gcloud.tts(response_llm)


if __name__ == "__main__":
    import asyncio

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl-C is the only way out of the loop; exit without a traceback.
        print("Stopped.")

# ---------------------------------------------------------------------------
# End of tts.py.  Patch metadata for the next file (its content is empty in
# this copy of the patch), kept for reference:
# \ No newline at end of file
# diff --git a/ttsstt.html b/ttsstt.html
# new file mode 100644
# index 0000000..bbf991f
# --- /dev/null
# +++ b/ttsstt.html
# @@ -0,0 +1,73 @@
# ---------------------------------------------------------------------------
+ + +