New speech api.
This commit is contained in:
commit
b6871a85c1
166
.gitignore
vendored
Normal file
166
.gitignore
vendored
Normal file
@ -0,0 +1,166 @@
|
||||
.vscode
|
||||
.history
|
||||
*.db*
|
||||
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
8
Makefile
Normal file
8
Makefile
Normal file
@ -0,0 +1,8 @@
|
||||
# These targets are commands, not files; declaring them phony keeps them
# runnable even if a file or directory with the same name exists.
.PHONY: install tts stt

# Create the virtual environment and install the package in editable mode.
install:
	python3 -m venv .venv
	./.venv/bin/pip install -e .

# Run the text-to-speech console script from the virtual environment.
tts:
	./.venv/bin/rtts

# Run the speech-to-text console script from the virtual environment.
stt:
	./.venv/bin/rstt
|
3
pyproject.toml
Normal file
3
pyproject.toml
Normal file
@ -0,0 +1,3 @@
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@ -0,0 +1,6 @@
|
||||
pyaudio
|
||||
SpeechRecognition
|
||||
google-cloud-speech
|
||||
google-cloud-texttospeech
|
||||
google-auth
|
||||
pygame
|
32
setup.cfg
Normal file
32
setup.cfg
Normal file
@ -0,0 +1,32 @@
|
||||
[metadata]
|
||||
name = rspeech
|
||||
version = 1.0.0
|
||||
description = Library for speech processing
|
||||
author = retoor
|
||||
author_email = retoor@molodetz.nl
|
||||
license = MIT
|
||||
long_description = file: README.md
|
||||
long_description_content_type = text/markdown
|
||||
|
||||
[options]
|
||||
packages = find:
|
||||
package_dir =
|
||||
= src
|
||||
python_requires = >=3.7
|
||||
install_requires =
|
||||
pyaudio
|
||||
SpeechRecognition
|
||||
google-cloud-speech
|
||||
google-cloud-texttospeech
|
||||
google-auth
|
||||
pygame
|
||||
aiohttp
|
||||
packaging
|
||||
|
||||
[options.packages.find]
|
||||
where = src
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
rtts = rspeech.tts:main
|
||||
rstt = rspeech.stt:main
|
0
src/rspeech/__init__.py
Normal file
0
src/rspeech/__init__.py
Normal file
0
src/rspeech/__main__.py
Normal file
0
src/rspeech/__main__.py
Normal file
137
src/rspeech/gcloud.py
Normal file
137
src/rspeech/gcloud.py
Normal file
@ -0,0 +1,137 @@
|
||||
# Written by retoor@molodetz.nl
|
||||
|
||||
# This script interfaces with Google's Text-to-Speech API to synthesize spoken audio from text.
|
||||
# It also includes functionality to handle Google authentication tokens.
|
||||
|
||||
# External imports:
|
||||
# - aiohttp: Asynchronous HTTP requests.
|
||||
# - google-auth packages: For managing Google authentication tokens.
|
||||
# - rspeech.play: Local module that plays the synthesized audio file.
|
||||
|
||||
# MIT License
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from urllib.parse import urlencode
|
||||
import base64
|
||||
import sys
|
||||
from functools import cache
|
||||
from google.oauth2 import id_token
|
||||
from google.auth.transport import requests
|
||||
import google.auth
|
||||
from rspeech.play import play_audio
|
||||
import google.oauth2.credentials
|
||||
import uuid
|
||||
import pathlib
|
||||
from rspeech.play import play_audio
|
||||
|
||||
# Markdown and punctuation substrings that are stripped from the text before it is spoken.
|
||||
IGNORE_CHARS = ["*", "#", "`","'",'"',"\\","/","---"]
|
||||
|
||||
# Refresh this many seconds before the reported expiry so a token is never
# handed out moments before it stops being accepted.
_TOKEN_REFRESH_MARGIN_SECONDS = 60

# Most recent access token fetched by google_token() and its expiry (if known).
_token_cache = {"token": None, "expiry": None}


def google_token():
    """Return an OAuth2 access token for the Google Cloud platform scope.

    The refresh token, client id and client secret of the application-default
    credentials are exchanged for an access token at Google's token endpoint.
    The token is cached in-process and reused until shortly before the expiry
    reported by the token endpoint; after that a new one is requested. (An
    unconditional cache would keep returning the first token even after it
    expired.)

    Returns:
        str: The access token to use as a Bearer token.
    """
    import datetime
    from google.oauth2 import _client as google_auth_client
    import google.auth.transport.urllib3 as google_auth_urllib3
    import urllib3

    cached_token = _token_cache["token"]
    if cached_token:
        expiry = _token_cache["expiry"]
        if expiry is None:
            # No expiry was reported: keep reusing the token indefinitely.
            return cached_token
        now = datetime.datetime.now(datetime.timezone.utc)
        if expiry.tzinfo is None:
            # NOTE(review): google-auth appears to report expiry as a naive
            # UTC datetime; compare like with like either way.
            now = now.replace(tzinfo=None)
        margin = datetime.timedelta(seconds=_TOKEN_REFRESH_MARGIN_SECONDS)
        if now < expiry - margin:
            return cached_token

    gcloud_default, _project = google.auth.default()
    http = urllib3.PoolManager()
    request = google_auth_urllib3.Request(http)
    token_uri = 'https://oauth2.googleapis.com/token'
    scopes = ['https://www.googleapis.com/auth/cloud-platform']

    access_token, _, expiry, _ = google_auth_client.refresh_grant(
        request,
        token_uri,
        gcloud_default.refresh_token,
        gcloud_default.client_id,
        gcloud_default.client_secret,
        scopes,
    )
    _token_cache["token"] = access_token
    _token_cache["expiry"] = expiry
    return access_token
|
||||
|
||||
|
||||
async def tts(
    text: str,
    google_project: str = "lisa-448004",
    language_code: str = "nl-NL",
    ssml_gender: str = "FEMALE",
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    name: str = "nl-NL-Standard-D",
    ignore_chars=None,
):
    """Synthesize ``text`` with Google Text-to-Speech and play the result.

    Every substring in ``ignore_chars`` is removed from the text first. If
    nothing but whitespace remains, the function returns without contacting
    the API. Otherwise the MP3 returned by the API is written to a uniquely
    named file in the current directory, played, and deleted again.

    Args:
        text: The text to speak.
        google_project: Project sent in the ``X-Goog-User-Project`` header.
        language_code: Voice language code, e.g. ``"nl-NL"``.
        ssml_gender: Voice gender sent as ``ssmlGender``.
        speaking_rate: Value sent as ``speakingRate``.
        pitch: Value sent as ``pitch``.
        name: Voice name.
        ignore_chars: Substrings to strip from ``text``; defaults to
            ``IGNORE_CHARS``.

    Returns:
        None.

    Raises:
        RuntimeError: If the API answers with a non-200 status or the
            response carries no ``audioContent``.
    """
    if ignore_chars is None:
        ignore_chars = IGNORE_CHARS

    url = "https://texttospeech.googleapis.com/v1/text:synthesize"

    # Remove markdown
    for char in ignore_chars:
        text = text.replace(char, "")
    text = text.strip()
    if not text:
        return

    headers = {
        "Authorization": f"Bearer {google_token()}",
        "Content-Type": "application/json",
        "X-Goog-User-Project": google_project,
    }
    data = {
        "input": {
            "text": text,
        },
        "voice": {
            "languageCode": language_code,
            "name": name,
            "ssmlGender": ssml_gender,
        },
        "audioConfig": {
            "audioEncoding": "MP3",
            "speakingRate": speaking_rate,
            "pitch": pitch,
        },
    }

    async with aiohttp.ClientSession() as session:
        # The context manager releases the connection even on errors.
        async with session.post(url, headers=headers, json=data) as response:
            if response.status != 200:
                detail = await response.text()
                raise RuntimeError(
                    f"Text-to-Speech request failed with HTTP {response.status}: {detail}"
                )
            response_json = await response.json()

    audio_content = response_json.get("audioContent")
    if not audio_content:
        raise RuntimeError("Text-to-Speech response contained no audioContent")

    file = pathlib.Path(str(uuid.uuid4()) + ".mp3")
    try:
        file.write_bytes(base64.b64decode(audio_content))
        # NOTE: play_audio is called synchronously, so the event loop is
        # occupied for the duration of the playback.
        play_audio(file)
    finally:
        # Never leave the temporary file behind, even if playback fails.
        if file.exists():
            file.unlink()
    return
|
||||
|
||||
|
||||
def oud(file_path=None, language_code="en-US", sample_rate_hertz=16000):
    """Transcribe a LINEAR16 audio file with the Google Cloud Speech client.

    Legacy helper. It previously referenced an undefined ``speech`` module
    and ``file_path`` variable and therefore could not run; both are now
    provided explicitly.

    Args:
        file_path: Path of the audio file to transcribe. Required.
        language_code: Language code passed to the recognition config.
        sample_rate_hertz: Sample rate passed to the recognition config.

    Returns:
        list: The first alternative's transcript of every result, in order.
        Each transcript is also printed.

    Raises:
        ValueError: If ``file_path`` is not given.
    """
    if file_path is None:
        raise ValueError("oud() requires the path of the audio file to transcribe")

    # Imported lazily: nothing else in this module needs google-cloud-speech.
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(file_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )
    response = client.recognize(config=config, audio=audio)

    transcripts = []
    for result in response.results:
        transcript = result.alternatives[0].transcript
        print("Transcript:", transcript)
        transcripts.append(transcript)
    return transcripts
|
||||
|
||||
|
||||
async def main_async():
    """Self-test: obtain a Google access token, then speak a test sentence."""
    # Fetch the token first so credential problems surface before the audio
    # request. The token is a bearer credential, so it is deliberately not
    # printed to the console.
    google_token()
    print("Google access token obtained.")
    await tts("If you hear this sentence, the google part works fine. Congrats.")
|
||||
|
||||
def main():
    """Console entry point: run the asynchronous self-test to completion."""
    self_test = main_async()
    asyncio.run(self_test)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
76
src/rspeech/play.py
Normal file
76
src/rspeech/play.py
Normal file
@ -0,0 +1,76 @@
|
||||
# Written by retoor@molodetz.nl
|
||||
|
||||
# This module plays audio files, either through pygame's mixer or by decoding them with ffmpeg and streaming the raw PCM data to PyAudio.
|
||||
|
||||
# Libraries imported: 'pyaudio', 'functools', 'os', 'subprocess', 'sys', 'pygame'
|
||||
|
||||
# The MIT License (MIT)
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
import pyaudio
|
||||
import functools
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import pygame
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def play_audio(filename):
    """Play an audio file with pygame's mixer and block until it has finished.

    Args:
        filename: The file to play; passed unchanged to
            ``pygame.mixer.music.load``.
    """
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    # One clock for the whole wait loop; tick(10) then paces the polling at
    # roughly ten checks per second.
    clock = pygame.time.Clock()
    try:
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    finally:
        # Release the file so the caller can delete it afterwards.
        pygame.mixer.music.stop()
        # unload() only exists in pygame 2.0.0 and later.
        unload = getattr(pygame.mixer.music, "unload", None)
        if unload is not None:
            unload()
|
||||
|
||||
|
||||
def play_audio2(filename):
    """Decode an audio file with ffmpeg and stream it to PyAudio.

    ffmpeg converts the input to raw signed 16-bit little-endian PCM
    (44100 Hz, 2 channels) on its stdout; the data is read in chunks and
    written to a PyAudio output stream until ffmpeg's output ends.

    Args:
        filename: Path of the audio file handed to ``ffmpeg -i``.
    """
    sample_rate = 44100
    channels = 2
    sample_width = 2  # bytes per sample, matching the s16le format
    chunk_size = 4096

    ffmpeg_cmd = [
        "ffmpeg",
        "-i", filename,
        "-f", "s16le",
        "-ar", str(sample_rate),
        "-ac", str(channels),
        "pipe:1",
    ]
    # stderr is discarded rather than piped: nothing reads it here, and an
    # undrained pipe can fill up and stall ffmpeg. stdin is detached so
    # ffmpeg cannot consume the caller's terminal input.
    process = subprocess.Popen(
        ffmpeg_cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        bufsize=10**6,
    )

    p = None
    stream = None
    try:
        p = pyaudio.PyAudio()
        stream = p.open(
            format=p.get_format_from_width(sample_width),
            channels=channels,
            rate=sample_rate,
            output=True,
        )
        while True:
            data = process.stdout.read(chunk_size)
            if not data:
                break
            stream.write(data)
    finally:
        # Clean up whatever was successfully created, including the ffmpeg
        # process when audio setup itself failed.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        if p is not None:
            p.terminate()
        process.stdout.close()
        process.wait()
|
103
src/rspeech/stt.py
Normal file
103
src/rspeech/stt.py
Normal file
@ -0,0 +1,103 @@
|
||||
# Written by retoor@molodetz.nl
|
||||
|
||||
# This script listens to audio input via a microphone and recognizes speech using the Google API, optionally saving the captured audio to a file. It also provides an asyncio wrapper and a small command-line loop that prints what was heard.
|
||||
|
||||
# Imports:
|
||||
# - speech_recognition: For speech recognition functionality.
|
||||
# - asyncio, logging, time: Standard-library modules for the async wrapper, logging and pacing the demo loop.
|
||||
# - rspeech.gcloud: Local Google Cloud helper module (imported but not used in this file).
|
||||
|
||||
# MIT License
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
import speech_recognition as sr
|
||||
from rspeech import gcloud
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def listen(
    timeout: int = 0,
    phrase_time_limit: int = 0,
    language: str = "nl-NL",
    adjust_ambiance_seconds: int = 1,
    save_to=None,
    recognize=True,
):
    """Listen on the microphone and recognize speech using the Google API.

    No credentials or gcloud account are required for this function. It keeps
    listening until a phrase has been captured (and, when ``recognize`` is
    true, successfully recognized); timeouts, unintelligible audio and failed
    recognition requests all lead to another listening attempt.

    Args:
        timeout (int): The maximum amount of time in seconds to wait for
            speech to start. Passed unchanged to ``Recognizer.listen``.
            NOTE(review): 0 appears to be treated as "no limit" by
            speech_recognition - confirm.
        phrase_time_limit (int): The maximum amount of time in seconds for a
            single phrase of speech. Passed unchanged to
            ``Recognizer.listen``.
        language (str): The language code for the speech recognition.
            Default is "nl-NL". For English use "en-US".
        adjust_ambiance_seconds (int): The number of seconds to adjust to
            ambient noise. Default is 1; a falsy value skips the adjustment.
        save_to (str): The path to save the captured audio (WAV data) to.
            Default is None.
        recognize (bool): Whether to recognize speech or not. Default is
            True.

    Returns:
        str: The recognized speech as a string, or True if recognize is set
        to False.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        if adjust_ambiance_seconds:
            logger.info(
                "Adjusting to surroundings for %s seconds.",
                adjust_ambiance_seconds,
            )
            recognizer.adjust_for_ambient_noise(source, duration=adjust_ambiance_seconds)

        while True:
            logger.info("Listening...")
            try:
                audio_data = recognizer.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=phrase_time_limit,
                )
            except sr.WaitTimeoutError:
                # Nobody spoke within the timeout; try again.
                continue

            if save_to:
                with open(save_to, "wb") as f:
                    f.write(audio_data.get_wav_data())
                # Log only after the data has actually been written.
                logger.debug("Saved to %s", save_to)

            if not recognize:
                logger.info("Recognition is disabled so returning True.")
                return True

            try:
                text = recognizer.recognize_google(audio_data, language=language)
            except sr.UnknownValueError:
                # The audio could not be understood; listen again.
                continue
            except sr.RequestError as error:
                # Keep the retry behaviour, but leave a trace so repeated
                # failures of the recognition service are not invisible.
                logger.warning(
                    "Speech recognition request failed, listening again: %s",
                    error,
                )
                continue

            logger.info("Returning %s", text)
            return text
|
||||
|
||||
async def listen_async(
    timeout: int = 0,
    phrase_time_limit: int = 0,
    language: str = "nl-NL",
    adjust_ambiance_seconds: int = 1,
    save_to=None,
    recognize=True,
):
    """Awaitable wrapper around ``listen``.

    Runs the blocking ``listen`` call in the event loop's default executor,
    forwarding every argument unchanged, and returns whatever ``listen``
    returns.
    """
    listen_kwargs = {
        "timeout": timeout,
        "phrase_time_limit": phrase_time_limit,
        "language": language,
        "adjust_ambiance_seconds": adjust_ambiance_seconds,
        "save_to": save_to,
        "recognize": recognize,
    }
    event_loop = asyncio.get_event_loop()
    result = await event_loop.run_in_executor(None, lambda: listen(**listen_kwargs))
    return result
|
||||
|
||||
|
||||
async def main_async():
    """Demo loop: listen, print the result, pause three seconds, repeat."""
    while True:
        print("Listening...")
        print(await listen_async())
        # asyncio.sleep yields to the event loop during the pause;
        # time.sleep would block it.
        await asyncio.sleep(3)
|
||||
|
||||
def main():
    """Console entry point: run the listening loop until interrupted."""
    listening_loop = main_async()
    try:
        asyncio.run(listening_loop)
    except KeyboardInterrupt:
        # Ctrl-C is the way to stop the endless loop; exit without a traceback.
        return


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
18
src/rspeech/tts.py
Normal file
18
src/rspeech/tts.py
Normal file
@ -0,0 +1,18 @@
|
||||
import asyncio
|
||||
from rspeech.gcloud import tts
|
||||
|
||||
|
||||
|
||||
async def main_async():
    """Interactive loop: speak every line typed until an empty line or EOF."""
    await tts("Type a few times return to stop.")
    while True:
        try:
            text = input("> ").strip()
        except EOFError:
            # Closed or exhausted stdin (Ctrl-D, piped input) ends the
            # session the same way an empty line does.
            break
        if not text:
            break
        await tts(text)
|
||||
|
||||
def main():
    """Console entry point: run the interactive text-to-speech loop."""
    interactive_session = main_async()
    asyncio.run(interactive_session)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user