# zh_en.py

# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: LicenseRef-.amazon.com.-AmznSL-1.0
# Licensed under the Amazon Software License  http://aws.amazon.com/asl/

import boto3
import numpy as np
import pyaudio
import wave
import sys
import os
import websocket
import samplerate as sr
import threading
import time
import aiofile
import asyncio
#import sounddevice
from amazon_transcribe.client import TranscribeStreamingClient
from amazon_transcribe.handlers import TranscriptResultStreamHandler
from amazon_transcribe.model import Result, Transcript, TranscriptEvent
from pydub import AudioSegment
from pydub.utils import make_chunks
from pydub.playback import play
# Source and target sample rates (presumably Hz); their quotient is the
# resampling ratio computed further down in this file.
input_rate = 44100
target_rate = 32000
# Frames per buffer requested when the PyAudio input stream is opened.
defaultframes = 1024
class textcolors:
    """Escape sequences used to colour console messages.

    On Windows (``os.name == 'nt'``) every attribute is the empty string;
    on any other platform the attributes hold ANSI colour codes, with
    ``end`` resetting the colour.
    """
    if os.name == 'nt':
        # No colour codes on Windows: all markers collapse to nothing.
        blue = green = warning = fail = end = ''
    else:
        blue = '\033[94m'
        green = '\033[92m'
        warning = '\033[93m'
        fail = '\033[91m'
        end = '\033[0m'
# Module-level state read by the functions defined later in this file.
recorded_frames = []
device_info = {}
useloopback = False
recordtime = 100
# Use module
p = pyaudio.PyAudio()
# Default to the system's default input device, or (below) to the first
# device listed when there is no default input.
try:
    # get_default_input_device_info() returns a device-info dict; keep only
    # its integer index, because the prompt and the lookups below work with
    # indices.
    default_device_index = p.get_default_input_device_info()["index"]
except IOError:
    default_device_index = -1
# Select device
print(textcolors.blue + "Available devices:\n" + textcolors.end)
for i in range(0, p.get_device_count()):
    info = p.get_device_info_by_index(i)
    print(textcolors.green + str(info["index"]) + textcolors.end + ": \t %s \n \t %s \n" % (info["name"], p.get_host_api_info_by_index(info["hostApi"])["name"]))
    if default_device_index == -1:
        default_device_index = info["index"]
# Handle no devices available
if default_device_index == -1:
    print(textcolors.fail + "No device available. Quitting." + textcolors.end)
    p.terminate()
    sys.exit(1)
# Get input or default
try:
    device_id = int(input("Choose device [" + textcolors.blue + str(default_device_index) + textcolors.end + "]: ") or default_device_index)
except ValueError:
    # A non-numeric answer cannot name a device; fall back to the default.
    device_id = default_device_index
    print(textcolors.warning + "Selection not available, using default." + textcolors.end)
print("")
# Get device info
try:
    device_info = p.get_device_info_by_index(device_id)
except IOError:
    device_info = p.get_device_info_by_index(default_device_index)
    print(textcolors.warning + "Selection not available, using default." + textcolors.end)
# Choose between loopback or standard mode
is_input = device_info["maxInputChannels"] > 0
is_wasapi = "WASAPI" in p.get_host_api_info_by_index(device_info["hostApi"])["name"]
if is_input:
    print(textcolors.blue + "Selection is input using standard mode.\n" + textcolors.end)
elif is_wasapi:
    useloopback = True
    print(textcolors.green + "Selection is output. Using loopback mode.\n" + textcolors.end)
else:
    # Reached only for devices without input channels on a non-WASAPI host.
    print(textcolors.fail + "Selection is output and does not support loopback mode. Quitting.\n" + textcolors.end)
    p.terminate()
    sys.exit(1)
try:
    recordtime = int(input("Record time in seconds [" + textcolors.blue + str(recordtime) + textcolors.end + "]: ") or recordtime)
except ValueError:
    # Keep the default record time when the answer is not an integer.
    print(textcolors.warning + "Invalid record time, using default." + textcolors.end)
# Shared resampler and the ratio between the target and source rates.
resampler = sr.Resampler()
ratio = target_rate / input_rate
def resample(chunk, target_rate=32000):
    """Resample a buffer of 16-bit PCM samples to ``target_rate``.

    Args:
        chunk: Bytes-like object interpreted as ``int16`` samples.
        target_rate: Desired output rate. The conversion ratio is
            ``target_rate / input_rate``, where ``input_rate`` is the
            module-level source rate.

    Returns:
        Whatever the module-level ``resampler.process`` returns for the
        decoded samples at that ratio.
    """
    # np.frombuffer is the supported way to view raw bytes as an array;
    # np.fromstring's binary mode is deprecated.
    samples = np.frombuffer(chunk, dtype=np.int16)
    return resampler.process(samples, target_rate / input_rate)
# Pre-recorded audio file.
# NOTE(review): no active code visible in this file reads this name - confirm
# before removing.
filename = "zoom_test.wav"
# Amazon Translate client; the transcript handler calls translate_text() on it.
translate = boto3.client(service_name='translate', region_name='us-west-2', use_ssl=True)
# Transcript accumulator; the only statements that would update it are
# commented out in the transcript handler.
transcription = ''
async def mic_stream():
    """Yield raw audio buffers captured from the selected PyAudio device.

    Opens a callback-driven PyAudio input stream on the device described by
    the module-level ``device_info`` and forwards every buffer the callback
    receives through an ``asyncio.Queue``.

    Capture is bounded by the module-level ``recordtime``: the generator
    yields ``int(rate / defaultframes * recordtime)`` buffers and then ends,
    so the consumer can finish the transcription stream. The PyAudio stream
    is stopped and closed when the generator finishes or is closed early.

    Yields:
        The ``indata`` buffer handed to the PyAudio stream callback.
    """
    loop = asyncio.get_event_loop()
    input_queue = asyncio.Queue()

    def callback(indata, frame_count, time_info, status):
        # Called by PyAudio outside the event loop, so hand the buffer over
        # with call_soon_threadsafe instead of touching the queue directly.
        loop.call_soon_threadsafe(input_queue.put_nowait, indata)
        # Input-only stream: there is no output data to hand back.
        return (None, pyaudio.paContinue)

    # Be sure to use the correct parameters for the audio stream that matches
    # the audio formats described for the source language you'll be using:
    # https://docs.aws.amazon.com/transcribe/latest/dg/streaming.html
    print(device_info)
    sample_rate = int(device_info["defaultSampleRate"])
    # Use whichever of the device's input/output channel counts is larger.
    channelcount = max(device_info["maxInputChannels"], device_info["maxOutputChannels"])
    # Number of callback buffers that add up to the requested record time.
    buffer_count = int(sample_rate / defaultframes * recordtime)
    # NOTE: `useloopback` is not passed to open(); the `as_loopback` argument
    # was already disabled in this script.
    stream = p.open(format=pyaudio.paInt16,
                    channels=channelcount,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=defaultframes,
                    input_device_index=device_info["index"],
                    stream_callback=callback)
    try:
        # Initiate the audio stream and asynchronously yield the audio chunks
        # as they become available.
        stream.start_stream()
        print("started stream")
        for _ in range(buffer_count):
            yield await input_queue.get()
    finally:
        # Always release the audio device, even if the consumer stops early.
        stream.stop_stream()
        stream.close()
class MyEventHandler(TranscriptResultStreamHandler):
    """Translate finished transcripts to English and speak them."""

    async def handle_transcript_event(self, transcript_event: TranscriptEvent):
        """Print each transcript and translate/speak the final ones.

        For every result in the event, the first alternative's transcript is
        printed. Results that are explicitly final (``is_partial`` is
        ``False``), come from channel ``"ch_0"`` and contain text are then
        translated and spoken.

        Args:
            transcript_event: Event received from the transcription stream.
        """
        results = transcript_event.transcript.results
        print("firing outputs..", results)
        for result in results:
            if not result.alternatives:
                continue
            transcript = result.alternatives[0].transcript
            print("transcript:", transcript)
            print(result.channel_id)
            # Only act on results explicitly marked as final.
            if getattr(result, "is_partial", None) is not False:
                continue
            # Translate only 1 channel; the other channel is a duplicate.
            if result.channel_id != "ch_0":
                continue
            # Skip blank transcripts rather than sending empty text on.
            if not transcript or not transcript.strip():
                continue
            # The boto3 calls and audio playback block, so run them in a
            # worker thread; otherwise the event loop could not keep sending
            # audio chunks while they are in progress.
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(None, self._translate_and_speak, transcript)

    def _translate_and_speak(self, transcript):
        """Translate ``transcript`` from zh to en, print it, and speak it."""
        trans_result = translate.translate_text(
            Text=transcript,
            SourceLanguageCode="zh",
            TargetLanguageCode="en",
        )
        text = trans_result.get("TranslatedText")
        print("translated text in en:" + text)
        aws_polly_tts(text)
# Amazon Polly client; aws_polly_tts() calls synthesize_speech() on it.
polly = boto3.client('polly', region_name = 'us-west-2')
def aws_polly_tts(text):
    """Synthesize ``text`` with Amazon Polly and play the result.

    The MP3 returned by Polly is written to ``zh_results_en.mp3`` in the
    current working directory (overwriting any previous file), loaded with
    pydub and played back.

    Args:
        text: Text to speak. Empty or ``None`` text is ignored.

    Returns:
        None.
    """
    if not text:
        # Nothing to say, so skip the service call and the playback.
        return
    response = polly.synthesize_speech(
        Engine='standard',
        LanguageCode='en-US',
        Text=text,
        VoiceId="Joanna",
        OutputFormat="mp3",
    )
    # TODO: start playback as soon as the buffer fills in, see
    # https://aws.amazon.com/blogs/machine-learning/building-a-reliable-text-to-speech-service-with-amazon-polly/
    audio_stream = response['AudioStream']
    try:
        mp3_bytes = audio_stream.read()
    finally:
        # Release the underlying HTTP response whether or not read() worked.
        audio_stream.close()
    # Write to a file
    with open('zh_results_en.mp3', 'wb') as mp3_file:
        mp3_file.write(mp3_bytes)
    # Playback
    play(AudioSegment.from_file("zh_results_en.mp3"))
async def transcribe():
    """Stream captured audio to Amazon Transcribe, then save the recording.

    Starts a zh-CN streaming transcription, feeds it the chunks produced by
    ``mic_stream()`` while ``MyEventHandler`` consumes the resulting events,
    and, once both have finished, prompts for a file name and writes the
    captured audio to a WAV file.
    """
    # Set up our client with our chosen AWS region
    client = TranscribeStreamingClient(region="us-west-2")
    sample_rate = int(device_info["defaultSampleRate"])
    # NOTE(review): the request always declares 2 channels, while the capture
    # stream uses the device's channel count - confirm they agree for the
    # devices in use.
    stream = await client.start_stream_transcription(
        language_code="zh-CN",
        media_sample_rate_hz=sample_rate,
        number_of_channels=2,
        enable_channel_identification=True,
        media_encoding="pcm",
    )
    recorded_frames = []

    async def write_chunks(stream):
        # This connects the raw audio chunks generator coming from the
        # microphone and passes them along to the transcription stream.
        print("getting mic stream")
        async for chunk in mic_stream():
            recorded_frames.append(chunk)
            await stream.input_stream.send_audio_event(audio_chunk=chunk)
        await stream.input_stream.end_stream()

    print("1")
    handler = MyEventHandler(stream.output_stream)
    print("2")
    await asyncio.gather(write_chunks(stream), handler.handle_events())
    filename = input("Save as [" + textcolors.blue + "outtesting.wav" + textcolors.end + "]: ") or "outtesting.wav"
    # The WAV header must describe the audio as it was captured: mic_stream()
    # opens the device with the larger of its input/output channel counts.
    channelcount = max(device_info["maxInputChannels"], device_info["maxOutputChannels"])
    # The context manager closes the file even if writing fails.
    with wave.open(filename, 'wb') as wave_file:
        wave_file.setnchannels(channelcount)
        wave_file.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wave_file.setframerate(sample_rate)
        wave_file.writeframes(b''.join(recorded_frames))
    print("3")
# Run the transcription coroutine on a dedicated event loop. Creating the loop
# explicitly avoids relying on asyncio.get_event_loop() creating one
# implicitly, which is deprecated.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(transcribe())
finally:
    # Release the event loop and PortAudio even if transcription fails or the
    # user interrupts the script.
    loop.close()
    p.terminate()
print("done")