Recherche avancée
Sur d’autres sites (233)
-
Google cloud speech to text not giving output for OGG & MP3 files
27 avril 2021, par Vedant JumleI am trying to perform speech to text on a bunch of audio files which are over 10 mins long. I don't want to waste storage on the cloud bucket by straight-up uploading wav files on it. So I am using
ffmpeg
to convert the files either to ogg or mp3 like :
ffmpeg -y -i audio.wav -ar 12000 -r 16000 audio.mp3


ffmpeg -y -i audio.wav -ar 12000 -r 16000 audio.ogg


For testing purpose I ran the speech to text service on a dummy wav file and it seemed to work, I got the text as expected. But for some reason it isn't detecting any speech when I use the ogg or mp3 file. I could not give amr files to work either.


My code :


def transcribe_gcs(gcs_uri):
 client = speech.SpeechClient()

 audio = speech.RecognitionAudio(uri=gcs_uri)
 config = speech.RecognitionConfig(
 encoding="OGG_OPUS", #replace with "LINEAR16" for wav, "OGG_OPUS" for ogg, "AMR" for amr
 sample_rate_hertz=16000,
 language_code="en-US",
 )
 print("starting operation")
 operation = client.long_running_recognize(config=config, audio=audio)
 response = operation.result()
 print(response)



I have set up the authentication properly, so that is not a problem.


When I run the speech to text service on the same audio but in ogg or mp3(I just comment out the encoding setting from the config for mp3) format, it gives no response, just prints out a line break and done.


What can I do to fix this ?


-
Google Speech - Streaming Request Returns EOF
9 octobre 2017, par JoshUsing Go, I’m taking a RTMP stream, transcoding it to FLAC (using ffmpeg) and attempting to stream to Google’s Speech API to transcribe the audio. However, I keep getting
EOF
errors when sending the data. I can’t find any information on this error in the docs so I’m not exactly sure what’s causing it.I’m chunking the received data into 3s clips (length isn’t relevant as long as it’s less than the maximum length of a streaming recognition request).
Here is the core of my code :
func main() {
done := make(chan os.Signal)
received := make(chan []byte)
go receive(received)
go transcribe(received)
signal.Notify(done, os.Interrupt, syscall.SIGTERM)
select {
case <-done:
os.Exit(0)
}
}
func receive(received chan<- []byte) {
var b bytes.Buffer
stdout := bufio.NewWriter(&b)
cmd := exec.Command("ffmpeg", "-i", "rtmp://127.0.0.1:1935/live/key", "-f", "flac", "-ar", "16000", "-")
cmd.Stdout = stdout
if err := cmd.Start(); err != nil {
log.Fatal(err)
}
duration, _ := time.ParseDuration("3s")
ticker := time.NewTicker(duration)
for {
select {
case <-ticker.C:
stdout.Flush()
log.Printf("Received %d bytes", b.Len())
received <- b.Bytes()
b.Reset()
}
}
}
func transcribe(received <-chan []byte) {
ctx := context.TODO()
client, err := speech.NewClient(ctx)
if err != nil {
log.Fatal(err)
}
stream, err := client.StreamingRecognize(ctx)
if err != nil {
log.Fatal(err)
}
// Send the initial configuration message.
if err = stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
StreamingConfig: &speechpb.StreamingRecognitionConfig{
Config: &speechpb.RecognitionConfig{
Encoding: speechpb.RecognitionConfig_FLAC,
LanguageCode: "en-GB",
SampleRateHertz: 16000,
},
},
},
}); err != nil {
log.Fatal(err)
}
for {
select {
case data := <-received:
if len(data) > 0 {
log.Printf("Sending %d bytes", len(data))
if err := stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
AudioContent: data,
},
}); err != nil {
log.Printf("Could not send audio: %v", err)
}
}
}
}
}Running this code gives this output :
2017/10/09 16:05:00 Received 191704 bytes
2017/10/09 16:05:00 Saving 191704 bytes
2017/10/09 16:05:00 Sending 191704 bytes
2017/10/09 16:05:00 Could not send audio: EOF
2017/10/09 16:05:03 Received 193192 bytes
2017/10/09 16:05:03 Saving 193192 bytes
2017/10/09 16:05:03 Sending 193192 bytes
2017/10/09 16:05:03 Could not send audio: EOF
2017/10/09 16:05:06 Received 193188 bytes
2017/10/09 16:05:06 Saving 193188 bytes
2017/10/09 16:05:06 Sending 193188 bytes // Notice that this doesn't error
2017/10/09 16:05:09 Received 191704 bytes
2017/10/09 16:05:09 Saving 191704 bytes
2017/10/09 16:05:09 Sending 191704 bytes
2017/10/09 16:05:09 Could not send audio: EOFNotice that not all of the
Send
s fail.Could anyone point me in the right direction here ? Is it something to do with the FLAC headers or something ? I also wonder if maybe resetting the buffer causes some of the data to be dropped (i.e. it’s a non-trivial operation that actually takes some time to complete) and it doesn’t like this missing information ?
Any help would be really appreciated.
-
Google Speech - Streaming Request Returns EOF Error
16 octobre 2017, par JoshUsing Go, I’m taking a RTMP stream, transcoding it to FLAC (using ffmpeg) and attempting to stream to Google’s Speech API to transcribe the audio. However, I keep getting
EOF
errors when sending the data. I can’t find any information on this error in the docs so I’m not exactly sure what’s causing it.I’m chunking the received data into 3s clips (length isn’t relevant as long as it’s less than the maximum length of a streaming recognition request).
Here is the core of my code :
func main() {
done := make(chan os.Signal)
received := make(chan []byte)
go receive(received)
go transcribe(received)
signal.Notify(done, os.Interrupt, syscall.SIGTERM)
select {
case <-done:
os.Exit(0)
}
}
func receive(received chan<- []byte) {
var b bytes.Buffer
stdout := bufio.NewWriter(&b)
cmd := exec.Command("ffmpeg", "-i", "rtmp://127.0.0.1:1935/live/key", "-f", "flac", "-ar", "16000", "-")
cmd.Stdout = stdout
if err := cmd.Start(); err != nil {
log.Fatal(err)
}
duration, _ := time.ParseDuration("3s")
ticker := time.NewTicker(duration)
for {
select {
case <-ticker.C:
stdout.Flush()
log.Printf("Received %d bytes", b.Len())
received <- b.Bytes()
b.Reset()
}
}
}
func transcribe(received <-chan []byte) {
ctx := context.TODO()
client, err := speech.NewClient(ctx)
if err != nil {
log.Fatal(err)
}
stream, err := client.StreamingRecognize(ctx)
if err != nil {
log.Fatal(err)
}
// Send the initial configuration message.
if err = stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
StreamingConfig: &speechpb.StreamingRecognitionConfig{
Config: &speechpb.RecognitionConfig{
Encoding: speechpb.RecognitionConfig_FLAC,
LanguageCode: "en-GB",
SampleRateHertz: 16000,
},
},
},
}); err != nil {
log.Fatal(err)
}
for {
select {
case data := <-received:
if len(data) > 0 {
log.Printf("Sending %d bytes", len(data))
if err := stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
AudioContent: data,
},
}); err != nil {
log.Printf("Could not send audio: %v", err)
}
}
}
}
}Running this code gives this output :
2017/10/09 16:05:00 Received 191704 bytes
2017/10/09 16:05:00 Saving 191704 bytes
2017/10/09 16:05:00 Sending 191704 bytes
2017/10/09 16:05:00 Could not send audio: EOF
2017/10/09 16:05:03 Received 193192 bytes
2017/10/09 16:05:03 Saving 193192 bytes
2017/10/09 16:05:03 Sending 193192 bytes
2017/10/09 16:05:03 Could not send audio: EOF
2017/10/09 16:05:06 Received 193188 bytes
2017/10/09 16:05:06 Saving 193188 bytes
2017/10/09 16:05:06 Sending 193188 bytes // Notice that this doesn't error
2017/10/09 16:05:09 Received 191704 bytes
2017/10/09 16:05:09 Saving 191704 bytes
2017/10/09 16:05:09 Sending 191704 bytes
2017/10/09 16:05:09 Could not send audio: EOFNotice that not all of the
Send
s fail.Could anyone point me in the right direction here ? Is it something to do with the FLAC headers or something ? I also wonder if maybe resetting the buffer causes some of the data to be dropped (i.e. it’s a non-trivial operation that actually takes some time to complete) and it doesn’t like this missing information ?
Any help would be really appreciated.