// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "content/browser/speech/speech_recognizer_impl.h" #include #include #include "base/bind.h" #include "base/macros.h" #include "base/time/time.h" #include "build/build_config.h" #include "content/browser/browser_main_loop.h" #include "content/browser/media/media_internals.h" #include "content/browser/service_manager/service_manager_context.h" #include "content/browser/speech/audio_buffer.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/speech_recognition_event_listener.h" #include "media/audio/audio_system.h" #include "media/base/audio_converter.h" #include "media/mojo/interfaces/audio_logging.mojom.h" #include "services/audio/public/cpp/audio_system_factory.h" #include "services/audio/public/cpp/device_factory.h" #include "services/service_manager/public/mojom/connector.mojom.h" #if defined(OS_WIN) #include "media/audio/win/core_audio_util_win.h" #endif using media::AudioBus; using media::AudioConverter; using media::AudioParameters; using media::ChannelLayout; namespace content { // Private class which encapsulates the audio converter and the // AudioConverter::InputCallback. It handles resampling, buffering and // channel mixing between input and output parameters. class SpeechRecognizerImpl::OnDataConverter : public media::AudioConverter::InputCallback { public: OnDataConverter(const AudioParameters& input_params, const AudioParameters& output_params); ~OnDataConverter() override; // Converts input audio |data| bus into an AudioChunk where the input format // is given by |input_parameters_| and the output format by // |output_parameters_|. scoped_refptr Convert(const AudioBus* data); bool data_was_converted() const { return data_was_converted_; } private: // media::AudioConverter::InputCallback implementation. double ProvideInput(AudioBus* dest, uint32_t frames_delayed) override; // Handles resampling, buffering, and channel mixing between input and output // parameters. AudioConverter audio_converter_; std::unique_ptr input_bus_; std::unique_ptr output_bus_; const AudioParameters input_parameters_; const AudioParameters output_parameters_; bool data_was_converted_; DISALLOW_COPY_AND_ASSIGN(OnDataConverter); }; namespace { // The following constants are related to the volume level indicator shown in // the UI for recorded audio. // Multiplier used when new volume is greater than previous level. const float kUpSmoothingFactor = 1.0f; // Multiplier used when new volume is lesser than previous level. const float kDownSmoothingFactor = 0.7f; // RMS dB value of a maximum (unclipped) sine wave for int16_t samples. const float kAudioMeterMaxDb = 90.31f; // This value corresponds to RMS dB for int16_t with 6 most-significant-bits = // 0. // Values lower than this will display as empty level-meter. const float kAudioMeterMinDb = 30.0f; const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; // Maximum level to draw to display unclipped meter. (1.0f displays clipping.) const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; // Returns true if more than 5% of the samples are at min or max value. bool DetectClipping(const AudioChunk& chunk) { const int num_samples = chunk.NumSamples(); const int16_t* samples = chunk.SamplesData16(); const int kThreshold = num_samples / 20; int clipping_samples = 0; for (int i = 0; i < num_samples; ++i) { if (samples[i] <= -32767 || samples[i] >= 32767) { if (++clipping_samples > kThreshold) return true; } } return false; } } // namespace const int SpeechRecognizerImpl::kAudioSampleRate = 16000; const ChannelLayout SpeechRecognizerImpl::kChannelLayout = media::CHANNEL_LAYOUT_MONO; const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; media::AudioSystem* SpeechRecognizerImpl::audio_system_for_tests_ = nullptr; media::AudioCapturerSource* SpeechRecognizerImpl::audio_capturer_source_for_tests_ = nullptr; // SpeechRecognizerImpl::OnDataConverter implementation SpeechRecognizerImpl::OnDataConverter::OnDataConverter( const AudioParameters& input_params, const AudioParameters& output_params) : audio_converter_(input_params, output_params, false), input_bus_(AudioBus::Create(input_params)), output_bus_(AudioBus::Create(output_params)), input_parameters_(input_params), output_parameters_(output_params), data_was_converted_(false) { audio_converter_.AddInput(this); audio_converter_.PrimeWithSilence(); } SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { // It should now be safe to unregister the converter since no more OnData() // callbacks are outstanding at this point. audio_converter_.RemoveInput(this); } scoped_refptr SpeechRecognizerImpl::OnDataConverter::Convert( const AudioBus* data) { CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer()); data_was_converted_ = false; // Copy recorded audio to the |input_bus_| for later use in ProvideInput(). data->CopyTo(input_bus_.get()); // Convert the audio and place the result in |output_bus_|. This call will // result in a ProvideInput() callback where the actual input is provided. // However, it can happen that the converter contains enough cached data // to return a result without calling ProvideInput(). The caller of this // method should check the state of data_was_converted_() and make an // additional call if it is set to false at return. // See http://crbug.com/506051 for details. audio_converter_.Convert(output_bus_.get()); // Create an audio chunk based on the converted result. scoped_refptr chunk(new AudioChunk( output_parameters_.GetBytesPerBuffer(media::kSampleFormatS16), kNumBitsPerAudioSample / 8)); static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample == 16, "kNumBitsPerAudioSample must match interleaving type."); output_bus_->ToInterleaved( output_bus_->frames(), reinterpret_cast(chunk->writable_data())); return chunk; } double SpeechRecognizerImpl::OnDataConverter::ProvideInput( AudioBus* dest, uint32_t frames_delayed) { // Read from the input bus to feed the converter. input_bus_->CopyTo(dest); // Indicate that the recorded audio has in fact been used by the converter. data_was_converted_ = true; return 1; } // SpeechRecognizerImpl implementation SpeechRecognizerImpl::SpeechRecognizerImpl( SpeechRecognitionEventListener* listener, media::AudioSystem* audio_system, int session_id, bool continuous, bool provisional_results, SpeechRecognitionEngine* engine) : SpeechRecognizer(listener, session_id), audio_system_(audio_system), recognition_engine_(engine), endpointer_(kAudioSampleRate), is_dispatching_event_(false), provisional_results_(provisional_results), end_of_utterance_(false), state_(STATE_IDLE), weak_ptr_factory_(this) { DCHECK_CURRENTLY_ON(BrowserThread::IO); DCHECK(recognition_engine_ != nullptr); DCHECK(audio_system_ != nullptr); if (!continuous) { // In single shot (non-continous) recognition, // the session is automatically ended after: // - 0.5 seconds of silence if time < 3 seconds // - 1 seconds of silence if time >= 3 seconds endpointer_.set_speech_input_complete_silence_length( base::Time::kMicrosecondsPerSecond / 2); endpointer_.set_long_speech_input_complete_silence_length( base::Time::kMicrosecondsPerSecond); endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); } else { // In continuous recognition, the session is automatically ended after 15 // seconds of silence. const int64_t cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15; endpointer_.set_speech_input_complete_silence_length(cont_timeout_us); endpointer_.set_long_speech_length(0); // Use only a single timeout. } endpointer_.StartSession(); recognition_engine_->set_delegate(this); } // ------- Methods that trigger Finite State Machine (FSM) events ------------ // NOTE:all the external events and requests should be enqueued (PostTask), even // if they come from the same (IO) thread, in order to preserve the relationship // of causality between events and avoid interleaved event processing due to // synchronous callbacks. void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { DCHECK(!device_id.empty()); device_id_ = device_id; BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, FSMEventArgs(EVENT_PREPARE))); } void SpeechRecognizerImpl::AbortRecognition() { BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, FSMEventArgs(EVENT_ABORT))); } void SpeechRecognizerImpl::StopAudioCapture() { BrowserThread::PostTask( BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, FSMEventArgs(EVENT_STOP_CAPTURE))); } bool SpeechRecognizerImpl::IsActive() const { // Checking the FSM state from another thread (thus, while the FSM is // potentially concurrently evolving) is meaningless. DCHECK_CURRENTLY_ON(BrowserThread::IO); return state_ != STATE_IDLE && state_ != STATE_ENDED; } bool SpeechRecognizerImpl::IsCapturingAudio() const { DCHECK_CURRENTLY_ON(BrowserThread::IO); // See IsActive(). const bool is_capturing_audio = state_ >= STATE_STARTING && state_ <= STATE_RECOGNIZING; return is_capturing_audio; } const SpeechRecognitionEngine& SpeechRecognizerImpl::recognition_engine() const { return *(recognition_engine_.get()); } SpeechRecognizerImpl::~SpeechRecognizerImpl() { DCHECK_CURRENTLY_ON(BrowserThread::IO); endpointer_.EndSession(); if (GetAudioCapturerSource()) { GetAudioCapturerSource()->Stop(); audio_capturer_source_ = nullptr; } } void SpeechRecognizerImpl::Capture(const AudioBus* data, int audio_delay_milliseconds, double volume, bool key_pressed) { // Convert audio from native format to fixed format used by WebSpeech. FSMEventArgs event_args(EVENT_AUDIO_DATA); event_args.audio_data = audio_converter_->Convert(data); BrowserThread::PostTask( BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); // See http://crbug.com/506051 regarding why one extra convert call can // sometimes be required. It should be a rare case. if (!audio_converter_->data_was_converted()) { event_args.audio_data = audio_converter_->Convert(data); BrowserThread::PostTask( BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } // Something is seriously wrong here and we are most likely missing some // audio segments. CHECK(audio_converter_->data_was_converted()); } void SpeechRecognizerImpl::OnCaptureError(const std::string& message) { FSMEventArgs event_args(EVENT_AUDIO_ERROR); BrowserThread::PostTask( BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( const std::vector& results) { FSMEventArgs event_args(EVENT_ENGINE_RESULT); event_args.engine_results = mojo::Clone(results); BrowserThread::PostTask( BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } void SpeechRecognizerImpl::OnSpeechRecognitionEngineEndOfUtterance() { DCHECK(!end_of_utterance_); end_of_utterance_ = true; } void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( const blink::mojom::SpeechRecognitionError& error) { FSMEventArgs event_args(EVENT_ENGINE_ERROR); event_args.engine_error = error; BrowserThread::PostTask( BrowserThread::IO, FROM_HERE, base::BindOnce(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } // ----------------------- Core FSM implementation --------------------------- // TODO(primiano): After the changes in the media package (r129173), this class // slightly violates the SpeechRecognitionEventListener interface contract. In // particular, it is not true anymore that this class can be freed after the // OnRecognitionEnd event, since the audio_capturer_source_->Stop() asynchronous // call can be still in progress after the end event. Currently, it does not // represent a problem for the browser itself, since refcounting protects us // against such race conditions. However, we should fix this in the next CLs. void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { DCHECK_CURRENTLY_ON(BrowserThread::IO); DCHECK_LE(event_args.event, EVENT_MAX_VALUE); DCHECK_LE(state_, STATE_MAX_VALUE); // Event dispatching must be sequential, otherwise it will break all the rules // and the assumptions of the finite state automata model. DCHECK(!is_dispatching_event_); is_dispatching_event_ = true; // Guard against the delegate freeing us until we finish processing the event. scoped_refptr me(this); if (event_args.event == EVENT_AUDIO_DATA) { DCHECK(event_args.audio_data.get() != nullptr); ProcessAudioPipeline(*event_args.audio_data.get()); } // The audio pipeline must be processed before the event dispatch, otherwise // it would take actions according to the future state instead of the current. state_ = ExecuteTransitionAndGetNextState(event_args); is_dispatching_event_ = false; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( const FSMEventArgs& event_args) { const FSMEvent event = event_args.event; switch (state_) { case STATE_IDLE: switch (event) { // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and // EVENT_STOP_CAPTURE below once speech input extensions are fixed. case EVENT_ABORT: return AbortSilently(event_args); case EVENT_PREPARE: return PrepareRecognition(event_args); case EVENT_START: return NotFeasible(event_args); case EVENT_STOP_CAPTURE: return AbortSilently(event_args); case EVENT_AUDIO_DATA: // Corner cases related to queued messages case EVENT_ENGINE_RESULT: // being lately dispatched. case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return DoNothing(event_args); } break; case STATE_PREPARING: switch (event) { case EVENT_ABORT: return AbortSilently(event_args); case EVENT_PREPARE: return NotFeasible(event_args); case EVENT_START: return StartRecording(event_args); case EVENT_STOP_CAPTURE: return AbortSilently(event_args); case EVENT_AUDIO_DATA: // Corner cases related to queued messages case EVENT_ENGINE_RESULT: // being lately dispatched. case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return DoNothing(event_args); } break; case STATE_STARTING: switch (event) { case EVENT_ABORT: return AbortWithError(event_args); case EVENT_PREPARE: return NotFeasible(event_args); case EVENT_START: return NotFeasible(event_args); case EVENT_STOP_CAPTURE: return AbortSilently(event_args); case EVENT_AUDIO_DATA: return StartRecognitionEngine(event_args); case EVENT_ENGINE_RESULT: return NotFeasible(event_args); case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return AbortWithError(event_args); } break; case STATE_ESTIMATING_ENVIRONMENT: switch (event) { case EVENT_ABORT: return AbortWithError(event_args); case EVENT_PREPARE: return NotFeasible(event_args); case EVENT_START: return NotFeasible(event_args); case EVENT_STOP_CAPTURE: return StopCaptureAndWaitForResult(event_args); case EVENT_AUDIO_DATA: return WaitEnvironmentEstimationCompletion(event_args); case EVENT_ENGINE_RESULT: return ProcessIntermediateResult(event_args); case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return AbortWithError(event_args); } break; case STATE_WAITING_FOR_SPEECH: switch (event) { case EVENT_ABORT: return AbortWithError(event_args); case EVENT_PREPARE: return NotFeasible(event_args); case EVENT_START: return NotFeasible(event_args); case EVENT_STOP_CAPTURE: return StopCaptureAndWaitForResult(event_args); case EVENT_AUDIO_DATA: return DetectUserSpeechOrTimeout(event_args); case EVENT_ENGINE_RESULT: return ProcessIntermediateResult(event_args); case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return AbortWithError(event_args); } break; case STATE_RECOGNIZING: switch (event) { case EVENT_ABORT: return AbortWithError(event_args); case EVENT_PREPARE: return NotFeasible(event_args); case EVENT_START: return NotFeasible(event_args); case EVENT_STOP_CAPTURE: return StopCaptureAndWaitForResult(event_args); case EVENT_AUDIO_DATA: return DetectEndOfSpeech(event_args); case EVENT_ENGINE_RESULT: return ProcessIntermediateResult(event_args); case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return AbortWithError(event_args); } break; case STATE_WAITING_FINAL_RESULT: switch (event) { case EVENT_ABORT: return AbortWithError(event_args); case EVENT_PREPARE: return NotFeasible(event_args); case EVENT_START: return NotFeasible(event_args); case EVENT_STOP_CAPTURE: case EVENT_AUDIO_DATA: return DoNothing(event_args); case EVENT_ENGINE_RESULT: return ProcessFinalResult(event_args); case EVENT_ENGINE_ERROR: case EVENT_AUDIO_ERROR: return AbortWithError(event_args); } break; // TODO(primiano): remove this state when speech input extensions support // will be removed and STATE_IDLE.EVENT_ABORT,EVENT_STOP_CAPTURE will be // reset to NotFeasible (see TODO above). case STATE_ENDED: return DoNothing(event_args); } return NotFeasible(event_args); } // ----------- Contract for all the FSM evolution functions below ------------- // - Are guaranteed to be executed in the IO thread; // - Are guaranteed to be not reentrant (themselves and each other); // - event_args members are guaranteed to be stable during the call; // - The class won't be freed in the meanwhile due to callbacks; // TODO(primiano): the audio pipeline is currently serial. However, the // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. // We should profile the execution to see if it would be worth or not. void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING; const bool route_to_sr_engine = route_to_endpointer; const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && state_ <= STATE_RECOGNIZING; const bool clip_detected = DetectClipping(raw_audio); float rms = 0.0f; num_samples_recorded_ += raw_audio.NumSamples(); if (route_to_endpointer) endpointer_.ProcessAudio(raw_audio, &rms); if (route_to_vumeter) { DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. UpdateSignalAndNoiseLevels(rms, clip_detected); } if (route_to_sr_engine) { DCHECK(recognition_engine_.get() != nullptr); recognition_engine_->TakeAudioChunk(raw_audio); } } void SpeechRecognizerImpl::OnDeviceInfo( const base::Optional& params) { DCHECK_CURRENTLY_ON(BrowserThread::IO); device_params_ = params.value_or(AudioParameters()); DVLOG(1) << "Device parameters: " << device_params_.AsHumanReadableString(); DispatchEvent(FSMEventArgs(EVENT_START)); } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::PrepareRecognition( const FSMEventArgs&) { DCHECK(state_ == STATE_IDLE); DCHECK(recognition_engine_.get() != nullptr); DCHECK(!IsCapturingAudio()); GetAudioSystem()->GetInputStreamParameters( device_id_, base::BindOnce(&SpeechRecognizerImpl::OnDeviceInfo, weak_ptr_factory_.GetWeakPtr())); listener()->OnRecognitionStart(session_id()); return STATE_PREPARING; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { DCHECK(state_ == STATE_PREPARING); DCHECK(recognition_engine_.get() != nullptr); DCHECK(!IsCapturingAudio()); DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; num_samples_recorded_ = 0; audio_level_ = 0; end_of_utterance_ = false; int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); if (!device_params_.IsValid()) { DLOG(ERROR) << "Audio input device not found"; return Abort(blink::mojom::SpeechRecognitionError( blink::mojom::SpeechRecognitionErrorCode::kAudioCapture, blink::mojom::SpeechAudioErrorDetails::kNoMic)); } // Audio converter shall provide audio based on these parameters as output. // Hard coded, WebSpeech specific parameters are utilized here. int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; AudioParameters output_parameters = AudioParameters(AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, frames_per_buffer); DVLOG(1) << "SRI::output_parameters: " << output_parameters.AsHumanReadableString(); // Audio converter will receive audio based on these parameters as input. // On Windows we start by verifying that Core Audio is supported. If not, // the WaveIn API is used and we might as well avoid all audio conversations // since WaveIn does the conversion for us. // TODO(henrika): this code should be moved to platform dependent audio // managers. bool use_native_audio_params = true; #if defined(OS_WIN) use_native_audio_params = media::CoreAudioUtil::IsSupported(); DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; #endif AudioParameters input_parameters = output_parameters; // AUDIO_FAKE means we are running a test. if (use_native_audio_params && device_params_.format() != media::AudioParameters::AUDIO_FAKE) { // Use native audio parameters but avoid opening up at the native buffer // size. Instead use same frame size (in milliseconds) as WebSpeech uses. // We rely on internal buffers in the audio back-end to fulfill this request // and the idea is to simplify the audio conversion since each Convert() // call will then render exactly one ProvideInput() call. input_parameters = device_params_; frames_per_buffer = ((input_parameters.sample_rate() * chunk_duration_ms) / 1000.0) + 0.5; input_parameters.set_frames_per_buffer(frames_per_buffer); DVLOG(1) << "SRI::input_parameters: " << input_parameters.AsHumanReadableString(); } // Create an audio converter which converts data between native input format // and WebSpeech specific output format. audio_converter_.reset( new OnDataConverter(input_parameters, output_parameters)); // The endpointer needs to estimate the environment/background noise before // starting to treat the audio as user input. We wait in the state // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching // to user input mode. endpointer_.SetEnvironmentEstimationMode(); CreateAudioCapturerSource(); GetAudioCapturerSource()->Initialize(input_parameters, this); GetAudioCapturerSource()->Start(); return STATE_STARTING; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { // This is the first audio packet captured, so the recognition engine is // started and the delegate notified about the event. DCHECK(recognition_engine_.get() != nullptr); recognition_engine_->StartRecognition(); listener()->OnAudioStart(session_id()); // This is a little hack, since TakeAudioChunk() is already called by // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping // the first audio chunk captured after opening the audio device. recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get())); return STATE_ESTIMATING_ENVIRONMENT; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { DCHECK(endpointer_.IsEstimatingEnvironment()); if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { endpointer_.SetUserInputMode(); listener()->OnEnvironmentEstimationComplete(session_id()); return STATE_WAITING_FOR_SPEECH; } else { return STATE_ESTIMATING_ENVIRONMENT; } } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { if (endpointer_.DidStartReceivingSpeech()) { listener()->OnSoundStart(session_id()); return STATE_RECOGNIZING; } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { return Abort(blink::mojom::SpeechRecognitionError( blink::mojom::SpeechRecognitionErrorCode::kNoSpeech, blink::mojom::SpeechAudioErrorDetails::kNone)); } return STATE_WAITING_FOR_SPEECH; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { if (end_of_utterance_ || endpointer_.speech_input_complete()) return StopCaptureAndWaitForResult(event_args); return STATE_RECOGNIZING; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); DVLOG(1) << "Concluding recognition"; CloseAudioCapturerSource(); recognition_engine_->AudioChunksEnded(); if (state_ > STATE_WAITING_FOR_SPEECH) listener()->OnSoundEnd(session_id()); listener()->OnAudioEnd(session_id()); return STATE_WAITING_FINAL_RESULT; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) { DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR); DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR); return Abort(blink::mojom::SpeechRecognitionError( blink::mojom::SpeechRecognitionErrorCode::kNone, blink::mojom::SpeechAudioErrorDetails::kNone)); } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) { if (event_args.event == EVENT_AUDIO_ERROR) { return Abort(blink::mojom::SpeechRecognitionError( blink::mojom::SpeechRecognitionErrorCode::kAudioCapture, blink::mojom::SpeechAudioErrorDetails::kNone)); } else if (event_args.event == EVENT_ENGINE_ERROR) { return Abort(event_args.engine_error); } return Abort(blink::mojom::SpeechRecognitionError( blink::mojom::SpeechRecognitionErrorCode::kAborted, blink::mojom::SpeechAudioErrorDetails::kNone)); } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( const blink::mojom::SpeechRecognitionError& error) { DCHECK_CURRENTLY_ON(BrowserThread::IO); if (IsCapturingAudio()) CloseAudioCapturerSource(); DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; if (state_ == STATE_PREPARING) { // Cancel an outstanding reply from AudioSystem. weak_ptr_factory_.InvalidateWeakPtrs(); } // The recognition engine is initialized only after STATE_STARTING. if (state_ > STATE_STARTING) { DCHECK(recognition_engine_.get() != nullptr); recognition_engine_->EndRecognition(); } if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) listener()->OnSoundEnd(session_id()); if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) listener()->OnAudioEnd(session_id()); if (error.code != blink::mojom::SpeechRecognitionErrorCode::kNone) listener()->OnRecognitionError(session_id(), error); listener()->OnRecognitionEnd(session_id()); return STATE_ENDED; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult( const FSMEventArgs& event_args) { // In continuous recognition, intermediate results can occur even when we are // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the // recognition engine is "faster" than our endpointer). In these cases we // skip the endpointer and fast-forward to the RECOGNIZING state, with respect // of the events triggering order. if (state_ == STATE_ESTIMATING_ENVIRONMENT) { DCHECK(endpointer_.IsEstimatingEnvironment()); endpointer_.SetUserInputMode(); listener()->OnEnvironmentEstimationComplete(session_id()); } else if (state_ == STATE_WAITING_FOR_SPEECH) { listener()->OnSoundStart(session_id()); } else { DCHECK_EQ(STATE_RECOGNIZING, state_); } listener()->OnRecognitionResults(session_id(), event_args.engine_results); return STATE_RECOGNIZING; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { const std::vector& results = event_args.engine_results; std::vector::const_iterator i = results.begin(); bool provisional_results_pending = false; bool results_are_empty = true; for (; i != results.end(); ++i) { const blink::mojom::SpeechRecognitionResultPtr& result = *i; if (result->is_provisional) { DCHECK(provisional_results_); provisional_results_pending = true; } else if (results_are_empty) { results_are_empty = result->hypotheses.empty(); } } if (provisional_results_pending) { listener()->OnRecognitionResults(session_id(), results); // We don't end the recognition if a provisional result is received in // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will // end the recognition. return state_; } recognition_engine_->EndRecognition(); if (!results_are_empty) { // We could receive an empty result (which we won't propagate further) // in the following (continuous) scenario: // 1. The caller start pushing audio and receives some results; // 2. A |StopAudioCapture| is issued later; // 3. The final audio frames captured in the interval ]1,2] do not lead to // any result (nor any error); // 4. The speech recognition engine, therefore, emits an empty result to // notify that the recognition is ended with no error, yet neither any // further result. listener()->OnRecognitionResults(session_id(), results); } listener()->OnRecognitionEnd(session_id()); return STATE_ENDED; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { return state_; // Just keep the current state. } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { NOTREACHED() << "Unfeasible event " << event_args.event << " in state " << state_; return state_; } void SpeechRecognizerImpl::CloseAudioCapturerSource() { DCHECK(IsCapturingAudio()); DVLOG(1) << "SpeechRecognizerImpl closing audio capturer source."; GetAudioCapturerSource()->Stop(); audio_capturer_source_ = nullptr; } int SpeechRecognizerImpl::GetElapsedTimeMs() const { return (num_samples_recorded_ * 1000) / kAudioSampleRate; } void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected) { // Calculate the input volume to display in the UI, smoothing towards the // new level. // TODO(primiano): Do we really need all this floating point arith here? // Perhaps it might be quite expensive on mobile. float level = (rms - kAudioMeterMinDb) / (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : kDownSmoothingFactor; audio_level_ += (level - audio_level_) * smoothing_factor; float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); noise_level = std::min(std::max(0.0f, noise_level), kAudioMeterRangeMaxUnclipped); listener()->OnAudioLevelsChange( session_id(), clip_detected ? 1.0f : audio_level_, noise_level); } void SpeechRecognizerImpl::SetAudioEnvironmentForTesting( media::AudioSystem* audio_system, media::AudioCapturerSource* audio_capturer_source) { audio_system_for_tests_ = audio_system; audio_capturer_source_for_tests_ = audio_capturer_source; } media::AudioSystem* SpeechRecognizerImpl::GetAudioSystem() { return audio_system_for_tests_ ? audio_system_for_tests_ : audio_system_; } void SpeechRecognizerImpl::CreateAudioCapturerSource() { service_manager::Connector* connector = ServiceManagerContext::GetConnectorForIOThread(); if (connector) { audio_capturer_source_ = audio::CreateInputDevice( connector->Clone(), device_id_, MediaInternals::GetInstance()->CreateMojoAudioLog( media::AudioLogFactory::AUDIO_INPUT_CONTROLLER, 0 /* component_id */)); } } media::AudioCapturerSource* SpeechRecognizerImpl::GetAudioCapturerSource() { return audio_capturer_source_for_tests_ ? audio_capturer_source_for_tests_ : audio_capturer_source_.get(); } SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) : event(event_value), audio_data(nullptr), engine_error(blink::mojom::SpeechRecognitionErrorCode::kNone, blink::mojom::SpeechAudioErrorDetails::kNone) {} SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(const FSMEventArgs& other) : event(other.event), audio_data(other.audio_data), engine_error(other.engine_error) { engine_results = mojo::Clone(other.engine_results); } SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {} } // namespace content