toy-hardware/managed_components/espressif__esp-sr/include/esp32p4/esp_vad.h

// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_VAD_H_
#define _ESP_VAD_H_

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms

/**
 * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
 * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
 */
typedef enum {
    VAD_MODE_0 = 0, // Normal
    VAD_MODE_1,     // Aggressive
    VAD_MODE_2,     // Very Aggressive
    VAD_MODE_3,     // Very Very Aggressive
    VAD_MODE_4      // Very Very Very Aggressive
} vad_mode_t;

typedef enum {
    VAD_SILENCE = 0,
    VAD_SPEECH = 1,
} vad_state_t;

typedef struct vad_trigger_tag {
    vad_state_t state;
    unsigned int min_speech_len;
    unsigned int noise_len;
    unsigned int min_noise_len;
    unsigned int speech_len;
} vad_trigger_t;

#define vad_MAX_LEN INT32_MAX - 1
/**
 * @brief Allocate wakenet trigger
 *
 * @param min_speech_len  Minimum frame number of speech duration
 * @param min_noise_len   Minimum frame number of noise duration
 *
 * @return Trigger pointer
 **/
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);

/**
 * @brief Free wakenet trigger
 **/
void vad_trigger_free(vad_trigger_t *trigger);

/**
 * @brief Reset wakenet trigger
 **/
void vad_trigger_reset(vad_trigger_t *trigger);

/**
 * @brief detect activaty voice by trigger
 **/
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);

typedef struct {
    vad_trigger_t *trigger;
    void *vad_inst;
    int sample_rate;
    int frame_size;
} vad_handle_with_trigger_t;

typedef vad_handle_with_trigger_t *vad_handle_t;

// typedef vad_handle_tag * vad_handle_t;

/**
 * @brief Creates an instance to the VAD structure.
 *
 * @param vad_mode          Sets the VAD operating mode.
 *
 * @return
 *         - NULL: Create failed
 *         - Others: The instance of VAD
 */
vad_handle_t vad_create(vad_mode_t vad_mode);

/**
 * @brief Creates an instance to the VAD structure.
 *
 * @param vad_mode          Sets the VAD operating mode.
 * @param sample_rate       Sample rate in Hz
 * @param one_frame_ms      Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
 * @param min_speech_ms     Minimum speech duration, unit is ms
 * @param min_noise_ms      Minimum noise duration, unit is ms
 * @return
 *         - NULL: Create failed
 *         - Others: The instance of VAD
 */
vad_handle_t vad_create_with_param(
    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);

/**
 * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
 *
 * @param handle            The instance of VAD.
 * @param data              An array of 16-bit signed audio samples.
 * @param sample_rate_hz    The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
 * @param one_frame_ms      The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
 * @return
 *         - VAD_SILENCE if no voice
 *         - VAD_SPEECH  if voice is detected
 *
 */
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);

/**
 * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
 *
 * @param handle            The instance of VAD.
 * @param data              An array of 16-bit signed audio samples.
 * @return
 *         - VAD_SILENCE if no voice
 *         - VAD_SPEECH  if voice is detected
 *
 */
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);

/**
 * @brief Reset trigger state as Silence
 *
 * @param handle            The instance of VAD.
 */
void vad_reset_trigger(vad_handle_t handle);

/**
 * @brief Free the VAD instance
 *
 * @param inst The instance of VAD.
 *
 * @return None
 *
 */
void vad_destroy(vad_handle_t inst);

/*
 * Programming Guide:
 *
 * @code{c}
 * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);     // Creates an instance to
 * the VAD structure.
 *
 * while (1) {
 *    //Use buffer to receive the audio data from MIC.
 *    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
 * }
 *
 * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
 *
 * @endcode
 */

#ifdef __cplusplus
}
#endif

#endif //_ESP_VAD_H_