XIAO ESP32S3 Sense: Static noise when streaming PCM audio chunks from REST API to i2s

Hi

I’m having an issue where streaming PCM audio data as chunks received as the response from an API call is causing high amounts of static noise which I’m assuming is between each chunk, as I’m still able to hear the audio played in the “background”. If possible I also want to have a better understanding of when the stream is finished also somehow, but not sure how to do it yet.

I’ve tried streaming the entire response after all chunks have been generated from the API and it works nicely playing them through i2s.

I’ve also experimented with various buffer sizes, and dma_buf_count and dma_buf_length.

Here is my code for reference:

#include <driver/i2s.h>
#include <WiFi.h>
#include <HTTPClient.h>

// WiFi credentials
const char *ssid = "Internett";
const char *password = "Simato21";

#define SAMPLE_RATE 8000U
#define SAMPLE_BITS 16
#define MAX_RECORD_TIME 60  // Maximum record time in seconds
#define BUTTON_PIN 4        // Button connected to pin 4
#define WAV_HEADER_SIZE 44

#define I2S_DOUT 9
#define I2S_BCLK 8
#define I2S_LRC 7


// Adjust the buffer size to accommodate maximum recording time
#define MAX_AUDIO_BUFFER_SIZE (SAMPLE_RATE * SAMPLE_BITS / 8 * MAX_RECORD_TIME + WAV_HEADER_SIZE)

HTTPClient http;
uint8_t *audioBuffer = nullptr;
bool isRecording = false;
bool sendPostFlag = false;
bool requestSwitchToRxMode = false;
unsigned long lastDebounceTime = 0;
const unsigned long debounceDelay = 100;
size_t audioBufferIndex = 0;
QueueHandle_t xQueue;

unsigned long inactivityTimeout = 10000; // in milliseconds
unsigned long lastDataTime = millis();

// Function prototypes
void setup_wifi();
void setup_button();
void setup_i2s(i2s_mode_t mode);
void switch_i2s_mode(i2s_mode_t mode);
void IRAM_ATTR button_isr_handler();
void record_audio_task(void *param);
void send_audio_data(uint8_t *data, size_t length);
void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate);

void setup() {
  Serial.begin(115200);
  while (!Serial)
    ;

  audioBuffer = (uint8_t *)ps_malloc(MAX_AUDIO_BUFFER_SIZE);
  if (audioBuffer == nullptr) {
    Serial.println("Failed to allocate memory for audio buffer");
    return;
  }

  setup_wifi();
  setup_button();
  setup_i2s_tx();
  setup_i2s_rx();

  xQueue = xQueueCreate(10, sizeof(bool));
  xTaskCreate(record_audio_task, "RecordAudioTask", 16384, NULL, 1, NULL);
}

void loop() {
}

void setup_wifi() {
  WiFi.begin(ssid, password);
  while (WiFi.status() != WL_CONNECTED) {
    delay(500);
    Serial.println("Connecting to WiFi...");
  }
  Serial.println("Connected to WiFi");
}

void setup_button() {
  pinMode(BUTTON_PIN, INPUT_PULLUP);
  attachInterrupt(digitalPinToInterrupt(BUTTON_PIN), button_isr_handler, CHANGE);
}

void setup_i2s_tx() {
  i2s_config_t i2s_config = {
      .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX ),
      .sample_rate = 16000,
      .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
      .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
      .communication_format = I2S_COMM_FORMAT_STAND_PCM_SHORT,
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
      .dma_buf_count = 50,
      .dma_buf_len = 1024,
      .use_apll = true,
      .tx_desc_auto_clear = true,
      .fixed_mclk = 0
    };

  i2s_pin_config_t pin_config = {
      .bck_io_num = I2S_BCLK,
      .ws_io_num = I2S_LRC,
      .data_out_num = I2S_DOUT,
      .data_in_num = -1  // Not used
    };
  

  i2s_driver_install((i2s_port_t)1, &i2s_config, 0, NULL);
  i2s_set_pin((i2s_port_t)1, &pin_config);
  i2s_zero_dma_buffer((i2s_port_t)1);
}

void setup_i2s_rx() {
  i2s_config_t i2s_config = {
      .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_PDM| I2S_MODE_RX),
      .sample_rate = SAMPLE_RATE,
      .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
      .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
      .communication_format = I2S_COMM_FORMAT_STAND_I2S,
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
      .dma_buf_count = 8,
      .dma_buf_len = 1024,
      .use_apll = false,
      .tx_desc_auto_clear = true,  // Only applicable in TX mode
      .fixed_mclk = 0
    };

  i2s_pin_config_t pin_config = {
      .bck_io_num = -1,    // Not used
      .ws_io_num = 42,     // IIS_LCLK for microphone
      .data_out_num = -1,  // Not used
      .data_in_num = 41    // IIS_DOUT for microphone
    };

  // Uninstall the existing driver before setting a new configuration
  i2s_driver_install((i2s_port_t)0, &i2s_config, 0, NULL);
  i2s_set_pin((i2s_port_t)0, &pin_config);
  i2s_zero_dma_buffer((i2s_port_t)0);
}


void IRAM_ATTR button_isr_handler() {
  unsigned long interruptTime = millis();
  if (interruptTime - lastDebounceTime > debounceDelay) {
    bool currentButtonState = digitalRead(BUTTON_PIN) == LOW;
    if (currentButtonState != isRecording) {
      isRecording = currentButtonState;
      lastDebounceTime = interruptTime;

      if (isRecording) {
        requestSwitchToRxMode = true;  // Request to switch to RX mode
      }
      xQueueSendFromISR(xQueue, &isRecording, NULL);
    }
  }
}

void record_audio_task(void *param) {
  bool shouldRecord = false;
  bool currentlyRecording = false;
  Serial.println("Record audio task started.");

  while (true) {
    // Check for recording state updates
    while (xQueueReceive(xQueue, &shouldRecord, 0) == pdTRUE) {
      if (shouldRecord && !currentlyRecording) {
        currentlyRecording = true;
        Serial.println("Starting recording...");
        audioBufferIndex = WAV_HEADER_SIZE;  // Reset index for new recording
      } else if (!shouldRecord && currentlyRecording) {
        currentlyRecording = false;
        Serial.println("Stopping recording.");

        // Update WAV header and prepare to send data
        generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
        sendPostFlag = true;
      }
    }

    if (currentlyRecording) {
      size_t bytesRead = 0;
      TickType_t i2sReadTimeoutTicks = 1;  // 1 tick timeout for minimal blocking

      // Attempt to read audio data from I2S with minimal blocking
      esp_err_t result = i2s_read((i2s_port_t)0, audioBuffer + audioBufferIndex, MAX_AUDIO_BUFFER_SIZE - audioBufferIndex, &bytesRead, i2sReadTimeoutTicks);

      if (result == ESP_OK && bytesRead > 0) {
        audioBufferIndex += bytesRead;
        // Check for buffer overflow
        if (audioBufferIndex >= MAX_AUDIO_BUFFER_SIZE) {
          currentlyRecording = false;
          Serial.println("Max recording length reached, stopping recording.");
          // Update WAV header with actual data siz e and prepare to send data
          generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
          sendPostFlag = true;  // Set flag to indicate data is ready to be sent
        }
      }

      // Immediately check the queue again to see if recording should stop
      if (xQueueReceive(xQueue, &shouldRecord, 0) == pdTRUE && !shouldRecord) {
        currentlyRecording = false;
        Serial.println("Stopping recording via queue message.");
        generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
        sendPostFlag = true;  // Prepare to send data
      }

      // Use a short delay to yield to other tasks
      vTaskDelay(1 / portTICK_PERIOD_MS);
    } else {
      // If not recording, check less frequently
      vTaskDelay(10 / portTICK_PERIOD_MS);
    }

    // Check if the audio data is ready to be sent
    if (sendPostFlag) {
      send_audio_data(audioBuffer, audioBufferIndex);  // Send the recorded audio data
      audioBufferIndex = WAV_HEADER_SIZE;              // Reset index for the next recording
      sendPostFlag = false;                            // Reset the flag
    }
  }
}

void send_audio_data(uint8_t *data, size_t length) {
  if (WiFi.status() != WL_CONNECTED) {
    Serial.println("Not connected to WiFi");
    return;
  }

  HTTPClient http;
  int httpResponseCode;

  // Only Request: Send audio data to /api/complete
  http.begin("http://192.168.1.137:8000/api/complete"); // Adjusted endpoint
  http.addHeader("Content-Type", "audio/wav");
  http.setTimeout(30000); // Long timeout for potential audio processing

  Serial.println("Sending audio data to /api/complete...");
  httpResponseCode = http.POST(data, length);
  if (httpResponseCode != 200) {
    Serial.print("Error on sending POST to /api/complete: ");
    Serial.println(httpResponseCode);
    http.end();
    return;
  }

  // Stream the audio response directly
  WiFiClient *stream = http.getStreamPtr();
  uint8_t *buffer = (uint8_t *)ps_malloc(1024); // Buffer size for audio data
  if (buffer == nullptr) {
    Serial.println("Failed to allocate memory for buffer");
    http.end();
    return;
  }
  memset(buffer, 0, 1024);

  while (http.connected()) {
    if (stream->available()) {
      int bytesRead = stream->readBytes(buffer, 1024);
      if (bytesRead > 0) {
        size_t bytes_written = 0;
        esp_err_t result = i2s_write((i2s_port_t)1, buffer, bytesRead, &bytes_written, portMAX_DELAY);
        if (result != ESP_OK || bytes_written < bytesRead) {
          Serial.println("Error writing to I2S or partial write occurred");
        }
        memset(buffer, 0, 1024);
      }
    }  
  }

  http.end(); // End the HTTP connection
  free(buffer); // Free the dynamically allocated buffer
}



void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate) {
  uint32_t file_size = wav_size + WAV_HEADER_SIZE - 8;
  uint32_t byte_rate = SAMPLE_RATE * SAMPLE_BITS / 8;
  const uint8_t set_wav_header[] = {
    'R', 'I', 'F', 'F',                                                   // ChunkID
    file_size, file_size >> 8, file_size >> 16, file_size >> 24,          // ChunkSize
    'W', 'A', 'V', 'E',                                                   // Format
    'f', 'm', 't', ' ',                                                   // Subchunk1ID
    0x10, 0x00, 0x00, 0x00,                                               // Subchunk1Size (16 for PCM)
    0x01, 0x00,                                                           // AudioFormat (1 for PCM)
    0x01, 0x00,                                                           // NumChannels (1 channel)
    sample_rate, sample_rate >> 8, sample_rate >> 16, sample_rate >> 24,  // SampleRate
    byte_rate, byte_rate >> 8, byte_rate >> 16, byte_rate >> 24,          // ByteRate
    0x02, 0x00,                                                           // BlockAlign
    0x10, 0x00,                                                           // BitsPerSample (16 bits)
    'd', 'a', 't', 'a',                                                   // Subchunk2ID
    wav_size, wav_size >> 8, wav_size >> 16, wav_size >> 24,              // Subchunk2Size
  };
  memcpy(wav_header, set_wav_header, sizeof(set_wav_header));
}

Hi there,
WOW, You have come a mile or three :smile: on the Code for sure… LOOKS Great!
If I were to make any comment it would be the ISR’s routines look a little long.
When ever I have had latency or drops it always came down to how fast the ISR’s where.
Also
Is the static always present Like the bias is off or, just in between, chunks?
Very cool
HTH
GL :slight_smile: PJ :v:

1 Like

Your consistency is inspiring @PJ_Glasso Thanks for helping out with your knowledge!

I tried to update the ISR like this, but did not resolve the playback issue…

void IRAM_ATTR button_isr_handler() {
  BaseType_t xHigherPriorityTaskWoken = pdFALSE;
  isRecording = !isRecording;  // Toggle recording state
  xSemaphoreGiveFromISR(recordingSemaphore, &xHigherPriorityTaskWoken);
  if (xHigherPriorityTaskWoken) {
    portYIELD_FROM_ISR();  // Yield to the unblocked task if it has a higher priority
  }
}

The static is always present. Here is a short video snippet of the audio response, very choppy, and the watchdog is triggered quickly after recording ended, but also only after audio is not being recieved. Not sure how to check for that…: https://youtu.be/Ay4rstNGtYc

I also created a separate task to see if that would resolve the issue, but no change.

Under is my most recent code:

#include <driver/i2s.h>
#include <WiFi.h>
#include <HTTPClient.h>

// WiFi credentials
const char *ssid = "Internett";
const char *password = "Simato21";

#define SAMPLE_RATE 8000U
#define SAMPLE_BITS 16
#define MAX_RECORD_TIME 60  // Maximum record time in seconds
#define BUTTON_PIN 4        // Button connected to pin 4
#define WAV_HEADER_SIZE 44

#define I2S_DOUT 9
#define I2S_BCLK 8
#define I2S_LRC 7


// Adjust the buffer size to accommodate maximum recording time
#define MAX_AUDIO_BUFFER_SIZE (SAMPLE_RATE * SAMPLE_BITS / 8 * MAX_RECORD_TIME + WAV_HEADER_SIZE)

HTTPClient http;
uint8_t *audioBuffer = nullptr;
volatile bool isRecording = false;
bool sendPostFlag = false;
bool requestSwitchToRxMode = false;
unsigned long lastDebounceTime = 0;
const unsigned long debounceDelay = 100;
size_t audioBufferIndex = 0;
QueueHandle_t xQueue;
SemaphoreHandle_t recordingSemaphore;
SemaphoreHandle_t xSemaphore = NULL;
uint8_t *audioData = NULL;
size_t audioDataLength =  0;

unsigned long inactivityTimeout = 10000; // in milliseconds
unsigned long lastDataTime = millis();

// Function prototypes
void setup_wifi();
void setup_button();
void setup_i2s(i2s_mode_t mode);
void switch_i2s_mode(i2s_mode_t mode);
void IRAM_ATTR button_isr_handler();
void recordAudioTask(void *param);
void send_audio_data(uint8_t *data, size_t length);
void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate);

void setup() {
  Serial.begin(115200);
  while (!Serial)
    ;

  audioBuffer = (uint8_t *)ps_malloc(MAX_AUDIO_BUFFER_SIZE);
  if (audioBuffer == nullptr) {
    Serial.println("Failed to allocate memory for audio buffer");
    return;
  }

  setup_wifi();
  setup_button();
  setup_i2s_tx();
  setup_i2s_rx();

  recordingSemaphore = xSemaphoreCreateBinary();
  xSemaphore = xSemaphoreCreateBinary();
  xTaskCreate(recordAudioTask, "RecordAudioTask", 16384, NULL, 1, NULL);
  xTaskCreate(audioPlaybackTask, "AudioPlaybackTask",  16384, NULL,  2, NULL);
}

void loop() {
}

void setup_wifi() {
  WiFi.begin(ssid, password);
  while (WiFi.status() != WL_CONNECTED) {
    delay(500);
    Serial.println("Connecting to WiFi...");
  }
  Serial.println("Connected to WiFi");
}

void setup_button() {
  pinMode(BUTTON_PIN, INPUT_PULLUP);
  attachInterrupt(digitalPinToInterrupt(BUTTON_PIN), button_isr_handler, CHANGE);
}

void setup_i2s_tx() {
  i2s_config_t i2s_config = {
      .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX ),
      .sample_rate = 16000,
      .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
      .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
      .communication_format = I2S_COMM_FORMAT_STAND_PCM_SHORT,
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
      .dma_buf_count = 10,
      .dma_buf_len = 1024,
      .use_apll = true,
      .tx_desc_auto_clear = true,  // Only applicable in TX mode
      .fixed_mclk = 0
    };

  i2s_pin_config_t pin_config = {
      .bck_io_num = I2S_BCLK,
      .ws_io_num = I2S_LRC,
      .data_out_num = I2S_DOUT,
      .data_in_num = -1  // Not used
    };
  

  i2s_driver_install((i2s_port_t)1, &i2s_config, 0, NULL);
  i2s_set_pin((i2s_port_t)1, &pin_config);
  i2s_zero_dma_buffer((i2s_port_t)1);
}

void setup_i2s_rx() {
  i2s_config_t i2s_config = {
      .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_PDM| I2S_MODE_RX),
      .sample_rate = SAMPLE_RATE,
      .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
      .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
      .communication_format = I2S_COMM_FORMAT_STAND_I2S,
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
      .dma_buf_count = 8,
      .dma_buf_len = 1024,
      .use_apll = false,
      .tx_desc_auto_clear = true,  // Only applicable in TX mode
      .fixed_mclk = 0
    };

  i2s_pin_config_t pin_config = {
      .bck_io_num = -1,    // Not used
      .ws_io_num = 42,     // IIS_LCLK for microphone
      .data_out_num = -1,  // Not used
      .data_in_num = 41    // IIS_DOUT for microphone
    };

  // Uninstall the existing driver before setting a new configuration
  i2s_driver_install((i2s_port_t)0, &i2s_config, 0, NULL);
  i2s_set_pin((i2s_port_t)0, &pin_config);
  i2s_zero_dma_buffer((i2s_port_t)0);
}

void IRAM_ATTR button_isr_handler() {
  BaseType_t xHigherPriorityTaskWoken = pdFALSE;
  isRecording = !isRecording;  // Toggle recording state
  xSemaphoreGiveFromISR(recordingSemaphore, &xHigherPriorityTaskWoken);
  if (xHigherPriorityTaskWoken) {
    portYIELD_FROM_ISR();  // Yield to the unblocked task if it has a higher priority
  }
}

void recordAudioTask(void *param) {
  bool currentlyRecording = false;
  Serial.println("Record audio task started.");

  while (true) {
    // Wait for a signal to start or stop recording
    if (xSemaphoreTake(recordingSemaphore, portMAX_DELAY) == pdTRUE) {
      // Toggle recording state based on the semaphore
      currentlyRecording = !currentlyRecording;
      if (currentlyRecording) {
        Serial.println("Starting recording...");
        audioBufferIndex = WAV_HEADER_SIZE;  // Reset index for new recording
      } else {
        Serial.println("Stopping recording.");
        generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
        sendPostFlag = true;  // Indicate that audio data is ready to be sent
      }
    }

    if (currentlyRecording) {
      size_t bytesRead = 0;
      TickType_t i2sReadTimeoutTicks = 1;  // Minimal blocking

      // Attempt to read audio data from I2S
      esp_err_t result = i2s_read((i2s_port_t)0, audioBuffer + audioBufferIndex, MAX_AUDIO_BUFFER_SIZE - audioBufferIndex, &bytesRead, i2sReadTimeoutTicks);

      if (result == ESP_OK && bytesRead > 0) {
        audioBufferIndex += bytesRead;
        // Check for buffer overflow
        if (audioBufferIndex >= MAX_AUDIO_BUFFER_SIZE) {
          currentlyRecording = false;
          Serial.println("Max recording length reached, stopping recording.");
          generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
          sendPostFlag = true;  // Prepare to send data
        }
      }

      // Short delay to yield to other tasks
      vTaskDelay(1 / portTICK_PERIOD_MS);
    } else {
      // If not recording, check less frequently
      vTaskDelay(100 / portTICK_PERIOD_MS);
    }

    // Check if the audio data is ready to be sent
    if (sendPostFlag) {
      send_audio_data(audioBuffer, audioBufferIndex);  // Send the recorded audio data
      audioBufferIndex = WAV_HEADER_SIZE;              // Reset index for the next recording
      sendPostFlag = false;                            // Reset the flag
    }
  }
}


void send_audio_data(uint8_t *data, size_t length) {
  if (WiFi.status() == WL_CONNECTED) {
    HTTPClient http;

    // First Request: Send audio data to /api/audio
    http.begin("http://192.168.1.137:8000/api/text");
    http.addHeader("Content-Type", "audio/wav");
    http.setTimeout(30000); // Long timeout for potential audio processing

    Serial.println("Sending audio data to /api/audio...");
    int httpResponseCode = http.POST(data, length);

    if (httpResponseCode >  0) {
      String responseText = http.getString(); // Get the text response
      Serial.println(responseText);
      http.end(); // End the first HTTP connection

      // Second Request: Use the responseText as the X-Input-Text header
      http.begin("http://192.168.1.137:8000/api/file-text-audio");
      http.addHeader("X-Input-Text", responseText);
      http.setTimeout(30000); // Long timeout for streaming

      Serial.println("Requesting audio stream with text response...");
      httpResponseCode = http.POST(""); // The body can be empty or whatever is expected by your API

      if (httpResponseCode ==  200) {
        WiFiClient *stream = http.getStreamPtr();
        uint8_t buffer[2048];
        memset(buffer,  0, sizeof(buffer));

        while (http.connected()) {
          // Check if data is available to read
          if (stream->available()) {
            int bytesRead = stream->readBytes(buffer, sizeof(buffer));
            if (bytesRead >  0) {
              // Allocate memory for the audio data and copy the buffer content
              uint8_t *audioDataCopy = (uint8_t *)malloc(bytesRead);
              memcpy(audioDataCopy, buffer, bytesRead);

              // Update the global variables with the new audio data
              audioData = audioDataCopy;
              audioDataLength = bytesRead;

              // Give the semaphore to signal the audio playback task
              xSemaphoreGive(xSemaphore);
            }
          }
        }

        http.end(); // End the HTTP connection
      } else {
        Serial.print("Error on sending POST to /api/file-text-audio: ");
        Serial.println(httpResponseCode);
      }
    } else {
      Serial.print("Error on sending POST to /api/audio: ");
      Serial.println(httpResponseCode);
    }
  } else {
    Serial.println("Not connected to WiFi");
  }
}

void audioPlaybackTask(void *parameter) {
  while (1) {
    // Wait for the semaphore to be given, indicating new audio data is available
    if (xSemaphoreTake(xSemaphore, portMAX_DELAY) == pdTRUE) {
      if (audioData != NULL && audioDataLength >  0) {
        // Playback logic here
        // This is a simplified example. You might need to adjust it based on your specific audio format and requirements.
        size_t bytes_written =  0;
        esp_err_t result = i2s_write((i2s_port_t)1, audioData, audioDataLength, &bytes_written, portMAX_DELAY);
        if (result != ESP_OK || bytes_written < audioDataLength) {
          Serial.println("Error writing to I2S or partial write occurred");
        }
        // Clear the buffer after processing
        free(audioData);
        audioData = NULL;
        audioDataLength =  0;
      }
    }
  }
}

void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate) {
  uint32_t file_size = wav_size + WAV_HEADER_SIZE - 8;
  uint32_t byte_rate = SAMPLE_RATE * SAMPLE_BITS / 8;
  const uint8_t set_wav_header[] = {
    'R', 'I', 'F', 'F',                                                   // ChunkID
    file_size, file_size >> 8, file_size >> 16, file_size >> 24,          // ChunkSize
    'W', 'A', 'V', 'E',                                                   // Format
    'f', 'm', 't', ' ',                                                   // Subchunk1ID
    0x10, 0x00, 0x00, 0x00,                                               // Subchunk1Size (16 for PCM)
    0x01, 0x00,                                                           // AudioFormat (1 for PCM)
    0x01, 0x00,                                                           // NumChannels (1 channel)
    sample_rate, sample_rate >> 8, sample_rate >> 16, sample_rate >> 24,  // SampleRate
    byte_rate, byte_rate >> 8, byte_rate >> 16, byte_rate >> 24,          // ByteRate
    0x02, 0x00,                                                           // BlockAlign
    0x10, 0x00,                                                           // BitsPerSample (16 bits)
    'd', 'a', 't', 'a',                                                   // Subchunk2ID
    wav_size, wav_size >> 8, wav_size >> 16, wav_size >> 24,              // Subchunk2Size
  };
  memcpy(wav_header, set_wav_header, sizeof(set_wav_header));
}

Update: Apparently the endpoint i was requesting audio from supported PCM (24khz, 16Bit) and after changing that it works a lot better! Now there is some issue of static sound suddenly appearing, and clicking while the audio plays smoothly (perhaps in between chunks). The issue is solved now, and it might seem like the FFMPEG conversion from mp3 to pcm I was doing on the server was the issue.

Here is the code for anyone looking at how I solved it. Still some issues as mentioned.

#include <driver/i2s.h>
#include <WiFi.h>
#include <HTTPClient.h>

// WiFi credentials
const char *ssid = "Internett";
const char *password = "Simato21";

#define SAMPLE_RATE 8000U
#define SAMPLE_BITS 16
#define MAX_RECORD_TIME 60  // Maximum record time in seconds
#define BUTTON_PIN 4        // Button connected to pin 4
#define WAV_HEADER_SIZE 44

#define I2S_DOUT 9
#define I2S_BCLK 8
#define I2S_LRC 7


// Adjust the buffer size to accommodate maximum recording time
#define MAX_AUDIO_BUFFER_SIZE (SAMPLE_RATE * SAMPLE_BITS / 8 * MAX_RECORD_TIME + WAV_HEADER_SIZE)

HTTPClient http;
uint8_t *audioBuffer = nullptr;
bool isRecording = false;
bool sendPostFlag = false;
bool requestSwitchToRxMode = false;
unsigned long lastDebounceTime = 0;
const unsigned long debounceDelay = 100;
size_t audioBufferIndex = 0;
QueueHandle_t xQueue;

unsigned long inactivityTimeout = 10000; // in milliseconds
unsigned long lastDataTime = millis();

// Function prototypes
void setup_wifi();
void setup_button();
void setup_i2s(i2s_mode_t mode);
void switch_i2s_mode(i2s_mode_t mode);
void IRAM_ATTR button_isr_handler();
void record_audio_task(void *param);
void send_audio_data(uint8_t *data, size_t length);
void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate);

void setup() {
  Serial.begin(115200);
  while (!Serial)
    ;

  audioBuffer = (uint8_t *)ps_malloc(MAX_AUDIO_BUFFER_SIZE);
  if (audioBuffer == nullptr) {
    Serial.println("Failed to allocate memory for audio buffer");
    return;
  }

  setup_wifi();
  setup_button();
  setup_i2s_tx();
  setup_i2s_rx();

  xQueue = xQueueCreate(10, sizeof(bool));
  xTaskCreate(record_audio_task, "RecordAudioTask", 16384, NULL, 1, NULL);
}

void loop() {
}

void setup_wifi() {
  WiFi.begin(ssid, password);
  while (WiFi.status() != WL_CONNECTED) {
    delay(500);
    Serial.println("Connecting to WiFi...");
  }
  Serial.println("Connected to WiFi");
}

void setup_button() {
  pinMode(BUTTON_PIN, INPUT_PULLUP);
  attachInterrupt(digitalPinToInterrupt(BUTTON_PIN), button_isr_handler, CHANGE);
}

void setup_i2s_tx() {
  i2s_config_t i2s_config = {
      .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX ),
      .sample_rate = 24000,
      .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
      .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
      .communication_format = I2S_COMM_FORMAT_STAND_PCM_LONG,
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
      .dma_buf_count = 50,
      .dma_buf_len = 512,
      .use_apll = true,
      .tx_desc_auto_clear = true,  // Only applicable in TX mode
      .fixed_mclk = 0
    };

  i2s_pin_config_t pin_config = {
      .bck_io_num = I2S_BCLK,
      .ws_io_num = I2S_LRC,
      .data_out_num = I2S_DOUT,
      .data_in_num = -1  // Not used
    };
  

  i2s_driver_install((i2s_port_t)1, &i2s_config, 0, NULL);
  i2s_set_pin((i2s_port_t)1, &pin_config);
  i2s_zero_dma_buffer((i2s_port_t)1);
}

void setup_i2s_rx() {
  i2s_config_t i2s_config = {
      .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_PDM| I2S_MODE_RX),
      .sample_rate = SAMPLE_RATE,
      .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
      .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
      .communication_format = I2S_COMM_FORMAT_STAND_I2S,
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
      .dma_buf_count = 8,
      .dma_buf_len = 1024,
      .use_apll = false,
      .tx_desc_auto_clear = true,  // Only applicable in TX mode
      .fixed_mclk = 0
    };

  i2s_pin_config_t pin_config = {
      .bck_io_num = -1,    // Not used
      .ws_io_num = 42,     // IIS_LCLK for microphone
      .data_out_num = -1,  // Not used
      .data_in_num = 41    // IIS_DOUT for microphone
    };

  // Uninstall the existing driver before setting a new configuration
  i2s_driver_install((i2s_port_t)0, &i2s_config, 0, NULL);
  i2s_set_pin((i2s_port_t)0, &pin_config);
  i2s_zero_dma_buffer((i2s_port_t)0);
}


void IRAM_ATTR button_isr_handler() {
  unsigned long interruptTime = millis();
  if (interruptTime - lastDebounceTime > debounceDelay) {
    bool currentButtonState = digitalRead(BUTTON_PIN) == LOW;
    if (currentButtonState != isRecording) {
      isRecording = currentButtonState;
      lastDebounceTime = interruptTime;

      if (isRecording) {
        requestSwitchToRxMode = true;  // Request to switch to RX mode
      }
      xQueueSendFromISR(xQueue, &isRecording, NULL);
    }
  }
}

void record_audio_task(void *param) {
  bool shouldRecord = false;
  bool currentlyRecording = false;
  Serial.println("Record audio task started.");

  while (true) {
    // Check for recording state updates
    while (xQueueReceive(xQueue, &shouldRecord, 0) == pdTRUE) {
      if (shouldRecord && !currentlyRecording) {
        currentlyRecording = true;
        Serial.println("Starting recording...");
        audioBufferIndex = WAV_HEADER_SIZE;  // Reset index for new recording
      } else if (!shouldRecord && currentlyRecording) {
        currentlyRecording = false;
        Serial.println("Stopping recording.");

        // Update WAV header and prepare to send data
        generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
        sendPostFlag = true;
      }
    }

    if (currentlyRecording) {
      size_t bytesRead = 0;
      TickType_t i2sReadTimeoutTicks = 1;  // 1 tick timeout for minimal blocking

      // Attempt to read audio data from I2S with minimal blocking
      esp_err_t result = i2s_read((i2s_port_t)0, audioBuffer + audioBufferIndex, MAX_AUDIO_BUFFER_SIZE - audioBufferIndex, &bytesRead, i2sReadTimeoutTicks);

      if (result == ESP_OK && bytesRead > 0) {
        audioBufferIndex += bytesRead;
        // Check for buffer overflow
        if (audioBufferIndex >= MAX_AUDIO_BUFFER_SIZE) {
          currentlyRecording = false;
          Serial.println("Max recording length reached, stopping recording.");
          // Update WAV header with actual data siz e and prepare to send data
          generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
          sendPostFlag = true;  // Set flag to indicate data is ready to be sent
        }
      }

      // Immediately check the queue again to see if recording should stop
      if (xQueueReceive(xQueue, &shouldRecord, 0) == pdTRUE && !shouldRecord) {
        currentlyRecording = false;
        Serial.println("Stopping recording via queue message.");
        generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
        sendPostFlag = true;  // Prepare to send data
      }

      // Use a short delay to yield to other tasks
      vTaskDelay(1 / portTICK_PERIOD_MS);
    } else {
      // If not recording, check less frequently
      vTaskDelay(10 / portTICK_PERIOD_MS);
    }

    // Check if the audio data is ready to be sent
    if (sendPostFlag) {
      send_audio_data(audioBuffer, audioBufferIndex);  // Send the recorded audio data
      audioBufferIndex = WAV_HEADER_SIZE;              // Reset index for the next recording
      sendPostFlag = false;                            // Reset the flag
    }
  }
}

void send_audio_data(uint8_t *data, size_t length) {
  if (WiFi.status() == WL_CONNECTED) {
    HTTPClient http;

    // First Request: Send audio data to /api/audio
    http.begin("http://192.168.1.137:8000/api/text");
    http.addHeader("Content-Type", "audio/wav");
    http.setTimeout(30000); // Long timeout for potential audio processing

    Serial.println("Sending audio data to /api/audio...");
    int httpResponseCode = http.POST(data, length);

    if (httpResponseCode > 0) {
      String responseText = http.getString(); // Get the text response
      Serial.println(responseText);
      http.end(); // End the first HTTP connection

      // Second Request: Use the responseText as the X-Input-Text header
      http.begin("http://192.168.1.137:8000/api/file-text-audio");
      http.addHeader("X-Input-Text", responseText);
      http.setTimeout(30000); // Long timeout for streaming

      Serial.println("Requesting audio stream with text response...");
      httpResponseCode = http.POST(""); // The body can be empty or whatever is expected by your API

      if (httpResponseCode == 200) {
        WiFiClient *stream = http.getStreamPtr();
        uint8_t buffer[8192];
        memset(buffer, 0, sizeof(buffer));

        while (http.connected()) {
        // Check if data is available to read
        if (stream->available()) {
          int bytesRead = stream->readBytes(buffer, sizeof(buffer));
          if (bytesRead > 0) {
            // Reset the last data time on receiving data
            lastDataTime = millis();

            Serial.println("Streaming audio...");
            size_t bytes_written = 0;
            esp_err_t result = i2s_write((i2s_port_t)1, buffer, bytesRead, &bytes_written, portMAX_DELAY);
            
            if (result != ESP_OK || bytes_written < bytesRead) {
              Serial.println("Error writing to I2S or partial write occurred");
            }

            // Clear the buffer after processing each chunk
            memset(buffer, 0, sizeof(buffer));
          }
        } 
        // else if (millis() - lastDataTime > inactivityTimeout) {
        //   // If no data has been received for the duration of the inactivity timeout, assume the stream has ended
        //   Serial.println("Stream ended due to inactivity timeout.");
        //   break;
        // }
      }

      http.end(); // End the HTTP connection
    } else {
      Serial.print("Error on sending POST to /api/file-text-audio: ");
      Serial.println(httpResponseCode);
    }
  } else {
    Serial.println("Not connected to WiFi");
  }
}
}


void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate) {
  uint32_t file_size = wav_size + WAV_HEADER_SIZE - 8;
  uint32_t byte_rate = SAMPLE_RATE * SAMPLE_BITS / 8;
  const uint8_t set_wav_header[] = {
    'R', 'I', 'F', 'F',                                                   // ChunkID
    file_size, file_size >> 8, file_size >> 16, file_size >> 24,          // ChunkSize
    'W', 'A', 'V', 'E',                                                   // Format
    'f', 'm', 't', ' ',                                                   // Subchunk1ID
    0x10, 0x00, 0x00, 0x00,                                               // Subchunk1Size (16 for PCM)
    0x01, 0x00,                                                           // AudioFormat (1 for PCM)
    0x01, 0x00,                                                           // NumChannels (1 channel)
    sample_rate, sample_rate >> 8, sample_rate >> 16, sample_rate >> 24,  // SampleRate
    byte_rate, byte_rate >> 8, byte_rate >> 16, byte_rate >> 24,          // ByteRate
    0x02, 0x00,                                                           // BlockAlign
    0x10, 0x00,                                                           // BitsPerSample (16 bits)
    'd', 'a', 't', 'a',                                                   // Subchunk2ID
    wav_size, wav_size >> 8, wav_size >> 16, wav_size >> 24,              // Subchunk2Size
  };
  memcpy(wav_header, set_wav_header, sizeof(set_wav_header));
}
1 Like