I'm using an esp32s3 to playback an audio file in pcm format through i2s_write function.
The server I have set up converts audio from mp3 to pcm using ffmpeg, and forwards each chunk to the esp32s3 which then plays the chunks through i2s_write. It plays the audio, but it is like every chunk's playback is interrupted with static noise, although I can hear the audio is being played.
This is what I've tried:
1. Experimenting with different buffer sizes both in i2s config and the function.
2. Sending all chunks once they are converted from mp3 to PCM as one UintArray8 from the server works fine for playback on the I2S.
3. Reducing the sample rate from the API to see if conversion speed impacts playback speed.
4. I've tried with content type on the API response; audio/octet-stream, audio/mpeg and audio/wav
The chunks seems to be the issue. I'm also not able to detect when the stream ends, but that's another issue for later. I'm using one I2S port for recording and one for playback.
Here is my code, and any feedback of any kind would be useful!
Code: Select all
#include <driver/i2s.h>
#include <WiFi.h>
#include <HTTPClient.h>
// WiFi credentials
const char *ssid = "SSID";
const char *password = "PASSWORD";
#define SAMPLE_RATE 8000U
#define SAMPLE_BITS 16
#define MAX_RECORD_TIME 60 // Maximum record time in seconds
#define BUTTON_PIN 4 // Button connected to pin 4
#define WAV_HEADER_SIZE 44
#define I2S_DOUT 9
#define I2S_BCLK 8
#define I2S_LRC 7
// Adjust the buffer size to accommodate maximum recording time
#define MAX_AUDIO_BUFFER_SIZE (SAMPLE_RATE * SAMPLE_BITS / 8 * MAX_RECORD_TIME + WAV_HEADER_SIZE)
HTTPClient http;
uint8_t *audioBuffer = nullptr;
bool isRecording = false;
bool sendPostFlag = false;
bool requestSwitchToRxMode = false;
unsigned long lastDebounceTime = 0;
const unsigned long debounceDelay = 100;
size_t audioBufferIndex = 0;
QueueHandle_t xQueue;
unsigned long inactivityTimeout = 10000; // in milliseconds
unsigned long lastDataTime = millis();
// Function prototypes
void setup_wifi();
void setup_button();
void setup_i2s(i2s_mode_t mode);
void switch_i2s_mode(i2s_mode_t mode);
void IRAM_ATTR button_isr_handler();
void record_audio_task(void *param);
void send_audio_data(uint8_t *data, size_t length);
void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate);
void setup() {
Serial.begin(115200);
while (!Serial)
;
audioBuffer = (uint8_t *)ps_malloc(MAX_AUDIO_BUFFER_SIZE);
if (audioBuffer == nullptr) {
Serial.println("Failed to allocate memory for audio buffer");
return;
}
setup_wifi();
setup_button();
setup_i2s_tx();
setup_i2s_rx();
xQueue = xQueueCreate(10, sizeof(bool));
xTaskCreate(record_audio_task, "RecordAudioTask", 16384, NULL, 1, NULL);
}
void loop() {
}
void setup_wifi() {
WiFi.begin(ssid, password);
while (WiFi.status() != WL_CONNECTED) {
delay(500);
Serial.println("Connecting to WiFi...");
}
Serial.println("Connected to WiFi");
}
void setup_button() {
pinMode(BUTTON_PIN, INPUT_PULLUP);
attachInterrupt(digitalPinToInterrupt(BUTTON_PIN), button_isr_handler, CHANGE);
}
void setup_i2s_tx() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX ),
.sample_rate = 16000,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_STAND_PCM_SHORT,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 512,
.use_apll = false,
.tx_desc_auto_clear = false, // Only applicable in TX mode
.fixed_mclk = 0
};
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_BCLK,
.ws_io_num = I2S_LRC,
.data_out_num = I2S_DOUT,
.data_in_num = -1 // Not used
};
i2s_driver_install((i2s_port_t)1, &i2s_config, 0, NULL);
i2s_set_pin((i2s_port_t)1, &pin_config);
i2s_zero_dma_buffer((i2s_port_t)1);
}
void setup_i2s_rx() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_PDM| I2S_MODE_RX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_STAND_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 1024,
.use_apll = false,
.tx_desc_auto_clear = true, // Only applicable in TX mode
.fixed_mclk = 0
};
i2s_pin_config_t pin_config = {
.bck_io_num = -1, // Not used
.ws_io_num = 42, // IIS_LCLK for microphone
.data_out_num = -1, // Not used
.data_in_num = 41 // IIS_DOUT for microphone
};
// Uninstall the existing driver before setting a new configuration
i2s_driver_install((i2s_port_t)0, &i2s_config, 0, NULL);
i2s_set_pin((i2s_port_t)0, &pin_config);
i2s_zero_dma_buffer((i2s_port_t)0);
}
void IRAM_ATTR button_isr_handler() {
unsigned long interruptTime = millis();
if (interruptTime - lastDebounceTime > debounceDelay) {
bool currentButtonState = digitalRead(BUTTON_PIN) == LOW;
if (currentButtonState != isRecording) {
isRecording = currentButtonState;
lastDebounceTime = interruptTime;
if (isRecording) {
requestSwitchToRxMode = true; // Request to switch to RX mode
}
xQueueSendFromISR(xQueue, &isRecording, NULL);
}
}
}
void record_audio_task(void *param) {
bool shouldRecord = false;
bool currentlyRecording = false;
Serial.println("Record audio task started.");
while (true) {
// Check for recording state updates
while (xQueueReceive(xQueue, &shouldRecord, 0) == pdTRUE) {
if (shouldRecord && !currentlyRecording) {
currentlyRecording = true;
Serial.println("Starting recording...");
audioBufferIndex = WAV_HEADER_SIZE; // Reset index for new recording
} else if (!shouldRecord && currentlyRecording) {
currentlyRecording = false;
Serial.println("Stopping recording.");
// Update WAV header and prepare to send data
generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
sendPostFlag = true;
}
}
if (currentlyRecording) {
size_t bytesRead = 0;
TickType_t i2sReadTimeoutTicks = 1; // 1 tick timeout for minimal blocking
// Attempt to read audio data from I2S with minimal blocking
esp_err_t result = i2s_read((i2s_port_t)0, audioBuffer + audioBufferIndex, MAX_AUDIO_BUFFER_SIZE - audioBufferIndex, &bytesRead, i2sReadTimeoutTicks);
if (result == ESP_OK && bytesRead > 0) {
audioBufferIndex += bytesRead;
// Check for buffer overflow
if (audioBufferIndex >= MAX_AUDIO_BUFFER_SIZE) {
currentlyRecording = false;
Serial.println("Max recording length reached, stopping recording.");
// Update WAV header with actual data siz e and prepare to send data
generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
sendPostFlag = true; // Set flag to indicate data is ready to be sent
}
}
// Immediately check the queue again to see if recording should stop
if (xQueueReceive(xQueue, &shouldRecord, 0) == pdTRUE && !shouldRecord) {
currentlyRecording = false;
Serial.println("Stopping recording via queue message.");
generate_wav_header(audioBuffer, audioBufferIndex - WAV_HEADER_SIZE, SAMPLE_RATE);
sendPostFlag = true; // Prepare to send data
}
// Use a short delay to yield to other tasks
vTaskDelay(1 / portTICK_PERIOD_MS);
} else {
// If not recording, check less frequently
vTaskDelay(10 / portTICK_PERIOD_MS);
}
// Check if the audio data is ready to be sent
if (sendPostFlag) {
send_audio_data(audioBuffer, audioBufferIndex); // Send the recorded audio data
audioBufferIndex = WAV_HEADER_SIZE; // Reset index for the next recording
sendPostFlag = false; // Reset the flag
}
}
}
void send_audio_data(uint8_t *data, size_t length) {
if (WiFi.status() == WL_CONNECTED) {
HTTPClient http;
// First Request: Send audio data to /api/audio
http.begin("http://192.168.1.137:8000/api/text");
http.addHeader("Content-Type", "audio/wav");
http.setTimeout(30000); // Long timeout for potential audio processing
Serial.println("Sending audio data to /api/audio...");
int httpResponseCode = http.POST(data, length);
if (httpResponseCode > 0) {
String responseText = http.getString(); // Get the text response
Serial.println(responseText);
http.end(); // End the first HTTP connection
// Second Request: Use the responseText as the X-Input-Text header
http.begin("http://192.168.1.137:8000/api/file-text-audio");
http.addHeader("X-Input-Text", responseText);
http.setTimeout(30000); // Long timeout for streaming
Serial.println("Requesting audio stream with text response...");
httpResponseCode = http.POST(""); // The body can be empty or whatever is expected by your API
if (httpResponseCode == 200) {
WiFiClient *stream = http.getStreamPtr();
uint8_t buffer[256];
memset(buffer, 0, sizeof(buffer));
while (http.connected()) {
// Check if data is available to read
if (stream->available()) {
int bytesRead = stream->readBytes(buffer, sizeof(buffer));
if (bytesRead > 0) {
// Reset the last data time on receiving data
lastDataTime = millis();
Serial.println("Streaming audio...");
size_t bytes_written = 0;
esp_err_t result = i2s_write((i2s_port_t)1, buffer, bytesRead, &bytes_written, portMAX_DELAY);
if (result != ESP_OK || bytes_written < bytesRead) {
Serial.println("Error writing to I2S or partial write occurred");
}
// Clear the buffer after processing each chunk
memset(buffer, 0, sizeof(buffer));
}
} else if (millis() - lastDataTime > inactivityTimeout) {
// If no data has been received for the duration of the inactivity timeout, assume the stream has ended
Serial.println("Stream ended due to inactivity timeout.");
break;
}
}
http.end(); // End the HTTP connection
} else {
Serial.print("Error on sending POST to /api/file-text-audio: ");
Serial.println(httpResponseCode);
}
} else {
Serial.println("Not connected to WiFi");
}
}
}
void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate) {
uint32_t file_size = wav_size + WAV_HEADER_SIZE - 8;
uint32_t byte_rate = SAMPLE_RATE * SAMPLE_BITS / 8;
const uint8_t set_wav_header[] = {
'R', 'I', 'F', 'F', // ChunkID
file_size, file_size >> 8, file_size >> 16, file_size >> 24, // ChunkSize
'W', 'A', 'V', 'E', // Format
'f', 'm', 't', ' ', // Subchunk1ID
0x10, 0x00, 0x00, 0x00, // Subchunk1Size (16 for PCM)
0x01, 0x00, // AudioFormat (1 for PCM)
0x01, 0x00, // NumChannels (1 channel)
sample_rate, sample_rate >> 8, sample_rate >> 16, sample_rate >> 24, // SampleRate
byte_rate, byte_rate >> 8, byte_rate >> 16, byte_rate >> 24, // ByteRate
0x02, 0x00, // BlockAlign
0x10, 0x00, // BitsPerSample (16 bits)
'd', 'a', 't', 'a', // Subchunk2ID
wav_size, wav_size >> 8, wav_size >> 16, wav_size >> 24, // Subchunk2Size
};
memcpy(wav_header, set_wav_header, sizeof(set_wav_header));
}
Code: Select all
import { Handlers } from "$fresh/server.ts";
import { OpenAI } from "https://deno.land/x/openai@v4.26.1/mod.ts";
import { FfmpegClass } from "https://deno.land/x/deno_ffmpeg@v3.1.0/mod.ts";
const openaiApiKey = Deno.env.get("OPENAI_API_KEY");
if (!openaiApiKey) {
throw new Error("OPENAI_API_KEY environment variable is not set.");
}
const client = new OpenAI({ apiKey: openaiApiKey });
export const handler: Handlers = async (req: Request): Promise<Response> => {
const inputText = req.headers.get("X-Input-Text");
try {
if (!inputText) {
return new Response(JSON.stringify({ status: "error", message: "No input text provided" }), {
status: 400,
headers: {
"Content-Type": "application/json",
},
});
}
const ttsResponse = await client.audio.speech.create({
model: "tts-1",
voice: "nova",
// input: decodeURIComponent(inputText) + ".....",
input: "I obsess trying to deeply understand the general nature of ideas. Understanding their nature, I tend to have more ideas and do a better job caring for, protecting, and developing them. Ideas, by definition, are always fragile. If they were resolved, they wouldn't be ideas. They would be products that were ready to ship. I've come to learn that you have to make an extraordinary effort not to focus on the problems which are implicated with any new idea. These problems are known, they're quantifiable, and understood.",
response_format: "mp3",
});
if (ttsResponse.body) {
// Create a subprocess that invokes FFmpeg, reading from stdin and outputting PCM data
const ffmpegProcess = Deno.run({
cmd: [
"ffmpeg",
"-i", "pipe:0", // Input from stdin
"-c:a", "pcm_s16le", // Convert to PCM
"-ar", "16000", // Set sample rate to 16000 Hz
"-ac", "1", // Set channel count to 1 (mono)
"-f", "wav", // Set format to raw wav
"pipe:1", // Output to stdout
],
stdin: "piped",
stdout: "piped",
stderr: "piped",
});
// Stream audio data directly to FFmpeg's stdin
const reader = ttsResponse.body.getReader();
const writeStream = ffmpegProcess.stdin;
(async () => {
while (true) {
const { done, value } = await reader.read();
if (done) break;
await writeStream.write(value);
}
reader.releaseLock();
writeStream.close();
})();
// Instead of collecting all chunks and sending them at once, stream them as they become available
return new Response(new ReadableStream({
async start(controller) {
const { stdout } = ffmpegProcess;
const outputReader = stdout.readable.getReader();
while (true) {
const { done, value } = await outputReader.read();
if (done) break;
controller.enqueue(value);
}
controller.close();
ffmpegProcess.close();
}
}), {
headers: {
"Content-Type": "audio/wav",
},
});
} else {
return new Response(JSON.stringify({ status: "error", message: "No audio data received" }), {
status: 500,
headers: {
"Content-Type": "application/json",
},
});
}
} catch (error) {
console.error(error);
return new Response(
JSON.stringify({ status: "error", message: error.message }),
{
status: 500,
headers: {
"Content-Type": "application/json",
},
},
);
}
};