SPRESENSEで音声をFFT解析してLEDを光らせてみた

初めまして、ぽこりんたラボラトリです！
今回2025年SPRESENSE活用コンテストに応募し、SPRESENSEを用いてマイクから音声を受け取りFFT解析してその音量や周波数に応じてLEDテープを光らせてみました！

本題に入る前にいきなりですが、今回のコンテストではディスプレイもご提供いただいていたのですが、開発が間に合わず使用することができませんでした。
当初はSpresense SDKを用いて開発しようとしていたのですが、組み込み素人の私では解析が間に合わず、急遽Arduino環境による開発に移行したため、ディスプレイを使用するところまで到達できませんでした。
ちょっとプログラミングができるからと言って無茶はよくなかったですね...精進します。

では本題に入っていきます。
今回行ったのは前述のとおりですが、中身の処理は大別すると以下の二つに分けられます。

マイクから入力を受け取り、FFT解析を行う。
FFT解析の結果から音量とスペクトルのピーク周波数を抽出し、音量に合わせて光らせるLEDの数、ピーク値に合わせて色を変化させる。

FFT解析はサブコアで行い、LEDの点灯はメインコアで行っています。

SPRESENSE メインボード
SPRESENSE 拡張ボード
エレクトレットコンデンサーマイク（WM-61A相当品）
WS2813（LEDテープ）

使用部品	ピン名称
マイク+	MICA
マイク+	MIC BIAS A（2.2 kΩの抵抗付き）
マイク-	GND
LED+	Vout 5V
LED-	GND
LED通信線	D11

それぞれのピン名称はSPRESENSE Developmentのハードウェア構成およびマイクに関してはマイクの使用方法を参照してください。

今回使用しているマイクは推奨品のものとは違いますが、推奨品とほぼ同じスペックでかつマイクの出力インピーダンスが負荷抵抗の値になるという記述があったので、マイクバイアスへの入力には2.2 kΩの負荷抵抗をつけています。
（もしこの理解が間違っていたらご指摘ください）

また実装の章でも触れますが、今回LEDの制御にはSPIピンを使用しています。

今回は先人の知恵を大いに活用させていただきました。
SPRESENSE Arduino 開発ガイドのSignalProcessing チュートリアルをベースにしています。
LEDの制御はこちらのライブラリを使用させていただきました。こちらはArduino IDEのLibrary Managerには載っていないので、GitHubからZIPファイルでダウンロードしてインポートする必要があります。


メインコア
/*
 *  MainAudio.ino - FFT Example with Audio (Sound Detector)
 *  Copyright 2019 Sony Semiconductor Solutions Corporation
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <MP.h>
#include <Audio.h>
#include <SPI.h>
#include <SPI_NeoPixel.h>
//https://github.com/lipoyang/SPI_NeoPixel

/* Audio library instance; assigned in setup() from AudioClass::getInstance(). */
AudioClass *theAudio;

/* Select mic channel number */
const int mic_channel_num = 1;

/* ID of the SubCore started with MP.begin() and addressed by MP.Send()/MP.Recv(). */
const int subcore = 1;

const int MAX_POWER = 200; // Power value that corresponds to a fully lit LED bar
const float MAX_FREQ = 3000.0f; // Peak frequency that spans the full colour range
const int NUM_PIXELS = 30; // Number of LEDs on the NeoPixel strip
// SPI transmit pin = 11
SPI_NeoPixel neopixel(NUM_PIXELS);

/* Message handed to the SubCore with MP.Send(); describes one captured PCM block. */
struct Request {
  void *buffer;   /* start of the PCM data (loop() passes its static capture buffer) */
  int  sample;    /* number of samples per channel in the buffer */
  int  channel;   /* number of channels in the buffer */
};

/* Analysis result obtained from the SubCore with MP.Recv(); loop() accesses it
 * through the received pointer and reads power[] and peak[]. */
struct Result {
  bool found[mic_channel_num];   /* per-channel flag; not read by this sketch */
  float power[mic_channel_num];  /* per-channel power, drives the number of lit LEDs */
  float peak[mic_channel_num];   /* per-channel peak frequency, drives the LED colour */
  int  channel;                  /* not read by this sketch */
};

/**
 * One-time initialisation of the MainCore.
 *
 * Opens the serial port, clears the LED strip, configures the Audio library as
 * a 48 kHz PCM recorder fed from the microphone, launches the SubCore and
 * finally starts recording.
 */
void setup()
{
  Serial.begin(115200);
  delay(1000);
  while (!Serial);

  neopixel.begin();
  neopixel.clear();

  Serial.println("Init Audio Library");
  theAudio = AudioClass::getInstance();
  theAudio->begin();

  Serial.println("Init Audio Recorder");
  /* Select input device as AMIC */
  /* NOTE(review): 210 is the second (gain) argument of setRecorderMode();
   * confirm its unit against the Audio library documentation. */
  theAudio->setRecorderMode(AS_SETRECDR_STS_INPUTDEVICE_MIC, 210);

  /* Set PCM capture */
  /* Initialised up front so that an unsupported mic_channel_num can never
   * pass an indeterminate value to initRecorder(). */
  uint8_t channel = AS_CHANNEL_MONO;
  switch (mic_channel_num) {
  case 1: channel = AS_CHANNEL_MONO;   break;
  case 2: channel = AS_CHANNEL_STEREO; break;
  case 4: channel = AS_CHANNEL_4CH;    break;
  default:
    printf("Unsupported mic_channel_num = %d, falling back to mono\n", mic_channel_num);
    break;
  }
  theAudio->initRecorder(AS_CODECTYPE_PCM, "/mnt/spif/BIN", AS_SAMPLINGRATE_48000, channel);

  /* Launch SubCore */
  int ret = MP.begin(subcore);
  if (ret < 0) {
    printf("MP.begin error = %d\n", ret);
  }
  /* Keep MP.Recv() from stalling loop(): give up after a timeout of 1. */
  MP.RecvTimeout(1);

  Serial.println("Rec start!");
  theAudio->startRecorder();
}

void loop()
{
  int8_t   sndid = 100; /* user-defined msgid */
  int8_t   rcvid = 0;
  Request  request;
  Result*  result;

  static const int32_t buffer_sample = 768 * mic_channel_num;
  static const int32_t buffer_size = buffer_sample * sizeof(int16_t);
  static char  buffer[buffer_size];
  uint32_t read_size;
  float power = 0.0;
  float peak = 0.0;
  
  /* Read frames to record in buffer */
  int err = theAudio->readFrames(buffer, buffer_size, &read_size);

  if (err != AUDIOLIB_ECODE_OK && err != AUDIOLIB_ECODE_INSUFFICIENT_BUFFER_AREA) {
    printf("Error err = %d\n", err);
    sleep(1);
    theAudio->stopRecorder();
    exit(1);
  }
  if ((read_size != 0) && (read_size == buffer_size)) {
    request.buffer   = buffer;
    request.sample = buffer_sample / mic_channel_num;
    request.channel  = mic_channel_num;
    MP.Send(sndid, &request, subcore);
  } else {
    /* Receive detector results from SubCore */
    neopixel.clear();
    int ret = MP.Recv(&rcvid, &result, subcore);
    if (ret >= 0) {
      for (int i=0;i<mic_channel_num;i++) {
        power = result->power[i];
        peak = result->peak[i];
        printf("Peak: %f\n", peak);

        byte color = int((peak / MAX_FREQ) *256) & 255;
        int vPower = (power / MAX_POWER) * NUM_PIXELS; // ボリュームバーの数
        for (int j=0; j<vPower; j++){
          neopixel.setPixelColor(j, Wheel(color, 128));
        }
        neopixel.show();
      }
    }
  }
}

/**
 * Maps a colour index derived from the peak frequency to a packed strip colour.
 *
 * The index is inverted (255 - freq) and split into three segments of 85
 * steps. Inside a segment one channel ramps up, one ramps down and the third
 * is off; both ramps are scaled by maxBrightness / 255.
 *
 * The ramp-down channel is computed as ((255 - pos * 3) * maxBrightness) / 255
 * in every segment. Previously the second and third segment evaluated
 * (255 - pos * 3 * maxBrightness) / 255, which becomes negative and wraps
 * around when narrowed to a colour byte.
 *
 * NOTE(review): the channel assignment of neighbouring segments does not join
 * continuously (segment 0 ends on green, segment 1 starts on blue). This is
 * kept as written; confirm whether a continuous wheel was intended.
 *
 * @param freq           colour index, 0-255
 * @param maxBrightness  upper bound for each colour channel, 0-255
 * @return colour value produced by neopixel.Color()
 */
uint32_t Wheel(byte freq, uint8_t maxBrightness) {
  int pos = 255 - freq;
  int segment = 0;
  if (pos >= 170) {
    segment = 2;
    pos -= 170;
  } else if (pos >= 85) {
    segment = 1;
    pos -= 85;
  }

  /* pos * 3 stays within 0..255, so both ramps fit into a colour byte. */
  const uint8_t rising  = (pos * 3 * maxBrightness) / 255;
  const uint8_t falling = ((255 - pos * 3) * maxBrightness) / 255;

  switch (segment) {
  case 0:
    return neopixel.Color(falling, rising, 0);
  case 1:
    return neopixel.Color(0, rising, falling);
  default:
    return neopixel.Color(rising, 0, falling);
  }
}


サブコア
/*
 *  SubFFT.ino - FFT Example with Audio (Sound Detector)
 *  Copyright 2019 Sony Semiconductor Solutions Corporation
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <MP.h>

#include "FFT.h"

/*-----------------------------------------------------------------*/
/*
 * FFT parameters
 */
/* Select FFT length */
/* NOTE(review): FFT_LEN is defined a second time a few lines below with the
 * same value; the two definitions must stay identical (or one be removed). */

#define FFT_LEN 1024

/* Number of channels: template argument of FFTClass and size of the
 * per-channel arrays in Result and Sounds. */
#define MAX_CHANNEL_NUM 1

/* Sampling rate used by FS2BAND() and MS2FRAME() for their unit conversions. */
#define SAMPLING_RATE   48000 // ex.) 48000, 16000

#define FFT_LEN         1024 // ex.) 128, 256, 1024
/* Used by MS2FRAME(): a new frame is assumed every (FFT_LEN - OVERLAP) samples. */
#define OVERLAP         (FFT_LEN/2)  // ex.) 0, 128, 256

/* FFT engine instance; the class template comes from FFT.h. */
FFTClass<MAX_CHANNEL_NUM, FFT_LEN> FFT;

/*-----------------------------------------------------------------*/
/*
 * Detector parameters
 */
/* A bin whose value exceeds POWER_THRESHOLD counts as sound in detect_sound(). */
#define POWER_THRESHOLD       30  // Power
#define LENGTH_THRESHOLD      30  // 30 ms (converted to frames by MS2FRAME)
#define INTERVAL_THRESHOLD    100 // 100ms

/* Lower and upper edge of the analysed band. Despite the names these are
 * frequencies, not sampling rates: FS2BAND() converts them to FFT bin indices. */
#define BOTTOM_SAMPLING_RATE  100 
#define TOP_SAMPLING_RATE     3000 

/* Frequency -> FFT bin index (integer division).
 * With the values in this file BOTTOM_BAND is 2 and TOP_BAND is 64. */
#define FS2BAND(x)            ((x)*FFT_LEN/SAMPLING_RATE)
#define BOTTOM_BAND           (FS2BAND(BOTTOM_SAMPLING_RATE))
#define TOP_BAND              (FS2BAND(TOP_SAMPLING_RATE))

/* Milliseconds -> number of FFT frames (integer division, plus one).
 * With the values in this file LENGTH_FRAME is 3 and INTERVAL_FRAME is 10. */
#define MS2FRAME(x)           (((x)*SAMPLING_RATE/1000/(FFT_LEN-OVERLAP))+1)
#define LENGTH_FRAME          MS2FRAME(LENGTH_THRESHOLD)
#define INTERVAL_FRAME        MS2FRAME(INTERVAL_THRESHOLD)

/*-----------------------------------------------------------------*/
/* Allocate a larger heap than the default (64 KiB) for this SubCore sketch. */

USER_HEAP_SIZE(64 * 1024);

/* MultiCore definitions */

/* Message received from the MainCore; loop() forwards buffer and sample to FFT.put(). */
struct Request {
  void *buffer;  /* PCM data; cast to q15_t* in loop() */
  int  sample;   /* passed as the second argument of FFT.put() */
  int  chnum;    /* not read by this sketch */
};

/* Analysis result for one FFT frame; loop() sends its address to the MainCore. */
struct Result {
  bool found[MAX_CHANNEL_NUM];   /* detect_sound() verdict per channel */
  float power[MAX_CHANNEL_NUM];  /* get_max_power() value per channel */
  float peak[MAX_CHANNEL_NUM];   /* get_peak_frequency() value per channel */
  int  channel;                  /* set to MAX_CHANNEL_NUM by loop() */

  /* Start with every per-channel entry reset. */
  Result() {
    clear();
    powerClear();
    peakClear();
  }

  /* Reset all detection flags. */
  void clear() {
    for (bool &flag : found) {
      flag = false;
    }
  }

  /* Reset all power values. */
  void powerClear() {
    for (float &value : power) {
      value = 0.0;
    }
  }

  /* Reset all peak frequencies. */
  void peakClear() {
    for (float &value : peak) {
      value = 0.0;
    }
  }
};

/* SubCore initialisation: start the MP library, switch MP.Recv() to polling
 * and initialise the FFT engine. */
void setup()
{
  /* A failing MP.begin() ends in errorLoop(2), which never returns. */
  if (MP.begin() < 0) {
    errorLoop(2);
  }

  /* Poll for messages instead of blocking in MP.Recv(). */
  MP.RecvTimeout(MP_RECV_POLLING);

  FFT.begin();
}

#define RESULT_SIZE 4
void loop()
{
  int      ret;
  int8_t   sndid = 10; /* user-defined msgid */
  int8_t   rcvid;
  Request *request;
  static Result result[RESULT_SIZE];
  static int pos=0;

  result[pos].clear();
  result[pos].powerClear();
  result[pos].peakClear();

  static float pDst[FFT_LEN/2];

  /* Receive PCM captured buffer from MainCore */
  ret = MP.Recv(&rcvid, &request);
  if (ret >= 0) {
      FFT.put((q15_t*)request->buffer,request->sample);
  }

  while(!FFT.empty(0)){
      result[pos].channel = MAX_CHANNEL_NUM;
    for (int i = 0; i < MAX_CHANNEL_NUM; i++) {
      FFT.get(pDst,i);
      result[pos].found[i] = detect_sound(BOTTOM_BAND,TOP_BAND,pDst,i);
      result[pos].power[i] = get_max_power(BOTTOM_BAND,TOP_BAND,pDst,i);
      result[pos].peak[i] = get_peak_frequency(pDst, FFT_LEN);
    }
    ret = MP.Send(sndid, &result[pos],0);
    pos = (pos+1)%RESULT_SIZE;
    if (ret < 0) {
      errorLoop(1);
    }
  }

}


/*-----------------------------------------------------------------*/
/*
 * Detector functions
 */
/* Per-channel counters kept between calls by the detector functions. */
struct Sounds {
  int continuity[MAX_CHANNEL_NUM];  /* consecutive detections counted so far */
  int interval[MAX_CHANNEL_NUM];    /* remaining calls during which detection is suppressed */

  /* All counters start at zero. */
  Sounds() {
    clear();
  }

  /* Reset every counter of every channel. */
  void clear() {
    for (int &count : continuity) {
      count = 0;
    }
    for (int &remaining : interval) {
      remaining = 0;
    }
  }
};

/**
 * Returns the largest spectrum value inside the bin range [bottom, top].
 *
 * The previous implementation carried a per-bin "continuity" counter over from
 * detect_sound(). Because that counter was advanced once per bin rather than
 * once per frame, the first LENGTH_FRAME bins of the band were silently left
 * out of the maximum, and the function wrote to static state that nothing
 * read. The maximum is now taken over the whole requested band.
 *
 * @param bottom   first bin of the band (inclusive); negative values are treated as 0
 * @param top      last bin of the band (inclusive); the caller must keep it inside pdata
 * @param pdata    spectrum values
 * @param channel  unused; kept so that existing callers remain valid
 * @return the largest value in the band, or 0.0 when the band is empty, pdata
 *         is null, or no value is positive
 */
float get_max_power(int bottom, int top, float* pdata, int channel)
{
  (void)channel;

  if (pdata == nullptr || bottom > top) {
    return 0.0f;
  }
  if (bottom < 0) {
    bottom = 0;
  }

  float power = 0.0f;
  for (int i = bottom; i <= top; ++i) {
    if (pdata[i] > power) {
      power = pdata[i];
    }
  }
  return power;
}

/**
 * Estimates the frequency of the strongest spectral component.
 *
 * The strongest bin among the first fftLen / 2 values is located with
 * arm_max_f32() and refined by fitting a parabola through it and its two
 * neighbours.
 *
 * Differences from the previous version:
 *  - The neighbours are only read when the peak is neither the first nor the
 *    last bin; previously pData[index - 1] / pData[index + 1] could be read
 *    outside the array.
 *  - A flat neighbourhood (zero denominator, e.g. silence) yields an offset of
 *    0 instead of NaN or infinity.
 *  - The bin spacing is g_fs / fftLen, matching FS2BAND() in this sketch; it
 *    was g_fs / (fftLen - 1).
 *
 * @param pData   spectrum values, at least fftLen / 2 entries
 * @param fftLen  FFT length that produced pData
 * @return estimated peak frequency in the unit of g_fs, or 0.0 for invalid input
 */
float get_peak_frequency(float *pData, int fftLen)
{
  /* Must be kept equal to SAMPLING_RATE. */
  const float g_fs = 48000.0f;

  if (pData == nullptr || fftLen < 2) {
    return 0.0f;
  }

  const uint32_t bins = static_cast<uint32_t>(fftLen / 2);
  uint32_t index = 0;
  float maxValue = 0.0f;
  arm_max_f32(pData, bins, &maxValue, &index);

  /* Fractional bin offset of the parabola's vertex relative to index. */
  float delta = 0.0f;
  if (index > 0 && index + 1 < bins) {
    const float left  = pData[index - 1];
    const float right = pData[index + 1];
    const float denom = left + right - (2.0f * pData[index]);
    if (denom != 0.0f) {
      delta = 0.5f * (left - right) / denom;
    }
  }

  return (index + delta) * g_fs / fftLen;
}


/**
 * Reports a sound once the band [bottom, top] has contained a bin above
 * POWER_THRESHOLD for more than LENGTH_FRAME consecutive calls.
 *
 * After a positive report, the next INTERVAL_FRAME calls for that channel
 * return false without inspecting the spectrum.
 *
 * @param bottom   first bin of the band (inclusive)
 * @param top      last bin of the band (inclusive)
 * @param pdata    spectrum values
 * @param channel  index into the per-channel counters
 * @return true only on the call that completes a long-enough run
 */
bool detect_sound(int bottom, int top, float* pdata, int channel )
{
  static Sounds sounds;

  if (top < bottom) {
    return false;
  }

  int &holdoff = sounds.interval[channel];
  int &run     = sounds.continuity[channel];

  /* Do not detect in interval time. */
  if (holdoff > 0) {
    --holdoff;
    run = 0;
    return false;
  }

  /* Stop scanning at the first bin above the threshold. */
  bool loud = false;
  for (int bin = bottom; bin <= top && !loud; ++bin) {
    loud = pdata[bin] > POWER_THRESHOLD;
  }

  if (!loud) {
    run = 0;
    return false;
  }

  ++run;
  if (run <= LENGTH_FRAME) {
    return false;   /* sound present, but the run is not long enough yet */
  }

  holdoff = INTERVAL_FRAME;
  return true;
}


/* Signals a fatal error and never returns: LED0 blinks num times (300 ms on,
 * 300 ms off), followed by a 1 s pause, repeated forever. */
void errorLoop(int num)
{
  for (;;) {
    for (int remaining = num; remaining > 0; --remaining) {
      ledOn(LED0);
      delay(300);
      ledOff(LED0);
      delay(300);
    }
    delay(1000);
  }
}