Can't compile Floating Point Assembly for XIAO ESP32-C3

Hello everyone. I’m trying to implement Ivan Kostoski’s code for an ESP32-based sound level meter on a XIAO ESP32-C3. According to the wiki, that chip “is a 32-bit RISC-V CPU, which includes an FPU (Floating Point Unit) for 32-bit single-precision arithmetic”. So, based on this, I wrote some RISC-V assembly code for floating-point operations. However, when I try to compile, I get the error “unrecognized opcode” on every floating-point instruction. When I switch to integer instructions, the code does compile.

Here’s my code:

extern "C" {
  // Runs `len` samples from `input` through one IIR second-order section,
  // writes the filtered samples to `output`, and stores the updated delay
  // line back into `w`. Always returns 0.
  int sos_filter_f32(float *input, float *output, int len, const SOS_Coefficients &coeffs, SOS_Delay_State &w);
}
__asm__ (
  //
  // RISC-V (RV32 with the "F" single-precision extension) implementation of
  // an IIR Second-Order Section filter.
  // Assumes a0 and b0 coefficients are one (1.0)
  //
  // NOTE: the flw/fsw/fmadd.s/fmv.s instructions below are only accepted when
  // the toolchain targets an ISA string that contains "F" (e.g. rv32imafc).
  // An rv32imc target (no FPU) rejects them as unrecognized opcodes.
  //
  // Arguments arrive in the RISC-V integer argument registers a0..a4
  // (references are passed as addresses):
  //
  // float* a0 = input;
  // float* a1 = output;
  // int    a2 = len;
  // float* a3 = &coeffs;  // read as four floats: b1, b2, a1, a2
  // float* a4 = &w;       // read/written as two floats: w0, w1
  //
  // f0..f7 are ft0..ft7, i.e. caller-saved temporaries, so nothing has to be
  // preserved on the stack.
  //
  ".text                    \n"
  ".p2align 2               \n" // 4-byte alignment (.align N means 2^N on RISC-V)
  ".global sos_filter_f32   \n"
  ".type   sos_filter_f32,@function\n"
  "sos_filter_f32:          \n"
  "  flw     f0, 0(a3)      \n" // float f0 = coeffs.b1;
  "  flw     f1, 4(a3)      \n" // float f1 = coeffs.b2;
  "  flw     f2, 8(a3)      \n" // float f2 = coeffs.a1;
  "  flw     f3, 12(a3)     \n" // float f3 = coeffs.a2;
  "  flw     f4, 0(a4)      \n" // float f4 = w[0];
  "  flw     f5, 4(a4)      \n" // float f5 = w[1];
  "  blez    a2, 2f         \n" // skip the loop entirely when len <= 0
  "1:                       \n" // do {
  "    flw     f6, 0(a0)    \n" //   float f6 = *input++;
  "    addi    a0, a0, 4    \n" //   post-increment by 4
  // fmadd.s rd, rs1, rs2, rs3 computes rd = rs1 * rs2 + rs3, so the
  // accumulator has to be named both as destination and as addend.
  "    fmadd.s f6, f2, f4, f6 \n" // f6 += f2 * f4; // coeffs.a1 * w0
  "    fmadd.s f6, f3, f5, f6 \n" // f6 += f3 * f5; // coeffs.a2 * w1
  "    fmv.s   f7, f6       \n" //   f7 = f6; // b0 assumed 1.0
  "    fmadd.s f7, f0, f4, f7 \n" // f7 += f0 * f4; // coeffs.b1 * w0
  "    fmadd.s f7, f1, f5, f7 \n" // f7 += f1 * f5; // coeffs.b2 * w1 -> result
  "    fsw     f7, 0(a1)    \n" //   *output++ = f7;
  "    addi    a1, a1, 4    \n" //   post-increment by 4
  "    fmv.s   f5, f4       \n" //   f5 = f4; // w1 = w0
  "    fmv.s   f4, f6       \n" //   f4 = f6; // w0 = f6
  "    addi    a2, a2, -1   \n" //   len--;
  "    bnez    a2, 1b       \n" // } while (len != 0);
  "2:                       \n"
  "  fsw     f4, 0(a4)      \n" // w[0] = f4;
  "  fsw     f5, 4(a4)      \n" // w[1] = f5;
  "  li      a0, 0          \n" // return 0; (integer return value lives in a0)
  "  ret                    \n"
  ".size   sos_filter_f32, .-sos_filter_f32\n"
);

The reference code, written for the Xtensa ISA, is the following:

extern "C" {
  // Filters `len` samples from `input` into `output` through one IIR
  // second-order section, using `coeffs` and the delay state `w`.
  // The assembly body below always returns 0.
  int sos_filter_f32(float *input, float *output, int len, const SOS_Coefficients &coeffs, SOS_Delay_State &w);
} 
__asm__ (
  //
  // ESP32 (Xtensa) implementation of IIR Second-Order Section filter
  // Assumes a0 and b0 coefficients are one (1.0)
  //
  // Register use as read by the code below:
  //
  // float* a2 = input;   // advanced by 4 bytes per sample
  // float* a3 = output;  // advanced by 4 bytes per sample
  // int    a4 = len;     // loop count
  // float* a5 = coeffs;  // four floats at offsets 0/4/8/12: b1, b2, a1, a2
  // float* a6 = w;       // two floats at offsets 0/4: w0, w1
  //
  // NOTE(review): the original annotation also listed "float a7 = gain", but
  // the prototype above has only five parameters and a7 is never referenced
  // in this routine.
  //
  ".text                    \n"
  ".align  4                \n"
  ".global sos_filter_f32   \n"
  ".type   sos_filter_f32,@function\n"
  "sos_filter_f32:          \n"
  "  entry   a1, 16         \n" // function prologue (paired with retw.n below)
  "  lsi     f0, a5, 0      \n" // float f0 = coeffs.b1;
  "  lsi     f1, a5, 4      \n" // float f1 = coeffs.b2;
  "  lsi     f2, a5, 8      \n" // float f2 = coeffs.a1;
  "  lsi     f3, a5, 12     \n" // float f3 = coeffs.a2;
  "  lsi     f4, a6, 0      \n" // float f4 = w[0];
  "  lsi     f5, a6, 4      \n" // float f5 = w[1];
  "  loopnez a4, 1f         \n" // for (; len>0; len--) {   // body ends at label 1
  "    lsip    f6, a2, 4    \n" //   float f6 = *input++;
  "    madd.s  f6, f2, f4   \n" //   f6 += f2 * f4; // coeffs.a1 * w0
  "    madd.s  f6, f3, f5   \n" //   f6 += f3 * f5; // coeffs.a2 * w1
  "    mov.s   f7, f6       \n" //   f7 = f6; // b0 assumed 1.0
  "    madd.s  f7, f0, f4   \n" //   f7 += f0 * f4; // coeffs.b1 * w0
  "    madd.s  f7, f1, f5   \n" //   f7 += f1 * f5; // coeffs.b2 * w1 -> result
  "    ssip    f7, a3, 4    \n" //   *output++ = f7;
  "    mov.s   f5, f4       \n" //   f5 = f4; // w1 = w0 (shift the delay line)
  "    mov.s   f4, f6       \n" //   f4 = f6; // w0 = f6 (newest intermediate value)
  "  1:                     \n" // }
  "  ssi     f4, a6, 0      \n" // w[0] = f4;  // persist delay state for the next call
  "  ssi     f5, a6, 4      \n" // w[1] = f5;
  "  movi.n   a2, 0         \n" // return 0;
  "  retw.n                 \n" // return to caller
);

It seems that Seeed's wiki is wrong about the C3 having an FPU.

I wrote some C++ code using float variables, compiled it for the C3 with the Arduino IDE, and then disassembled it to see what kind of instructions it was using.
There were no floating-point instructions, only their integer equivalents (see an excerpt here).
Also, by looking at the compilation output in verbose mode, I noticed that the C3 was being labeled as a RISCV32IMC chip, which, again, means that it doesn't support floating-point instructions (see here for more info).

If I missed something please let me know, but I think this settles it. Now I need to get another ESP.