I'm using an stm32f4 chip (cortex-m4) with an FPU and
sqrt( 9.7 * 9.7 ) returns 94.17..
i am using the arm-none-eabi-gcc compiler and don't get any errors on compile.
my makefile is really long because the same file is used for stm32f4 and sam4 chips. I don't even know the relevant parts to post. Any help is appreciated.
EDIT : some flag config from makefile
C_FLAGS = -mcpu=cortex-m4 -mthumb
C_FLAGS += -std=gnu89 -g -ggdb3 -fverbose-asm
C_FLAGS += --param max-inline-insns-single=500
C_FLAGS += -fsingle-precision-constant
C_FLAGS += -mfpu=fpv4-sp-d16
C_FLAGS += -mfloat-abi=hard
C_WARNINGS = -Wall
C_WARNINGS += -Wdouble-promotion
This is my exact code:
float x, y, z, mag;
x = ((float) (((int32_t) x_adc) + 0x8000)) * scale - offset;
y = ((float) (((int32_t) y_adc) + 0x8000)) * scale - offset;
z = ((float) (((int32_t) z_adc) + 0x8000)) * scale - offset;
mag = sqrt(x*x + y*y + z*z);
DEBUG("XYZ = %.2f, %.2f, %.2f = %.2f\n", x, y, z, mag );
The DEBUG macro prints to a debug log on the usb_cdc interface and LCD screen. Also, i had been overwriting Wdouble-promotion and am now getting a double promotion warning on the DEBUG line (which calls snprintf with __VA_ARGS__
). I assume this is not relevant.
I changed the optimization level to 0 (still getting same result with sqrt), this is the .lss:
mag = sqrt(x*x + y*y + z*z);
8019a36: ed97 7a05 vldr s14, [r7, #20]
8019a3a: edd7 7a05 vldr s15, [r7, #20]
8019a3e: ee27 7a27 vmul.f32 s14, s14, s15
8019a42: edd7 6a04 vldr s13, [r7, #16]
8019a46: edd7 7a04 vldr s15, [r7, #16]
8019a4a: ee66 7aa7 vmul.f32 s15, s13, s15
8019a4e: ee37 7a27 vadd.f32 s14, s14, s15
8019a52: edd7 6a03 vldr s13, [r7, #12]
8019a56: edd7 7a03 vldr s15, [r7, #12]
8019a5a: ee66 7aa7 vmul.f32 s15, s13, s15
8019a5e: ee77 7a27 vadd.f32 s15, s14, s15
8019a62: ee17 0a90 vmov r0, s15
8019a66: f7e6 fcf3 bl 8000450 <__aeabi_f2d>
8019a6a: 4602 mov r2, r0
8019a6c: 460b mov r3, r1
8019a6e: ec43 2b10 vmov d0, r2, r3
8019a72: f007 fe9f bl 80217b4 <sqrt>
8019a76: ec53 2b10 vmov r2, r3, d0
8019a7a: 4610 mov r0, r2
8019a7c: 4619 mov r1, r3
8019a7e: f7e6 fffd bl 8000a7c <__aeabi_d2f>
8019a82: 4603 mov r3, r0
8019a84: 60bb str r3, [r7, #8]
if( 1 ) {
DEBUG("XYZ = %.2f, %.2f, %.2f = %.2f\n", x, y, z, mag );
8019a86: 6978 ldr r0, [r7, #20]
8019a88: f7e6 fce2 bl 8000450 <__aeabi_f2d>
8019a8c: 4682 mov sl, r0
8019a8e: 468b mov fp, r1
8019a90: 6938 ldr r0, [r7, #16]
8019a92: f7e6 fcdd bl 8000450 <__aeabi_f2d>
8019a96: 4680 mov r8, r0
8019a98: 4689 mov r9, r1
8019a9a: 68f8 ldr r0, [r7, #12]
8019a9c: f7e6 fcd8 bl 8000450 <__aeabi_f2d>
8019aa0: 4604 mov r4, r0
8019aa2: 460d mov r5, r1
8019aa4: 68b8 ldr r0, [r7, #8]
8019aa6: f7e6 fcd3 bl 8000450 <__aeabi_f2d>
8019aaa: 4602 mov r2, r0
8019aac: 460b mov r3, r1
8019aae: e9cd ab00 strd sl, fp, [sp]
8019ab2: e9cd 8902 strd r8, r9, [sp, #8]
8019ab6: e9cd 4504 strd r4, r5, [sp, #16]
8019aba: e9cd 2306 strd r2, r3, [sp, #24]
8019abe: f243 707c movw r0, #14204 ; 0x377c
8019ac2: f6c0 0002 movt r0, #2050 ; 0x802
8019ac6: f240 4176 movw r1, #1142 ; 0x476
8019aca: f643 0218 movw r2, #14360 ; 0x3818
8019ace: f6c0 0202 movt r2, #2050 ; 0x802
8019ad2: f643 430c movw r3, #15372 ; 0x3c0c
8019ad6: f6c0 0302 movt r3, #2050 ; 0x802
8019ada: f7ed fee9 bl 80078b0 <debug_log>
This is with -O2
x = ((float) (((int32_t) x_adc) + 0x8000)) * scale - offset;
8010c3e: eeb0 9a48 vmov.f32 s18, s16
8010c42: ee10 9aa9 vnmls.f32 s18, s1, s19
y = ((float) (((int32_t) y_adc) + 0x8000)) * scale - offset;
z = ((float) (((int32_t) z_adc) + 0x8000)) * scale - offset;
8010c46: eef8 1ac1 vcvt.f32.s32 s3, s2
mag = sqrt(x*x + y*y + z*z);
8010c4a: ee28 2aa8 vmul.f32 s4, s17, s17
y_adc = ((int16_t) (buffer[2] & 0xff)) + (((int16_t) (buffer[3] & 0xff))<<8);
z_adc = ((int16_t) (buffer[4] & 0xff)) + (((int16_t) (buffer[5] & 0xff))<<8);
x = ((float) (((int32_t) x_adc) + 0x8000)) * scale - offset;
y = ((float) (((int32_t) y_adc) + 0x8000)) * scale - offset;
z = ((float) (((int32_t) z_adc) + 0x8000)) * scale - offset;
8010c4e: ee11 8aa9 vnmls.f32 s16, s3, s19
mag = sqrt(x*x + y*y + z*z);
8010c52: ee09 2a09 vmla.f32 s4, s18, s18
8010c56: ee08 2a08 vmla.f32 s4, s16, s16
8010c5a: ee12 0a10 vmov r0, s4
8010c5e: f7ef fbf7 bl 8000450 <__aeabi_f2d>
8010c62: ec41 0b10 vmov d0, r0, r1
8010c66: f007 f8f9 bl 8017e5c <sqrt>
if( 1 ) {
DEBUG("XYZ = %.2f, %.2f, %.2f = %.2f\n", x, y, z, mag );
8010c6a: ee19 0a10 vmov r0, s18
z_adc = ((int16_t) (buffer[4] & 0xff)) + (((int16_t) (buffer[5] & 0xff))<<8);
x = ((float) (((int32_t) x_adc) + 0x8000)) * scale - offset;
y = ((float) (((int32_t) y_adc) + 0x8000)) * scale - offset;
z = ((float) (((int32_t) z_adc) + 0x8000)) * scale - offset;
mag = sqrt(x*x + y*y + z*z);
8010c6e: ec55 4b10 vmov r4, r5, d0
if( 1 ) {
DEBUG("XYZ = %.2f, %.2f, %.2f = %.2f\n", x, y, z, mag );
8010c72: f7ef fbed bl 8000450 <__aeabi_f2d>
8010c76: e9cd 0100 strd r0, r1, [sp]
8010c7a: ee18 0a90 vmov r0, s17
8010c7e: f7ef fbe7 bl 8000450 <__aeabi_f2d>
8010c82: e9cd 0102 strd r0, r1, [sp, #8]
8010c86: ee18 0a10 vmov r0, s16
8010c8a: f7ef fbe1 bl 8000450 <__aeabi_f2d>
8010c8e: e9cd 0104 strd r0, r1, [sp, #16]
z_adc = ((int16_t) (buffer[4] & 0xff)) + (((int16_t) (buffer[5] & 0xff))<<8);
x = ((float) (((int32_t) x_adc) + 0x8000)) * scale - offset;
y = ((float) (((int32_t) y_adc) + 0x8000)) * scale - offset;
z = ((float) (((int32_t) z_adc) + 0x8000)) * scale - offset;
mag = sqrt(x*x + y*y + z*z);
8010c92: 4629 mov r1, r5
8010c94: 4620 mov r0, r4
8010c96: f7ef fef1 bl 8000a7c <__aeabi_d2f>
if( 1 ) {
DEBUG("XYZ = %.2f, %.2f, %.2f = %.2f\n", x, y, z, mag );
8010c9a: f7ef fbd9 bl 8000450 <__aeabi_f2d>
8010c9e: 4a0e ldr r2, [pc, #56] ; (8010cd8 <LIS331DLH_ReadAc+0x134>)
8010ca0: 4b0e ldr r3, [pc, #56] ; (8010cdc <LIS331DLH_ReadAc+0x138>)
8010ca2: e9cd 0106 strd r0, r1, [sp, #24]
8010ca6: 4808 ldr r0, [pc, #32] ; (8010cc8 <LIS331DLH_ReadAc+0x124>)
8010ca8: f240 4176 movw r1, #1142 ; 0x476
8010cac: f7f4 fb76 bl 800539c <debug_log>
8010cb0: e78b b.n 8010bca <LIS331DLH_ReadAc+0x26>
this all depends on how you are using it. you need to paste a lot more information/code. a simple test case, the disassembly of that simple test case, how you are determining the result, etc.
#include <math.h>
double fun ( void )
{
return(sqrt(9.7*9.7));
}
as written there is no reason for a math library it is a compile time computation (on the host/development machine).
00000000 <fun>:
0: 4902 ldr r1, [pc, #8] ; (c <fun+0xc>)
2: 4801 ldr r0, [pc, #4] ; (8 <fun+0x8>)
4: 4770 bx lr
6: 46c0 nop ; (mov r8, r8)
8: 66666666 strbtvs r6, [r6], -r6, ror #12
c: 40236666 eormi r6, r3, r6, ror #12
building for x86 to show that building for arm does not give a different pre-computed answer.
0000000000000000 <fun>:
0: f2 0f 10 05 00 00 00 movsd 0x0(%rip),%xmm0 # 8 <fun+0x8>
7: 00
8: c3 retq
0000000000000000 <.LC0>:
0: 66 data16
1: 66 data16
2: 66 data16
3: 66 data16
4: 66 data16
5: 66 data16
6: 23 .byte 0x23
7: 40 rex
switch to single precision
#include <math.h>
float fun ( void )
{
return(sqrtf(9.7F*9.7F));
}
float fun2 ( void )
{
return(9.7F);
}
and we get the single precision compile time computed answer
00000000 <fun>:
0: 4800 ldr r0, [pc, #0] ; (4 <fun+0x4>)
2: 4770 bx lr
4: 411b3333 tstmi fp, r3, lsr r3
00000008 <fun2>:
8: 4800 ldr r0, [pc, #0] ; (c <fun2+0x4>)
a: 4770 bx lr
c: 411b3333 tstmi fp, r3, lsr r3
using clang/llvm instead of gcc:
fun:
ldr.n r0, .LCPI0_0
bx lr
.align 2
.LCPI0_0:
.long 1092301619 @ 0x411b3333
fun2:
ldr.n r0, .LCPI1_0
bx lr
.align 2
.LCPI1_0:
.long 1092301619 @ 0x411b3333
so you need to repeat that, see if you are calling math functions, if so what are they feeding sqrt, did they actually link in the right stuff, then I would simply print the result in hex. then debug the other half, is it taking the right 9.7F value and depending on your printing function is it generating the right ascii?
How do I convert (single precision) float to hex you ask?
.thumb_func
.globl dummy
dummy:
bx lr
to verify the compiler is using r0 on the way in and out:
unsigned int fun3 ( void )
{
return(dummy(9.7F));
}
00000010 <fun3>:
10: b508 push {r3, lr}
12: 4803 ldr r0, [pc, #12] ; (20 <fun3+0x10>)
14: f7ff fffe bl 0 <dummy>
18: bc08 pop {r3}
1a: bc02 pop {r1}
1c: 4708 bx r1
1e: 46c0 nop ; (mov r8, r8)
20: 411b3333 tstmi fp, r3, lsr r3
by simply returning the bits in r0 which come in as float bits dont change, the definition of those bits is a figment of the compiler or programmers imagination anyway, bits are just bits subject to any interpretation.