Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added benchmark script, enabled ARMASM for Cortex-M3,4,7,33 #513

Merged
merged 3 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/test-configs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,28 +70,28 @@ jobs:
with:
arch: arm
config-file: ./config/examples/imx-rt1040.config
make-args: PKA=1
make-args: PKA=1 NO_ARM_ASM=1

imx_rt1050_test_pka:
uses: ./.github/workflows/test-build-mcux-sdk.yml
with:
arch: arm
config-file: ./config/examples/imx-rt1050.config
make-args: PKA=1
make-args: PKA=1 NO_ARM_ASM=1

imx_rt1060_test_pka:
uses: ./.github/workflows/test-build-mcux-sdk.yml
with:
arch: arm
config-file: ./config/examples/imx-rt1060.config
make-args: PKA=1
make-args: PKA=1 NO_ARM_ASM=1

imx_rt1064_test_pka:
uses: ./.github/workflows/test-build-mcux-sdk.yml
with:
arch: arm
config-file: ./config/examples/imx-rt1064.config
make-args: PKA=1
make-args: PKA=1 NO_ARM_ASM=1

kinetis_k64f_test:
uses: ./.github/workflows/test-build-mcux-sdk.yml
Expand Down
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ clean:
$(Q)rm -f $(MACHINE_OBJ) $(MAIN_TARGET) $(LSCRIPT)
$(Q)rm -f $(OBJS)
$(Q)rm -f tools/keytools/otp/otp-keystore-gen
$(Q)rm -f .stack_usage
$(Q)$(MAKE) -C test-app -s clean
$(Q)$(MAKE) -C tools/check_config -s clean
$(Q)$(MAKE) -C stage1 -s clean
Expand Down Expand Up @@ -385,6 +386,13 @@ line-count-nrf52:
line-count-x86:
cloc --force-lang-def cloc_lang_def.txt src/boot_x86_fsp.c src/boot_x86_fsp_payload.c src/boot_x86_fsp_start.S src/image.c src/keystore.c src/libwolfboot.c src/loader.c src/string.c src/update_disk.c src/x86/ahci.c src/x86/ata.c src/x86/common.c src/x86/gpt.c src/x86/hob.c src/pci.c src/x86/tgl_fsp.c hal/x86_fsp_tgl.c hal/x86_uart.c

# Record the configured STACK_USAGE value in .stack_usage once wolfboot.bin
# has been built. The rule never creates a file called "stack-usage", so it
# is declared phony to keep it runnable even if such a file exists.
.PHONY: stack-usage
stack-usage: wolfboot.bin
	$(Q)echo $(STACK_USAGE) > .stack_usage

# Record the configured IMAGE_HEADER_SIZE value in .image_header_size once
# wolfboot.bin has been built. The rule never creates a file called
# "image-header-size", so it is declared phony.
.PHONY: image-header-size
image-header-size: wolfboot.bin
	$(Q)echo $(IMAGE_HEADER_SIZE) > .image_header_size


cppcheck:
cppcheck -f --enable=warning --enable=portability \
--suppress="ctunullpointer" --suppress="nullPointer" \
Expand Down
55 changes: 44 additions & 11 deletions arch.mk
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ ifeq ($(ARCH),ARM)
ifeq ($(CORTEX_A5),1)
FPU=-mfpu=vfp4-d16
CFLAGS+=-mcpu=cortex-a5 -mtune=cortex-a5 -static -z noexecstack
LDLAGS+=-mcpu=cortex-a5 -mtune=cortex-a5 -mtune=cortex-a5 -static -z noexecstack -Ttext 0x300000
LDLAGS+=-mcpu=cortex-a5 -mtune=cortex-a5 -mtune=cortex-a5 -static \
-z noexecstack -Ttext 0x300000
# Cortex-A uses boot_arm32.o
OBJS+=src/boot_arm32.o src/boot_arm32_start.o
ifeq ($(NO_ASM),1)
Expand All @@ -198,11 +199,37 @@ ifeq ($(CORTEX_A5),1)
OBJS+=./lib/wolfssl/wolfcrypt/src/port/arm/armv8-sha256.o
OBJS+=./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.o
OBJS+=./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.o
CFLAGS+=-DWOLFSSL_SP_ARM32_ASM -DWOLFSSL_ARMASM -DWOLFSSL_ARMASM_NO_HW_CRYPTO -DWOLFSSL_ARM_ARCH=7 -DWOLFSSL_ARMASM_INLINE -DWOLFSSL_ARMASM_NO_NEON
CFLAGS+=-DWOLFSSL_SP_ARM32_ASM -DWOLFSSL_ARMASM -DWOLFSSL_ARMASM_NO_HW_CRYPTO \
-DWOLFSSL_ARM_ARCH=7 -DWOLFSSL_ARMASM_INLINE -DWOLFSSL_ARMASM_NO_NEON
endif
else
# All others use boot_arm.o
OBJS+=src/boot_arm.o
ifneq ($(NO_ARM_ASM),1)
CORTEXM_ARM_EXTRA_OBJS= \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-aes.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-chacha.o \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tabs vs spaces? Looks like a few places

./lib/wolfssl/wolfcrypt/src/port/arm/armv8-sha256.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-sha512.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.o \
./lib/wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.o


CORTEXM_ARM_THUMB_EXTRA_OBJS= \
./lib/wolfssl/wolfcrypt/src/port/arm/thumb2-sha256-asm.o \
./lib/wolfssl/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.o

CORTEXM_ARM_EXTRA_CFLAGS+=-DWOLFSSL_ARMASM -DWOLFSSL_ARMASM_NO_HW_CRYPTO \
-DWOLFSSL_ARMASM_INLINE -DWOLFSSL_ARMASM_NO_NEON
endif
ifeq ($(CORTEX_M33),1)
CFLAGS+=-mcpu=cortex-m33 -DCORTEX_M33
LDFLAGS+=-mcpu=cortex-m33
Expand All @@ -212,28 +239,25 @@ else
endif
CFLAGS+=-mcmse
ifeq ($(WOLFCRYPT_TZ),1)
CORTEXM_ARM_EXTRA_OBJS=
CORTEXM_ARM_EXTRA_CFLAGS=
SECURE_OBJS+=./src/wc_callable.o
SECURE_OBJS+=./lib/wolfssl/wolfcrypt/src/random.o
CFLAGS+=-DWOLFCRYPT_SECURE_MODE
SECURE_LDFLAGS+=-Wl,--cmse-implib -Wl,--out-implib=./src/wc_secure_calls.o
endif
endif # TZEN=1
ifeq ($(NO_ASM),1)
ifeq ($(SPMATH),1)
ifeq ($(NO_ASM),1)
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_c32.o
else
CFLAGS+=-DWOLFSSL_SP_ASM -DWOLFSSL_SP_ARM_CORTEX_M_ASM
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_cortexm.o
CFLAGS+=$(CORTEXM_ARM_EXTRA_CFLAGS) -DWOLFSSL_ARM_ARCH=8
OBJS+=$(CORTEXM_ARM_EXTRA_OBJS)
endif
endif
else
ifeq ($(SPMATH),1)
CFLAGS+=-DWOLFSSL_SP_ASM -DWOLFSSL_SP_ARM_CORTEX_M_ASM
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_cortexm.o
endif
endif
else
ifeq ($(CORTEX_M7),1)
CFLAGS+=-mcpu=cortex-m7
LDFLAGS+=-mcpu=cortex-m7
Expand All @@ -243,10 +267,12 @@ else
else
CFLAGS+=-DWOLFSSL_SP_ASM -DWOLFSSL_SP_ARM_CORTEX_M_ASM
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_cortexm.o
CFLAGS+=$(CORTEXM_ARM_EXTRA_CFLAGS) -DWOLFSSL_ARM_ARCH=7
OBJS+=$(CORTEXM_ARM_EXTRA_OBJS)
endif
endif
endif
else
ifeq ($(CORTEX_M0),1)
ifeq ($(CORTEX_M0),1)
CFLAGS+=-mcpu=cortex-m0
LDFLAGS+=-mcpu=cortex-m0
ifeq ($(SPMATH),1)
Expand All @@ -255,6 +281,9 @@ else
else
CFLAGS+=-DWOLFSSL_SP_ASM -DWOLFSSL_SP_ARM_THUMB_ASM
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_armthumb.o
# TODO: integrate thumb2-asm
#CFLAGS+=$(CORTEXM_ARM_EXTRA_CFLAGS) -DWOLFSSL_ARM_ARCH=6
#OBJS+=$(CORTEXM_ARM_THUMB_EXTRA_OBJS)
endif
endif
else
Expand All @@ -269,6 +298,8 @@ else
ifeq ($(SPMATH),1)
CFLAGS+=-DWOLFSSL_SP_ASM -DWOLFSSL_SP_ARM_CORTEX_M_ASM -DWOLFSSL_SP_NO_UMAAL
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_cortexm.o
CFLAGS+=$(CORTEXM_ARM_EXTRA_CFLAGS) -DWOLFSSL_ARM_ARCH=7
OBJS+=$(CORTEXM_ARM_EXTRA_OBJS)
endif
endif
else
Expand All @@ -284,6 +315,8 @@ else
ifeq ($(SPMATH),1)
CFLAGS+=-DWOLFSSL_SP_ASM -DWOLFSSL_SP_ARM_CORTEX_M_ASM
MATH_OBJS += ./lib/wolfssl/wolfcrypt/src/sp_cortexm.o
CFLAGS+=$(CORTEXM_ARM_EXTRA_CFLAGS) -DWOLFSSL_ARM_ARCH=7
OBJS+=$(CORTEXM_ARM_EXTRA_OBJS)
endif
endif
endif
Expand Down
22 changes: 22 additions & 0 deletions docs/compile.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,28 @@ By default, wolfBoot is compiled for ARM Cortex-M3/4/7. To compile for Cortex-M0

`CORTEX_M0=1`

### Speed vs. size

On a number of targets, algorithms are automatically built with assembly
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"On a number of targets, algorithm may be optimized automatically to use assembly
optimizations." -> "Some targets support assembly optimizations by default."?

optimizations. To disable assembly optimizations, use `NO_ASM=1`. This option will
produce smaller code, but will also increase the boot time.

ARM-specific assembly optimizations affecting hash and symmetric key ciphers can be
disabled with the option `NO_ARM_ASM=1`. This is useful, for example, when you want
to use SP math optimizations for key verification, but exclude SHA2/AES optimizations
to save some space.

#### Example: ECC256 + SHA256 on STM32H7

Benchmark of footprint vs. boot time (SHA of a 100KB image + signature verification):

| Description | Selected options | wolfBoot size (B) | Boot time (s) |
|-------------|------------------|-------------------|---------------|
| Full ECC256 assembly optimizations. Fastest. | `SIGN=ECC256` | 21836 | .583 |
| Optimize ECC only (SP math assembly only) | `SIGN=ECC256 NO_ARM_ASM=1` | 18624 | .760 |
| No assembly optimizations (smallest) | `SIGN=ECC256 NO_ASM=1` | 14416 | 3.356 |


### Flash partitions

The file [include/target.h](../include/target.h) is generated according to the configured flash geometry,
Expand Down
9 changes: 8 additions & 1 deletion include/user_settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ extern int tolower(int c);
# define ED25519_SMALL
# define NO_ED25519_SIGN
# define NO_ED25519_EXPORT
# define WOLFSSL_SHA512
# define USE_SLOW_SHA512
# define WOLFSSL_SHA512
#endif

/* ED448 and SHA3/SHAKE256 */
Expand Down Expand Up @@ -267,6 +267,9 @@ extern int tolower(int c);
!defined(WOLFCRYPT_SECURE_MODE)
# define NO_SHA256
# endif
#ifndef WOLFSSL_SHA512
#define WOLFSSL_SHA512
#endif
#endif

/* If SP math is enabled determine word size */
Expand Down Expand Up @@ -499,4 +502,8 @@ extern int tolower(int c);

#endif /* WOLFBOOT_PKCS11_APP */

#ifndef XTOLOWER
#define XTOLOWER(x) (x)
#endif

#endif /* !_WOLFBOOT_USER_SETTINGS_H_ */
2 changes: 1 addition & 1 deletion options.mk
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ ifeq ($(SIGN),XMSS)
ifeq ($(WOLFBOOT_SMALL_STACK),1)
$(error WOLFBOOT_SMALL_STACK with XMSS not supported)
else
STACK_USAGE=2720
STACK_USAGE=9352
endif
endif

Expand Down
4 changes: 4 additions & 0 deletions test-app/app_stm32h7.c
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,9 @@ void uart_print(const char *s)
}
}

#define FILLER_SIZE (100 * 1024)
static volatile uint8_t filler_data[FILLER_SIZE] = { 0x01, 0x02, 0x03 };

void main(void)
{
uint8_t firmware_version = 0;
Expand All @@ -373,6 +376,7 @@ void main(void)
if (FIRMWARE_A)
ld3_write(LED_INIT);

filler_data[FILLER_SIZE - 1] = 0xAA;
/* LED Indicator of successful UART initialization. SUCCESS = ON, FAIL = OFF */
if (uart_setup(115200) < 0)
ld2_write(LED_OFF);
Expand Down
4 changes: 3 additions & 1 deletion tools/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ ifeq ($(ARCH),)
CORTEX_M7?=0
CORTEX_M3?=0
NO_ASM?=0
NO_ARM_ASM?=0
EXT_FLASH?=0
SPI_FLASH?=0
QSPI_FLASH?=0
Expand Down Expand Up @@ -104,5 +105,6 @@ CONFIG_VARS:= ARCH TARGET SIGN HASH MCUXSDK MCUXPRESSO MCUXPRESSO_CPU MCUXPRESSO
NXP_CUSTOM_DCD NXP_CUSTOM_DCD_OBJS \
FLASH_OTP_KEYSTORE \
KEYVAULT_OBJ_SIZE \
KEYVAULT_MAX_ITEMS
KEYVAULT_MAX_ITEMS \
NO_ARM_ASM

98 changes: 98 additions & 0 deletions tools/scripts/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you planning to publish the benchmark.sh results in a CI job like the markdown you posted in the PR description?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's the plan. I'm currently busy rebuilding my on-site infrastructure; I will eventually provide a dedicated Jenkins node to run the benchmark.

#
# Flash factory.bin to the board, reset it, and print the measured boot time
# as one Markdown table cell: " | <seconds>" on success, or " | No data."
# when `st-flash reset` fails (e.g. no board attached).
#
# GPIO2: RST  (driven as output to hold/release the board reset)
# GPIO4: BOOT (input; polled for as long as it reads 0)
function run_on_board() {
    local start end

    if ! (st-flash reset &>/dev/null); then
        # Emit the column separator in this branch as well, otherwise the
        # text is glued to the previous cell and the row loses a column.
        echo -n " | No data."
    else
        sleep 1
        st-flash --connect-under-reset write factory.bin 0x8000000 &>/dev/null
        sleep .2
        echo "2" > /sys/class/gpio/export 2>/dev/null
        echo "out" > /sys/class/gpio/gpio2/direction
        echo "1" > /sys/class/gpio/gpio2/value # Release reset
        echo "0" > /sys/class/gpio/gpio2/value # Keep reset low
        sleep 1
        echo -n " | "
        echo "1" > /sys/class/gpio/gpio2/value # Release reset
        start=$(date +%s.%N)
        while [ "$(cat /sys/class/gpio/gpio4/value)" -eq 0 ]; do
            sleep .01
        done
        # NOTE(review): this second, identical wait is kept from the original.
        # It only spins again if the pin reads 0 once more right after the
        # first loop ended - presumably a debounce; confirm it is intended.
        while [ "$(cat /sys/class/gpio/gpio4/value)" -eq 0 ]; do
            sleep .01
        done
        end=$(date +%s.%N)
        # Elapsed time in seconds, truncated to three decimals by bc.
        echo "scale=3; $end/1 - $start/1 "| bc
        # Give the reset line back: switch to input and unexport the pin.
        echo "in" > /sys/class/gpio/gpio2/direction
        echo "2" >/sys/class/gpio/unexport 2>/dev/null
    fi
}

# Build one wolfBoot configuration and print it as a Markdown table row.
#
# Usage: set_benchmark NAME [MAKE_VAR=VALUE ...]
#   NAME            label printed in the first column
#   MAKE_VAR=VALUE  passed verbatim to every `make` invocation and printed
#                   in the "Configuration" column
#
# Columns after the configuration: size of wolfboot.bin in bytes, contents of
# .stack_usage, contents of .image_header_size, then whatever run_on_board
# prints for the boot-time cell.
function set_benchmark {
    local name=$1
    shift
    # "$*" joins the remaining arguments with single spaces for display;
    # "$@" below keeps them as separate words for make.
    local config="$*"

    # Name
    echo -n "| "
    printf '%s' "$name"
    echo -n " | "
    # Configuration
    printf '%s' "$config" | tr -d '\n'
    echo -n " | "
    make clean &>/dev/null
    make keysclean &>/dev/null
    # Build quietly; on failure run it again so the error output is visible.
    make "$@" factory.bin &>/dev/null || make "$@" factory.bin
    make "$@" stack-usage &>/dev/null
    make "$@" image-header-size &>/dev/null
    # Bootloader size (byte count read directly, not parsed out of `ls -l`)
    wc -c < wolfboot.bin | tr -d ' \n'
    echo -n " | "
    # Stack size
    tr -d '\n' < .stack_usage
    echo -n " | "
    # Image header size
    tr -d '\n' < .image_header_size
    # Boot time
    run_on_board 2>&1 | tr -d '\n'
    echo " |"
}

# Prepare the GPIOs: export GPIO4 (polled by run_on_board) and make sure GPIO2
# is not left exported from a previous run.
echo "4" > /sys/class/gpio/export 2>/dev/null
echo "2" > /sys/class/gpio/unexport 2>/dev/null
make keytools &>/dev/null
# Every benchmark below starts from the STM32H7 example configuration.
cp config/examples/stm32h7.config .config
echo "in" > /sys/class/gpio/gpio4/direction
# Output benchmark results in a Markdown table
echo "| Name | Configuration | Bootloader size | Stack size | Image header size | Boot time |"
echo "|------|---------------|-----------------|------------|-------------------|-----------|"


set_benchmark "SHA2 only" SIGN=NONE
set_benchmark "SHA384 only" SIGN=NONE HASH=SHA384
set_benchmark "SHA3 only" SIGN=NONE HASH=SHA3
set_benchmark "SHA2 only,small" SIGN=NONE NO_ASM=1
set_benchmark "rsa2048" SIGN=RSA2048
set_benchmark "rsa3072" SIGN=RSA3072
set_benchmark "rsa4096" SIGN=RSA4096
set_benchmark "rsa4096 with sha384" SIGN=RSA4096 HASH=SHA384
set_benchmark "ecdsa256" SIGN=ECC256
set_benchmark "ecdsa384" SIGN=ECC384
set_benchmark "ecdsa521" SIGN=ECC521
# The next two rows build ECC384, so they are labelled accordingly.
set_benchmark "ecdsa384 with small stack" SIGN=ECC384 WOLFBOOT_SMALL_STACK=1
# The build option is spelled SPMATH; "SP_MATH=0" would be silently ignored.
set_benchmark "ecdsa384 with fast math" SIGN=ECC384 SPMATH=0
set_benchmark "ecdsa256, no asm" SIGN=ECC256 NO_ASM=1
set_benchmark "ecdsa384, no asm" SIGN=ECC384 NO_ASM=1
set_benchmark "ecdsa521, no asm" SIGN=ECC521 NO_ASM=1
set_benchmark "ecdsa384 with sha384" SIGN=ECC384 HASH=SHA384
set_benchmark "ed25519 with sha384, small" SIGN=ED25519 HASH=SHA384 NO_ASM=1
set_benchmark "ed25519 fast" SIGN=ED25519 NO_ASM=0
set_benchmark "ed448" SIGN=ED448
set_benchmark "ML_DSA-44" SIGN=ML_DSA ML_DSA_LEVEL=2 IMAGE_SIGNATURE_SIZE=2420 IMAGE_HEADER_SIZE=8192
set_benchmark "ML_DSA-65" SIGN=ML_DSA ML_DSA_LEVEL=3 IMAGE_SIGNATURE_SIZE=3309 IMAGE_HEADER_SIZE=8192
set_benchmark "ML_DSA-87" SIGN=ML_DSA ML_DSA_LEVEL=5 IMAGE_SIGNATURE_SIZE=4627 IMAGE_HEADER_SIZE=12288
set_benchmark "LMS 1-10-8" SIGN=LMS LMS_LEVELS=1 LMS_HEIGHT=10 LMS_WINTERNITZ=8 IMAGE_HEADER_SIZE=4096 IMAGE_SIGNATURE_SIZE=1456
set_benchmark "XMSS-SHA2_10_256" XMSS_PARAMS='XMSS-SHA2_10_256' SIGN=XMSS IMAGE_SIGNATURE_SIZE=2500 IMAGE_HEADER_SIZE=8192

Loading
Loading