diff --git a/Marlin/stepper.cpp b/Marlin/stepper.cpp
index 57bef81a85510dc7f61efe2f24fb1c6a53713c87..9e11aca4e6d2c5990c2e9b73910814666347c756 100644
--- a/Marlin/stepper.cpp
+++ b/Marlin/stepper.cpp
@@ -243,59 +243,63 @@ volatile int32_t Stepper::endstops_trigsteps[XYZ];
 
 // intRes = longIn1 * longIn2 >> 24
 // uses:
-// r26 to store 0
-// r27 to store bits 16-23 of the 48bit result. The top bit is used to round the two byte result.
+// [tmp1] to store 0
+// [tmp2] to store bits 16-23 of the 48bit result. The top bit is used to round the two byte result.
 // note that the lower two bytes and the upper byte of the 48bit result are not calculated.
 // this can cause the result to be out by one as the lower bytes may cause carries into the upper ones.
-// B0 A0 are bits 24-39 and are the returned value
-// C1 B1 A1 is longIn1
-// D2 C2 B2 A2 is longIn2
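+// %B[intRes] %A[intRes] are bits 24-39 and are the returned value
+// %C[longIn1] %B[longIn1] %A[longIn1] is longIn1 (its top byte is never read)
+// %D[longIn2] %C[longIn2] %B[longIn2] %A[longIn2] is longIn2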
 //
-#define MultiU24X32toH16(intRes, longIn1, longIn2) \
-  asm volatile ( \
-                 A("clr r26") \
-                 A("mul %A1, %B2") \
-                 A("mov r27, r1") \
-                 A("mul %B1, %C2") \
-                 A("movw %A0, r0") \
-                 A("mul %C1, %C2") \
-                 A("add %B0, r0") \
-                 A("mul %C1, %B2") \
-                 A("add %A0, r0") \
-                 A("adc %B0, r1") \
-                 A("mul %A1, %C2") \
-                 A("add r27, r0") \
-                 A("adc %A0, r1") \
-                 A("adc %B0, r26") \
-                 A("mul %B1, %B2") \
-                 A("add r27, r0") \
-                 A("adc %A0, r1") \
-                 A("adc %B0, r26") \
-                 A("mul %C1, %A2") \
-                 A("add r27, r0") \
-                 A("adc %A0, r1") \
-                 A("adc %B0, r26") \
-                 A("mul %B1, %A2") \
-                 A("add r27, r1") \
-                 A("adc %A0, r26") \
-                 A("adc %B0, r26") \
-                 A("lsr r27") \
-                 A("adc %A0, r26") \
-                 A("adc %B0, r26") \
-                 A("mul %D2, %A1") \
-                 A("add %A0, r0") \
-                 A("adc %B0, r1") \
-                 A("mul %D2, %B1") \
-                 A("add %B0, r0") \
-                 A("clr r1") \
-                 : \
-                 "=&r" (intRes) \
-                 : \
-                 "d" (longIn1), \
-                 "d" (longIn2) \
-                 : \
-                 "r26" , "r27" \
-               )
+static FORCE_INLINE uint16_t MultiU24X32toH16(uint32_t longIn1, uint32_t longIn2) { // Returns bits 24-39 of longIn1 * longIn2; only bytes A..C of longIn1 are read
+  uint8_t tmp1;    // Held at 0 so ADC can fold a pending carry into the next byte
+  uint8_t tmp2;    // Accumulates bits 16-23 of the product (only used for carries/rounding)
+  uint16_t intRes; // Accumulates bits 24-39 of the product; this is the return value
+  __asm__ __volatile__(
+    A("clr %[tmp1]")                    // tmp1 = 0
+    A("mul %A[longIn1], %B[longIn2]")   // A1*B2: r1:r0 = bits 8-23
+    A("mov %[tmp2], r1")                // tmp2 = bits 16-23 (low byte dropped)
+    A("mul %B[longIn1], %C[longIn2]")   // B1*C2: r1:r0 = bits 24-39
+    A("movw %A[intRes], r0")            // intRes = r1:r0
+    A("mul %C[longIn1], %C[longIn2]")   // C1*C2: r0 = bits 32-39 (r1 is above the result, ignored)
+    A("add %B[intRes], r0")             // add into the high result byte
+    A("mul %C[longIn1], %B[longIn2]")   // C1*B2: r1:r0 = bits 24-39
+    A("add %A[intRes], r0")             // add low byte
+    A("adc %B[intRes], r1")             // add high byte + carry
+    A("mul %A[longIn1], %C[longIn2]")   // A1*C2: r1:r0 = bits 16-31
+    A("add %[tmp2], r0")                // bits 16-23
+    A("adc %A[intRes], r1")             // bits 24-31 + carry
+    A("adc %B[intRes], %[tmp1]")        // propagate carry
+    A("mul %B[longIn1], %B[longIn2]")   // B1*B2: r1:r0 = bits 16-31
+    A("add %[tmp2], r0")                // bits 16-23
+    A("adc %A[intRes], r1")             // bits 24-31 + carry
+    A("adc %B[intRes], %[tmp1]")        // propagate carry
+    A("mul %C[longIn1], %A[longIn2]")   // C1*A2: r1:r0 = bits 16-31
+    A("add %[tmp2], r0")                // bits 16-23
+    A("adc %A[intRes], r1")             // bits 24-31 + carry
+    A("adc %B[intRes], %[tmp1]")        // propagate carry
+    A("mul %B[longIn1], %A[longIn2]")   // B1*A2: r1:r0 = bits 8-23
+    A("add %[tmp2], r1")                // only the high byte (bits 16-23) is used
+    A("adc %A[intRes], %[tmp1]")        // propagate carry
+    A("adc %B[intRes], %[tmp1]")        // propagate carry
+    A("lsr %[tmp2]")                    // C = bit 0 of tmp2. NOTE(review): header says the *top* bit rounds, which would need LSL - confirm intent
+    A("adc %A[intRes], %[tmp1]")        // rounding: add that carry to the result
+    A("adc %B[intRes], %[tmp1]")        // propagate carry
+    A("mul %D[longIn2], %A[longIn1]")   // D2*A1: r1:r0 = bits 24-39
+    A("add %A[intRes], r0")             // add low byte
+    A("adc %B[intRes], r1")             // add high byte + carry
+    A("mul %D[longIn2], %B[longIn1]")   // D2*B1: r0 = bits 32-39 (r1 is above the result, ignored)
+    A("add %B[intRes], r0")             // add into the high result byte
+    A("clr r1")                         // MUL overwrote r1; restore the zero register avr-gcc relies on
+      : [intRes] "=&r" (intRes),        // early-clobber outputs: must not share registers with the inputs
+        [tmp1] "=&r" (tmp1),
+        [tmp2] "=&r" (tmp2)
+      : [longIn1] "d" (longIn1),        // "d" = upper registers r16-r31
+        [longIn2] "d" (longIn2)
+      : "cc"                            // status flags are modified
+  );
+  return intRes;
+}
 
 // Some useful constants
 
@@ -1506,10 +1510,7 @@ void Stepper::isr() {
           ? _eval_bezier_curve(acceleration_time)
           : current_block->cruise_rate;
     #else
-      MultiU24X32toH16(acc_step_rate, acceleration_time, current_block->acceleration_rate);
-      acc_step_rate += current_block->initial_rate;
-
-      // upper limit
+      acc_step_rate = MultiU24X32toH16(acceleration_time, current_block->acceleration_rate) + current_block->initial_rate;
       NOMORE(acc_step_rate, current_block->nominal_rate);
     #endif
 
@@ -1540,7 +1541,6 @@ void Stepper::isr() {
     #if ENABLED(BEZIER_JERK_CONTROL)
       // If this is the 1st time we process the 2nd half of the trapezoid...
       if (!bezier_2nd_half) {
-
         // Initialize the Bézier speed curve
         _calc_bezier_curve_coeffs(current_block->cruise_rate, current_block->final_rate, current_block->deceleration_time_inverse);
         bezier_2nd_half = true;
@@ -1553,14 +1553,14 @@ void Stepper::isr() {
     #else
 
       // Using the old trapezoidal control
-      MultiU24X32toH16(step_rate, deceleration_time, current_block->acceleration_rate);
-
+      step_rate = MultiU24X32toH16(deceleration_time, current_block->acceleration_rate);
       if (step_rate < acc_step_rate) { // Still decelerating?
         step_rate = acc_step_rate - step_rate;
         NOLESS(step_rate, current_block->final_rate);
       }
       else
         step_rate = current_block->final_rate;
+
     #endif
 
     // step_rate to timer interval
diff --git a/Marlin/stepper.h b/Marlin/stepper.h
index 7b3dd599de511b5fb774e50e8ade9a7303b2c064..5dec78390546177370c576b67fe39e0cbbff28e0 100644
--- a/Marlin/stepper.h
+++ b/Marlin/stepper.h
@@ -61,26 +61,28 @@ extern Stepper stepper;
 // uses:
 // r26 to store 0
 // r27 to store the byte 1 of the 24 bit result
-#define MultiU16X8toH16(intRes, charIn1, intIn2) \
-  asm volatile ( \
-                 A("clr r26") \
-                 A("mul %A1, %B2") \
-                 A("movw %A0, r0") \
-                 A("mul %A1, %A2") \
-                 A("add %A0, r1") \
-                 A("adc %B0, r26") \
-                 A("lsr r0") \
-                 A("adc %A0, r26") \
-                 A("adc %B0, r26") \
-                 A("clr r1") \
-                 : \
-                 "=&r" (intRes) \
-                 : \
-                 "d" (charIn1), \
-                 "d" (intIn2) \
-                 : \
-                 "r26" \
-               )
+static FORCE_INLINE uint16_t MultiU16X8toH16(uint8_t charIn1, uint16_t intIn2) { // Returns bits 8-23 of the 24-bit product charIn1 * intIn2, plus a rounding carry
+  uint8_t tmp;     // Held at 0 so ADC can fold a pending carry into the next byte
+  uint16_t intRes; // Accumulates bits 8-23 of the product; this is the return value
+  __asm__ __volatile__ (
+    A("clr %[tmp]")                     // tmp = 0
+    A("mul %[charIn1], %B[intIn2]")     // charIn1 * high byte: r1:r0 = bits 8-23
+    A("movw %A[intRes], r0")            // intRes = r1:r0
+    A("mul %[charIn1], %A[intIn2]")     // charIn1 * low byte: r1:r0 = bits 0-15
+    A("add %A[intRes], r1")             // add bits 8-15 into the low result byte
+    A("adc %B[intRes], %[tmp]")         // propagate carry
+    A("lsr r0")                         // C = bit 0 of the discarded low byte. NOTE(review): rounding on the top bit would need LSL - confirm intent
+    A("adc %A[intRes], %[tmp]")         // rounding: add that carry to the result
+    A("adc %B[intRes], %[tmp]")         // propagate carry
+    A("clr r1")                         // MUL overwrote r1; restore the zero register avr-gcc relies on
+      : [intRes] "=&r" (intRes),        // early-clobber outputs: must not share registers with the inputs
+        [tmp] "=&r" (tmp)
+      : [charIn1] "d" (charIn1),        // "d" = upper registers r16-r31
+        [intIn2] "d" (intIn2)
+      : "cc"                            // status flags are modified
+  );
+  return intRes;
+}
 
 class Stepper {
 
@@ -346,17 +348,15 @@ class Stepper {
       NOLESS(step_rate, F_CPU / 500000);
       step_rate -= F_CPU / 500000; // Correct for minimal speed
       if (step_rate >= (8 * 256)) { // higher step rate
-        unsigned short table_address = (unsigned short)&speed_lookuptable_fast[(unsigned char)(step_rate >> 8)][0];
-        unsigned char tmp_step_rate = (step_rate & 0x00FF);
-        unsigned short gain = (unsigned short)pgm_read_word_near(table_address + 2);
-        MultiU16X8toH16(timer, tmp_step_rate, gain);
-        timer = (unsigned short)pgm_read_word_near(table_address) - timer;
+        uint16_t table_address = (uint16_t)&speed_lookuptable_fast[(uint8_t)(step_rate >> 8)][0],
+                 gain = (uint16_t)pgm_read_word_near(table_address + 2);
+        timer = (uint16_t)pgm_read_word_near(table_address) - MultiU16X8toH16(step_rate & 0x00FF, gain);
       }
       else { // lower step rates
-        unsigned short table_address = (unsigned short)&speed_lookuptable_slow[0][0];
+        uint16_t table_address = (uint16_t)&speed_lookuptable_slow[0][0];
         table_address += ((step_rate) >> 1) & 0xFFFC;
-        timer = (unsigned short)pgm_read_word_near(table_address);
-        timer -= (((unsigned short)pgm_read_word_near(table_address + 2) * (unsigned char)(step_rate & 0x0007)) >> 3);
+        timer = (uint16_t)pgm_read_word_near(table_address)
+              - (((uint16_t)pgm_read_word_near(table_address + 2) * (uint8_t)(step_rate & 0x0007)) >> 3);
       }
       if (timer < 100) { // (20kHz - this should never happen)
         timer = 100;