diff --git a/Marlin/stepper.cpp b/Marlin/stepper.cpp
index a44ddbdab81c32d7c9daf601a7cd0498a53772d0..c8bcbbbf2b547f0eaf4002f7234b9d2bcea86304 100644
--- a/Marlin/stepper.cpp
+++ b/Marlin/stepper.cpp
@@ -54,7 +54,7 @@ static unsigned int cleaning_buffer_counter;
               locked_z2_motor = false;
 #endif
 
-// Counter variables for the bresenham line tracer
+// Counter variables for the Bresenham line tracer
 static long counter_x, counter_y, counter_z, counter_e;
 volatile static unsigned long step_events_completed; // The number of step events executed in the current block
 
@@ -66,7 +66,7 @@ volatile static unsigned long step_events_completed; // The number of step event
 
 static long acceleration_time, deceleration_time;
 //static unsigned long accelerate_until, decelerate_after, acceleration_rate, initial_rate, final_rate, nominal_rate;
-static unsigned short acc_step_rate; // needed for deccelaration start point
+static unsigned short acc_step_rate; // needed for deceleration start point
 static char step_loops;
 static unsigned short OCR1A_nominal;
 static unsigned short step_loops_nominal;
@@ -205,8 +205,14 @@ volatile signed char count_direction[NUM_AXIS] = { 1, 1, 1, 1 };
 // intRes = longIn1 * longIn2 >> 24
 // uses:
 // r26 to store 0
-// r27 to store the byte 1 of the 48bit result
-#define MultiU24X24toH16(intRes, longIn1, longIn2) \
+// r27 to store bits 16-23 of the 48bit result. The top bit is used to round the two byte result.
+// note that the lower two bytes and the upper byte of the 48bit result are not calculated.
+// this can cause the result to be out by one as the lower bytes may cause carries into the upper ones.
+// B0 A0 are bits 24-39 and are the returned value
+// C1 B1 A1 is longIn1
+// D2 C2 B2 A2 is longIn2
+//
+#define MultiU24X32toH16(intRes, longIn1, longIn2) \
   asm volatile ( \
     "clr r26 \n\t" \
     "mul %A1, %B2 \n\t" \
@@ -237,6 +243,11 @@ volatile signed char count_direction[NUM_AXIS] = { 1, 1, 1, 1 };
     "lsr r27 \n\t" \
     "adc %A0, r26 \n\t" \
     "adc %B0, r26 \n\t" \
+    "mul %D2, %A1 \n\t" \
+    "add %A0, r0 \n\t" \
+    "adc %B0, r1 \n\t" \
+    "mul %D2, %B1 \n\t" \
+    "add %B0, r0 \n\t" \
     "clr r1 \n\t" \
     : \
     "=&r" (intRes) \
@@ -313,7 +324,7 @@ void enable_endstops(bool check) { check_endstops = check; }
 //  The trapezoid is the shape the speed curve over time. It starts at block->initial_rate, accelerates
 //  first block->accelerate_until step_events_completed, then keeps going at constant speed until
 //  step_events_completed reaches block->decelerate_after after which it decelerates until the trapezoid generator is reset.
-//  The slope of acceleration is calculated with the leib ramp alghorithm.
+//  The slope of acceleration is calculated using v = u + at where t is the accumulated timer values of the steps so far.
 
 void st_wake_up() {
   //  TCNT1 = 0;
@@ -469,7 +480,7 @@ ISR(TIMER1_COMPA_vect) {
         if ((current_block->steps[A_AXIS] != current_block->steps[B_AXIS]) || (TEST(out_bits, A_AXIS) == TEST(out_bits, B_AXIS))) {
           if (TEST(out_bits, X_HEAD))
       #else
-          if (TEST(out_bits, X_AXIS))   // stepping along -X axis (regular cartesians bot)
+          if (TEST(out_bits, X_AXIS))   // stepping along -X axis (regular Cartesian bot)
       #endif
           { // -direction
             #ifdef DUAL_X_CARRIAGE
@@ -714,7 +725,7 @@ ISR(TIMER1_COMPA_vect) {
     unsigned short step_rate;
     if (step_events_completed <= (unsigned long)current_block->accelerate_until) {
 
-      MultiU24X24toH16(acc_step_rate, acceleration_time, current_block->acceleration_rate);
+      MultiU24X32toH16(acc_step_rate, acceleration_time, current_block->acceleration_rate);
       acc_step_rate += current_block->initial_rate;
 
       // upper limit
@@ -737,7 +748,7 @@ ISR(TIMER1_COMPA_vect) {
       #endif
     }
     else if (step_events_completed > (unsigned long)current_block->decelerate_after) {
-      MultiU24X24toH16(step_rate, deceleration_time, current_block->acceleration_rate);
+      MultiU24X32toH16(step_rate, deceleration_time, current_block->acceleration_rate);
 
       if (step_rate > acc_step_rate) { // Check step_rate stays positive
         step_rate = current_block->final_rate;