diff --git a/Marlin/stepper.cpp b/Marlin/stepper.cpp
index 08612b4906707bbf465486fcd289dc87a491afb9..11b51285eeecbee689fa2f370ca364f001b11df9 100644
--- a/Marlin/stepper.cpp
+++ b/Marlin/stepper.cpp
@@ -528,33 +528,69 @@ void Stepper::isr() {
         _APPLY_STEP(AXIS)(_INVERT_STEP_PIN(AXIS),0); \
       }
 
+    /**
+     * Estimate the number of cycles that the stepper logic already takes
+     * up between the start and stop of the X stepper pulse.
+     *
+     * Currently this uses very modest estimates of around 5 cycles.
+     * True values may be derived by careful testing.
+     *
+     * Once any delay is added, the cost of the delay code itself
+     * may be subtracted from this value to get a more accurate delay.
+     * Delays under 20 cycles (1.25µs) will be very accurate, using NOPs.
+     * Longer delays use a loop. The resolution is 8 cycles.
+     */
     #if HAS_X_STEP
-      #define _COUNT_STEPPERS_1 1
+      #define _CYCLE_APPROX_1 5
     #else
-      #define _COUNT_STEPPERS_1 0
+      #define _CYCLE_APPROX_1 0
+    #endif
+    #if ENABLED(X_DUAL_STEPPER_DRIVERS)
+      #define _CYCLE_APPROX_2 _CYCLE_APPROX_1 + 4
+    #else
+      #define _CYCLE_APPROX_2 _CYCLE_APPROX_1
     #endif
     #if HAS_Y_STEP
-      #define _COUNT_STEPPERS_2 _COUNT_STEPPERS_1 + 1
+      #define _CYCLE_APPROX_3 _CYCLE_APPROX_2 + 5
+    #else
+      #define _CYCLE_APPROX_3 _CYCLE_APPROX_2
+    #endif
+    #if ENABLED(Y_DUAL_STEPPER_DRIVERS)
+      #define _CYCLE_APPROX_4 _CYCLE_APPROX_3 + 4
     #else
-      #define _COUNT_STEPPERS_2 _COUNT_STEPPERS_1
+      #define _CYCLE_APPROX_4 _CYCLE_APPROX_3
     #endif
     #if HAS_Z_STEP
-      #define _COUNT_STEPPERS_3 _COUNT_STEPPERS_2 + 1
+      #define _CYCLE_APPROX_5 _CYCLE_APPROX_4 + 5
+    #else
+      #define _CYCLE_APPROX_5 _CYCLE_APPROX_4
+    #endif
+    #if ENABLED(Z_DUAL_STEPPER_DRIVERS)
+      #define _CYCLE_APPROX_6 _CYCLE_APPROX_5 + 4
     #else
-      #define _COUNT_STEPPERS_3 _COUNT_STEPPERS_2
+      #define _CYCLE_APPROX_6 _CYCLE_APPROX_5
     #endif
     #if DISABLED(ADVANCE) && DISABLED(LIN_ADVANCE)
-      #define _COUNT_STEPPERS_4 _COUNT_STEPPERS_3 + 1
+      #if ENABLED(MIXING_EXTRUDER)
+        #define _CYCLE_APPROX_7 _CYCLE_APPROX_6 + (MIXING_STEPPERS) * 6
+      #else
+        #define _CYCLE_APPROX_7 _CYCLE_APPROX_6 + 5
+      #endif
     #else
-      #define _COUNT_STEPPERS_4 _COUNT_STEPPERS_3
+      #define _CYCLE_APPROX_7 _CYCLE_APPROX_6
     #endif
 
-    #define CYCLES_EATEN_XYZE ((_COUNT_STEPPERS_4) * 5)
+    #define CYCLES_EATEN_XYZE _CYCLE_APPROX_7
     #define EXTRA_CYCLES_XYZE (STEP_PULSE_CYCLES - (CYCLES_EATEN_XYZE))
 
-    // If a minimum pulse time was specified get the timer 0 value
-    // which increments every 4µs on 16MHz and every 3.2µs on 20MHz.
-    // Two or 3 counts of TCNT0 should be a sufficient delay.
+    /**
+     * If a minimum pulse time was specified get the timer 0 value.
+     *
+     * TCNT0 has an 8x prescaler, so it increments every 8 cycles. 
+     * That's every 0.5µs on 16MHz and every 0.4µs on 20MHz.
+     * 20 counts of TCNT0 -by itself- is a good pulse delay.
+     * 10µs = 160 or 200 cycles.
+     */
     #if EXTRA_CYCLES_XYZE > 20
       uint32_t pulse_start = TCNT0;
     #endif
@@ -627,7 +663,7 @@ void Stepper::isr() {
       break;
     }
 
-    // For minimum pulse time wait before stopping pulses
+    // For minimum pulse time wait after stopping pulses also
     #if EXTRA_CYCLES_XYZE > 20
       if (i) while (EXTRA_CYCLES_XYZE > (uint32_t)(TCNT0 - pulse_start) * (INT0_PRESCALER)) { /* nada */ }
     #elif EXTRA_CYCLES_XYZE > 0