diff --git a/Marlin/src/HAL/Delay.h b/Marlin/src/HAL/Delay.h
new file mode 100644
index 0000000000000000000000000000000000000000..75619b23ef97d6e7d3f5f047eb26e6b353dcfefe
--- /dev/null
+++ b/Marlin/src/HAL/Delay.h
@@ -0,0 +1,133 @@
+/**
+ * Marlin 3D Printer Firmware
+ * Copyright (C) 2016 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
+ *
+ * Based on Sprinter and grbl.
+ * Copyright (C) 2011 Camiel Gubbels / Erik van der Zalm
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+
+  Busy wait delay Cycles routines:
+
+  DELAY_CYCLES(count): Delay execution in cycles
+  DELAY_NS(count): Delay execution in nanoseconds
+  DELAY_US(count): Delay execution in microseconds
+
+  */
+
+#ifndef MARLIN_DELAY_H
+#define MARLIN_DELAY_H
+
+#if defined(__arm__) || defined(__thumb__)
+
+  /* https://blueprints.launchpad.net/gcc-arm-embedded/+spec/delay-cycles */
+
+  #define nop() __asm__ __volatile__("nop;\n\t":::)
+
+  FORCE_INLINE static void __delay_4cycles(uint32_t cy) { // +1 cycle
+    #if ARCH_PIPELINE_RELOAD_CYCLES < 2
+      #define EXTRA_NOP_CYCLES A("nop")
+    #else
+      #define EXTRA_NOP_CYCLES ""
+    #endif
+
+    __asm__ __volatile__(
+      A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax
+      L("1")
+      A("subs %[cnt],#1")
+      EXTRA_NOP_CYCLES
+      A("bne 1b")
+      : [cnt]"+r"(cy)   // output: +r means input+output
+      :                 // input:
+      : "cc"            // clobbers:
+    );
+  }
+
+  /* ---------------- Delay in cycles */
+  FORCE_INLINE static void DELAY_CYCLES(uint32_t x) {
+
+    if (__builtin_constant_p(x)) {
+      #define MAXNOPS 4
+
+      if (x <= (MAXNOPS)) {
+        switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); }
+      }
+      else { // because of +1 cycle inside delay_4cycles
+        const uint32_t rem = (x - 1) % (MAXNOPS);
+        switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); }
+        if ((x = (x - 1) / (MAXNOPS)))
+          __delay_4cycles(x); // if need more then 4 nop loop is more optimal
+      }
+      #undef MAXNOPS
+    }
+    else
+      __delay_4cycles(x / 4);
+  }
+  #undef nop
+
+#elif defined(__AVR__)
+
+  #define nop() __asm__ __volatile__("nop;\n\t":::)
+
+  FORCE_INLINE static void __delay_4cycles(uint8_t cy) {
+    __asm__ __volatile__(
+      L("1")
+      A("dec %[cnt]")
+      A("nop")
+      A("brne 1b")
+      : [cnt] "+r"(cy)  // output: +r means input+output
+      :                 // input:
+      : "cc"            // clobbers:
+    );
+  }
+
+  /* ---------------- Delay in cycles */
+  FORCE_INLINE static void DELAY_CYCLES(uint16_t x) {
+
+    if (__builtin_constant_p(x)) {
+      #define MAXNOPS 4
+
+      if (x <= (MAXNOPS)) {
+        switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); }
+      }
+      else {
+        const uint32_t rem = (x) % (MAXNOPS);
+        switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); }
+        if ((x = (x) / (MAXNOPS)))
+          __delay_4cycles(x); // if need more then 4 nop loop is more optimal
+      }
+
+      #undef MAXNOPS
+    }
+    else
+      __delay_4cycles(x / 4);
+  }
+  #undef nop
+
+#else
+  #error "Unsupported MCU architecture"
+#endif
+
+/* ---------------- Delay in nanoseconds */
+#define DELAY_NS(x) DELAY_CYCLES( (x) * (F_CPU/1000000L) / 1000L )
+
+/* ---------------- Delay in microseconds */
+#define DELAY_US(x) DELAY_CYCLES( (x) * (F_CPU/1000000L) )
+
+#endif // MARLIN_DELAY_H
+
diff --git a/Marlin/src/HAL/HAL_DUE/HAL_spi_Due.cpp b/Marlin/src/HAL/HAL_DUE/HAL_spi_Due.cpp
index 360d04bfff62e8d5cf291ee4d7262e87564d01b9..52ac1b9877130e05376005529349777e2317728f 100644
--- a/Marlin/src/HAL/HAL_DUE/HAL_spi_Due.cpp
+++ b/Marlin/src/HAL/HAL_DUE/HAL_spi_Due.cpp
@@ -42,6 +42,7 @@
 // --------------------------------------------------------------------------
 
 #include "../../inc/MarlinConfig.h"
+#include "../Delay.h"
 
 // --------------------------------------------------------------------------
 // Public Variables
@@ -58,66 +59,16 @@
   // software SPI
   // --------------------------------------------------------------------------
 
-  // set optimization so ARDUINO optimizes this file
+  // Make sure GCC optimizes this file.
+  // Note that this line triggers a bug in GCC which is fixed by casting.
+  // See the note below.
   #pragma GCC optimize (3)
 
-  /* ---------------- Delay Cycles routine -------------- */
-
-  /* https://blueprints.launchpad.net/gcc-arm-embedded/+spec/delay-cycles */
-
-  #define nop() __asm__ __volatile__("nop;\n\t":::)
-
-  FORCE_INLINE static void __delay_4cycles(uint32_t cy) { // +1 cycle
-    #if ARCH_PIPELINE_RELOAD_CYCLES<2
-      #define EXTRA_NOP_CYCLES "nop"
-    #else
-      #define EXTRA_NOP_CYCLES ""
-    #endif
-
-    __asm__ __volatile__(
-      ".syntax unified" "\n\t" // is to prevent CM0,CM1 non-unified syntax
-
-      L("loop%=")
-      A("subs %[cnt],#1")
-      A(EXTRA_NOP_CYCLES)
-      A("bne loop%=")
-      : [cnt]"+r"(cy) // output: +r means input+output
-      : // input:
-      : "cc" // clobbers:
-    );
-  }
-
-  FORCE_INLINE static void DELAY_CYCLES(uint32_t x) {
-
-    if (__builtin_constant_p(x)) {
-
-      #define MAXNOPS 4
-
-      if (x <= (MAXNOPS)) {
-        switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); }
-      }
-      else { // because of +1 cycle inside delay_4cycles
-        const uint32_t rem = (x - 1) % (MAXNOPS);
-        switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); }
-        if ((x = (x - 1) / (MAXNOPS)))
-          __delay_4cycles(x); // if need more then 4 nop loop is more optimal
-      }
-    }
-    else
-      __delay_4cycles(x / 4);
-  }
-
-  /* ---------------- Delay in nanoseconds and in microseconds */
-
-  #define DELAY_NS(x) DELAY_CYCLES( (x) * (F_CPU/1000000) / 1000)
-
-  typedef uint8_t (*pfnSpiTransfer) (uint8_t b);
+  typedef uint8_t (*pfnSpiTransfer)(uint8_t b);
   typedef void    (*pfnSpiRxBlock)(uint8_t* buf, uint32_t nbyte);
   typedef void    (*pfnSpiTxBlock)(const uint8_t* buf, uint32_t nbyte);
 
-
   /* ---------------- Macros to be able to access definitions from asm */
-
   #define _PORT(IO) DIO ##  IO ## _WPORT
   #define _PIN_MASK(IO) MASK(DIO ## IO ## _PIN)
   #define _PIN_SHIFT(IO) DIO ## IO ## _PIN
@@ -202,10 +153,10 @@
     return 0;
   }
 
- // Calculates the bit band alias address and returns a pointer address to word.
- // addr: The byte address of bitbanding bit.
- // bit:  The bit position of bitbanding bit.
-#define BITBAND_ADDRESS(addr, bit) \
+   // Calculates the bit band alias address and returns a pointer address to word.
+   // addr: The byte address of bitbanding bit.
+   // bit:  The bit position of bitbanding bit.
+  #define BITBAND_ADDRESS(addr, bit) \
     (((uint32_t)(addr) & 0xF0000000) + 0x02000000 + ((uint32_t)(addr)&0xFFFFF)*32 + (bit)*4)
 
   // run at ~8 .. ~10Mhz - Rx version (Tx line not altered)
@@ -319,8 +270,14 @@
   }
 
   // Pointers to generic functions for byte transfers
-  static pfnSpiTransfer spiTransferTx = spiTransferX;
-  static pfnSpiTransfer spiTransferRx = spiTransferX;
+
+  /**
+   * Note: The cast is unnecessary, but without it, this file triggers a GCC 4.8.3-2014 bug.
+   * Later GCC versions do not have this problem, but at this time (May 2018) Arduino still
+   * uses that buggy and obsolete GCC version!!
+   */
+  static pfnSpiTransfer spiTransferRx = (pfnSpiTransfer)spiTransferX;
+  static pfnSpiTransfer spiTransferTx = (pfnSpiTransfer)spiTransferX;
 
   // Block transfers run at ~8 .. ~10Mhz - Tx version (Rx data discarded)
   static void spiTxBlock0(const uint8_t* ptr, uint32_t todo) {
@@ -384,7 +341,7 @@
       A("str %[sck_mask],[%[sck_port],#0x4]")              /* CODR */
 
       /* Bit 0 */
-      A("str %[mosi_mask],[%[mosi_port], %[work],LSL #2]")  /* Access the proper SODR or CODR registers based on that bit */
+      A("str %[mosi_mask],[%[mosi_port], %[work],LSL #2]") /* Access the proper SODR or CODR registers based on that bit */
       A("str %[sck_mask],[%[sck_port]]")                   /* SODR */
       A("subs %[todo],#1")                                 /* Decrement count of pending words to send, update status */
       A("str %[sck_mask],[%[sck_port],#0x4]")              /* CODR */
@@ -491,8 +448,8 @@
   }
 
   // Pointers to generic functions for block tranfers
-  static pfnSpiTxBlock spiTxBlock = spiTxBlockX;
-  static pfnSpiRxBlock spiRxBlock = spiRxBlockX;
+  static pfnSpiTxBlock spiTxBlock = (pfnSpiTxBlock)spiTxBlockX;
+  static pfnSpiRxBlock spiRxBlock = (pfnSpiRxBlock)spiRxBlockX;
 
   #if MB(ALLIGATOR)  // control SDSS pin
     void spiBegin() {
@@ -580,23 +537,23 @@
   void spiInit(uint8_t spiRate) {
     switch (spiRate) {
       case 0:
-        spiTransferTx = spiTransferTx0;
-        spiTransferRx = spiTransferRx0;
-        spiTxBlock = spiTxBlock0;
-        spiRxBlock = spiRxBlock0;
+        spiTransferTx = (pfnSpiTransfer)spiTransferTx0;
+        spiTransferRx = (pfnSpiTransfer)spiTransferRx0;
+        spiTxBlock = (pfnSpiTxBlock)spiTxBlock0;
+        spiRxBlock = (pfnSpiRxBlock)spiRxBlock0;
         break;
       case 1:
-        spiTransferTx = spiTransfer1;
-        spiTransferRx = spiTransfer1;
-        spiTxBlock = spiTxBlockX;
-        spiRxBlock = spiRxBlockX;
+        spiTransferTx = (pfnSpiTransfer)spiTransfer1;
+        spiTransferRx = (pfnSpiTransfer)spiTransfer1;
+        spiTxBlock = (pfnSpiTxBlock)spiTxBlockX;
+        spiRxBlock = (pfnSpiRxBlock)spiRxBlockX;
         break;
       default:
         spiDelayCyclesX4 = (F_CPU/1000000) >> (6 - spiRate);
-        spiTransferTx = spiTransferX;
-        spiTransferRx = spiTransferX;
-        spiTxBlock = spiTxBlockX;
-        spiRxBlock = spiRxBlockX;
+        spiTransferTx = (pfnSpiTransfer)spiTransferX;
+        spiTransferRx = (pfnSpiTransfer)spiTransferX;
+        spiTxBlock = (pfnSpiTxBlock)spiTxBlockX;
+        spiRxBlock = (pfnSpiRxBlock)spiRxBlockX;
         break;
     }
 
@@ -614,7 +571,7 @@
 
   #pragma GCC reset_options
 
-#else
+#else // !SOFTWARE_SPI
 
   #if MB(ALLIGATOR)
 
@@ -714,7 +671,7 @@
       while ((SPI0->SPI_SR & SPI_SR_RDRF) == 0);
       // clear status
       SPI0->SPI_RDR;
-      //delayMicroseconds(1U);
+      //DELAY_US(1U);
     }
 
     void spiSend(const uint8_t* buf, size_t n) {
@@ -724,7 +681,7 @@
         while ((SPI0->SPI_SR & SPI_SR_TDRE) == 0);
         while ((SPI0->SPI_SR & SPI_SR_RDRF) == 0);
         SPI0->SPI_RDR;
-        //delayMicroseconds(1U);
+        //DELAY_US(1U);
       }
       spiSend(buf[n - 1]);
     }
@@ -767,7 +724,7 @@
       // wait for receive register
       while ((SPI0->SPI_SR & SPI_SR_RDRF) == 0);
       // get byte from receive register
-      //delayMicroseconds(1U);
+      //DELAY_US(1U);
       return SPI0->SPI_RDR;
     }
 
@@ -797,7 +754,7 @@
         SPI0->SPI_TDR = 0x000000FF | SPI_PCS(SPI_CHAN);
         while ((SPI0->SPI_SR & SPI_SR_RDRF) == 0);
         buf[i] = SPI0->SPI_RDR;
-        //delayMicroseconds(1U);
+        //DELAY_US(1U);
       }
       buf[nbyte] = spiRec();
     }
@@ -813,7 +770,7 @@
         while ((SPI0->SPI_SR & SPI_SR_TDRE) == 0);
         while ((SPI0->SPI_SR & SPI_SR_RDRF) == 0);
         SPI0->SPI_RDR;
-        //delayMicroseconds(1U);
+        //DELAY_US(1U);
       }
       spiSend(buf[511]);
     }
@@ -902,7 +859,7 @@
         spiTransfer(buf[i]);
     }
 
-  #endif  //MB(ALLIGATOR)
-#endif // ENABLED(SOFTWARE_SPI)
+  #endif // !ALLIGATOR
+#endif // !SOFTWARE_SPI
 
 #endif // ARDUINO_ARCH_SAM
diff --git a/Marlin/src/HAL/HAL_LPC1768/HAL.cpp b/Marlin/src/HAL/HAL_LPC1768/HAL.cpp
index 3afb253d6d0f782d3af642a206f5294ed778666e..b579aa620a35cc700b407b7e9658261eeeba3896 100644
--- a/Marlin/src/HAL/HAL_LPC1768/HAL.cpp
+++ b/Marlin/src/HAL/HAL_LPC1768/HAL.cpp
@@ -21,12 +21,13 @@
 #ifdef TARGET_LPC1768
 
 #include "../../inc/MarlinConfig.h"
+#include "../Delay.h"
 
 HalSerial usb_serial;
 
 // U8glib required functions
 extern "C" void u8g_xMicroDelay(uint16_t val) {
-  delayMicroseconds(val);
+  DELAY_US(val);
 }
 extern "C" void u8g_MicroDelay(void) {
   u8g_xMicroDelay(1);
diff --git a/Marlin/src/HAL/HAL_LPC1768/HAL.h b/Marlin/src/HAL/HAL_LPC1768/HAL.h
index 6aeafedf227484a7b5790e4449c72afff4ded17a..0944241df9ad335662a0766cdab9d4d11dd9b395 100644
--- a/Marlin/src/HAL/HAL_LPC1768/HAL.h
+++ b/Marlin/src/HAL/HAL_LPC1768/HAL.h
@@ -68,9 +68,9 @@ extern "C" volatile uint32_t _millis;
 #include "HAL_timers.h"
 #include "HardwareSerial.h"
 
-#define ST7920_DELAY_1 DELAY_20_NOP;DELAY_20_NOP;DELAY_20_NOP
-#define ST7920_DELAY_2 DELAY_20_NOP;DELAY_20_NOP;DELAY_20_NOP;DELAY_10_NOP;DELAY_5_NOP
-#define ST7920_DELAY_3 DELAY_20_NOP;DELAY_20_NOP;DELAY_20_NOP;DELAY_10_NOP;DELAY_5_NOP
+#define ST7920_DELAY_1 DELAY_NS(600)
+#define ST7920_DELAY_2 DELAY_NS(750)
+#define ST7920_DELAY_3 DELAY_NS(750)
 
 extern HalSerial usb_serial;
 
diff --git a/Marlin/src/HAL/HAL_LPC1768/SoftwareSerial.cpp b/Marlin/src/HAL/HAL_LPC1768/SoftwareSerial.cpp
index be4a6f7bf51fe4e93d72faf928bad8796dd2c3b5..9604d045ba1412d30a1d5b0923589ab49eb773b7 100644
--- a/Marlin/src/HAL/HAL_LPC1768/SoftwareSerial.cpp
+++ b/Marlin/src/HAL/HAL_LPC1768/SoftwareSerial.cpp
@@ -37,6 +37,7 @@
 //
 //#include <WInterrupts.h>
 #include "../../inc/MarlinConfig.h"
+#include "../Delay.h"
 #include <stdint.h>
 #include <stdarg.h>
 #include <Arduino.h>
@@ -78,28 +79,9 @@ static const DELAY_TABLE table[] = {
 // Private methods
 //
 
-#if 0
-/* static */
-inline void SoftwareSerial::tunedDelay(const uint32_t count) {
-
-  asm volatile(
-
-    "mov r3, %[loopsPerMicrosecond] \n\t" //load the initial loop counter
-    "1: \n\t"
-    "sub r3, r3, #1 \n\t"
-    "bne 1b \n\t"
-
-    ://empty output list
-    :[loopsPerMicrosecond] "r" (count)
-    :"r3", "cc" //clobber list
-  );
-
-}
-#else
 inline void SoftwareSerial::tunedDelay(const uint32_t count) {
-  delayMicroseconds(count);
+  DELAY_US(count);
 }
-#endif
 
 // This function sets the current object as the "listening"
 // one and returns true if it replaces another
diff --git a/Marlin/src/HAL/HAL_LPC1768/arduino.cpp b/Marlin/src/HAL/HAL_LPC1768/arduino.cpp
index 8fc7657790d5170edd12c547e41b4914edd82518..abdc6b353903c03f239d2bfe394316516542c244 100644
--- a/Marlin/src/HAL/HAL_LPC1768/arduino.cpp
+++ b/Marlin/src/HAL/HAL_LPC1768/arduino.cpp
@@ -26,6 +26,7 @@
 #include <lpc17xx_pinsel.h>
 
 #include "../../inc/MarlinConfig.h"
+#include "../Delay.h"
 
 // Interrupts
 void cli(void) { __disable_irq(); } // Disable
@@ -40,26 +41,9 @@ uint32_t millis() {
   return _millis;
 }
 
+// This is required for some Arduino libraries we are using
 void delayMicroseconds(uint32_t us) {
-  static const int nop_factor = (SystemCoreClock / 11000000);
-  static volatile int loops = 0;
-
-  //previous ops already burned most of 1us, burn the rest
-  loops = nop_factor / 4; //measured at 1us
-  while (loops > 0) --loops;
-
-  if (us < 2) return;
-  us--;
-
-  //redirect to delay for large values, then set new delay to remainder
-  if (us > 1000) {
-    delay(us / 1000);
-    us = us % 1000;
-  }
-
-  // burn cycles, time in interrupts will not be taken into account
-  loops = us * nop_factor;
-  while (loops > 0) --loops;
+  DELAY_US(us);
 }
 
 extern "C" void delay(const int msec) {
diff --git a/Marlin/src/HAL/HAL_LPC1768/u8g_com_HAL_LPC1768_ssd_sw_i2c.cpp  under construction b/Marlin/src/HAL/HAL_LPC1768/u8g_com_HAL_LPC1768_ssd_sw_i2c.cpp  under construction
index 2edc59c311068378ae375fecbb18ff34cb129ee8..d93ea2a1ff6b90fdac219c82ea8e60289ea4ecaf 100644
--- a/Marlin/src/HAL/HAL_LPC1768/u8g_com_HAL_LPC1768_ssd_sw_i2c.cpp  under construction	
+++ b/Marlin/src/HAL/HAL_LPC1768/u8g_com_HAL_LPC1768_ssd_sw_i2c.cpp  under construction	
@@ -63,7 +63,6 @@
 
   #include <U8glib.h>
 
-void delayMicroseconds(uint32_t us);
 //void pinMode(int16_t pin, uint8_t mode);
 //void digitalWrite(int16_t pin, uint8_t pin_status);
 
@@ -122,13 +121,13 @@ uint8_t u8g_i2c_start_sw(uint8_t sla) {  // assert start condition and then send
 
   LPC_GPIO(SDA_port_HAL_LPC1768_sw_I2C)->FIOCLR = LPC_PIN(SDA_pin_HAL_LPC1768_sw_I2C);
   LPC_GPIO(SCL_port_HAL_LPC1768_sw_I2C)->FIOCLR = LPC_PIN(SCL_pin_HAL_LPC1768_sw_I2C);
-  delayMicroseconds(2);
+  DELAY_US(2);
   LPC_GPIO(SCL_port_HAL_LPC1768_sw_I2C)->FIOSET = LPC_PIN(SCL_pin_HAL_LPC1768_sw_I2C);
-  delayMicroseconds(2);
+  DELAY_US(2);
   LPC_GPIO(SDA_port_HAL_LPC1768_sw_I2C)->FIOSET = LPC_PIN(SDA_pin_HAL_LPC1768_sw_I2C);
-  delayMicroseconds(2);
+  DELAY_US(2);
   LPC_GPIO(SDA_port_HAL_LPC1768_sw_I2C)->FIOCLR = LPC_PIN(SDA_pin_HAL_LPC1768_sw_I2C);
-  delayMicroseconds(2);
+  DELAY_US(2);
   LPC_GPIO(SCL_port_HAL_LPC1768_sw_I2C)->FIOCLR = LPC_PIN(SCL_pin_HAL_LPC1768_sw_I2C);
 
   u8g_i2c_send_byte_sw(I2C_SLA);  // send slave address with write bit
diff --git a/Marlin/src/core/macros.h b/Marlin/src/core/macros.h
index 34896c4f56544845dc4ad88aadc885a5995da9c1..4ba013bc1dbe3fffda513f3017c0dd2a30e7f5a6 100644
--- a/Marlin/src/core/macros.h
+++ b/Marlin/src/core/macros.h
@@ -50,58 +50,8 @@
   #define CYCLES_PER_MICROSECOND (F_CPU / 1000000L) // 16 or 20 on AVR
 #endif
 
-// Processor-level delays for hardware interfaces
-#ifndef _NOP
-  #define _NOP() do { __asm__ volatile ("nop"); } while (0)
-#endif
-#define DELAY_NOPS(X) \
-  switch (X) { \
-    case 20: _NOP(); case 19: _NOP(); case 18: _NOP(); case 17: _NOP(); \
-    case 16: _NOP(); case 15: _NOP(); case 14: _NOP(); case 13: _NOP(); \
-    case 12: _NOP(); case 11: _NOP(); case 10: _NOP(); case  9: _NOP(); \
-    case  8: _NOP(); case  7: _NOP(); case  6: _NOP(); case  5: _NOP(); \
-    case  4: _NOP(); case  3: _NOP(); case  2: _NOP(); case  1: _NOP(); \
-  }
-#define DELAY_0_NOP   NOOP
-#define DELAY_1_NOP   DELAY_NOPS( 1)
-#define DELAY_2_NOP   DELAY_NOPS( 2)
-#define DELAY_3_NOP   DELAY_NOPS( 3)
-#define DELAY_4_NOP   DELAY_NOPS( 4)
-#define DELAY_5_NOP   DELAY_NOPS( 5)
-#define DELAY_10_NOP  DELAY_NOPS(10)
-#define DELAY_20_NOP  DELAY_NOPS(20)
-
-#if CYCLES_PER_MICROSECOND <= 200
-  #define DELAY_100NS DELAY_NOPS((CYCLES_PER_MICROSECOND + 9) / 10)
-#else
-  #define DELAY_100NS DELAY_20_NOP
-#endif
-
-// Microsecond delays for hardware interfaces
-#if CYCLES_PER_MICROSECOND <= 20
-  #define DELAY_1US DELAY_NOPS(CYCLES_PER_MICROSECOND)
-  #define DELAY_US(X) \
-    switch (X) { \
-      case 20: DELAY_1US; case 19: DELAY_1US; case 18: DELAY_1US; case 17: DELAY_1US; \
-      case 16: DELAY_1US; case 15: DELAY_1US; case 14: DELAY_1US; case 13: DELAY_1US; \
-      case 12: DELAY_1US; case 11: DELAY_1US; case 10: DELAY_1US; case  9: DELAY_1US; \
-      case  8: DELAY_1US; case  7: DELAY_1US; case  6: DELAY_1US; case  5: DELAY_1US; \
-      case  4: DELAY_1US; case  3: DELAY_1US; case  2: DELAY_1US; case  1: DELAY_1US; \
-    }
-#else
-  #define DELAY_US(X) delayMicroseconds(X) // May not be usable in CRITICAL_SECTION
-  #define DELAY_1US DELAY_US(1)
-#endif
-#define DELAY_2US  DELAY_US( 2)
-#define DELAY_3US  DELAY_US( 3)
-#define DELAY_4US  DELAY_US( 4)
-#define DELAY_5US  DELAY_US( 5)
-#define DELAY_6US  DELAY_US( 6)
-#define DELAY_7US  DELAY_US( 7)
-#define DELAY_8US  DELAY_US( 8)
-#define DELAY_9US  DELAY_US( 9)
-#define DELAY_10US DELAY_US(10)
-#define DELAY_20US DELAY_US(20)
+// Nanoseconds per cycle
+#define NANOSECONDS_PER_CYCLE (1000000000.0 / F_CPU)
 
 // Remove compiler warning on an unused variable
 #define UNUSED(x) (void) (x)
diff --git a/Marlin/src/feature/Max7219_Debug_LEDs.cpp b/Marlin/src/feature/Max7219_Debug_LEDs.cpp
index 74cbb20f21153f4b0d25b056463f51995645a642..6946490ed46eb5840b065d96536824638a3a5416 100644
--- a/Marlin/src/feature/Max7219_Debug_LEDs.cpp
+++ b/Marlin/src/feature/Max7219_Debug_LEDs.cpp
@@ -60,19 +60,16 @@
 #include "../module/planner.h"
 #include "../module/stepper.h"
 #include "../Marlin.h"
+#include "../HAL/Delay.h"
 
 static uint8_t LEDs[8] = { 0 };
 
 #ifdef CPU_32_BIT
   // Approximate a 1µs delay on 32-bit ARM
-  void SIG_DELAY() {
-    int16_t delay_cycles = CYCLES_PER_MICROSECOND - 10;
-    while (delay_cycles >= 10) { DELAY_NOPS(6); delay_cycles -= 10; }
-    if (delay_cycles > 0) DELAY_NOPS(delay_cycles);
-  }
+  #define SIG_DELAY() DELAY_US(1)
 #else
   // Delay for 0.1875µs (16MHz AVR) or 0.15µs (20MHz AVR)
-  #define SIG_DELAY() DELAY_3_NOP
+  #define SIG_DELAY() DELAY_NS(188)
 #endif
 
 void Max7219_PutByte(uint8_t data) {
diff --git a/Marlin/src/feature/dac/dac_dac084s085.cpp b/Marlin/src/feature/dac/dac_dac084s085.cpp
index e1aeb7e4386d53081d44249a710ab8a3cb87a27a..934b3165bd148338ff55d44737f2eebb0bb8838c 100644
--- a/Marlin/src/feature/dac/dac_dac084s085.cpp
+++ b/Marlin/src/feature/dac/dac_dac084s085.cpp
@@ -12,6 +12,7 @@
 
 #include "../../Marlin.h"
 #include "../../module/stepper.h"
+#include "../../HAL/Delay.h"
 
 dac084s085::dac084s085() { }
 
@@ -27,11 +28,11 @@ void dac084s085::begin() {
   spiBegin();
 
   //init onboard DAC
-  delayMicroseconds(2U);
+  DELAY_US(2);
   WRITE(DAC0_SYNC, LOW);
-  delayMicroseconds(2U);
+  DELAY_US(2);
   WRITE(DAC0_SYNC, HIGH);
-  delayMicroseconds(2U);
+  DELAY_US(2);
   WRITE(DAC0_SYNC, LOW);
 
   spiSend(SPI_CHAN_DAC, externalDac_buf, COUNT(externalDac_buf));
@@ -39,11 +40,11 @@ void dac084s085::begin() {
 
   #if EXTRUDERS > 1
     //init Piggy DAC
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC1_SYNC, LOW);
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC1_SYNC, HIGH);
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC1_SYNC, LOW);
 
     spiSend(SPI_CHAN_DAC, externalDac_buf, COUNT(externalDac_buf));
@@ -66,20 +67,20 @@ void dac084s085::setValue(const uint8_t channel, const uint8_t value) {
 
   if (channel > 3) {        // DAC Piggy E1,E2,E3
     WRITE(DAC1_SYNC, LOW);
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC1_SYNC, HIGH);
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC1_SYNC, LOW);
   }
   else {                    // DAC onboard X,Y,Z,E0
     WRITE(DAC0_SYNC, LOW);
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC0_SYNC, HIGH);
-    delayMicroseconds(2U);
+    DELAY_US(2);
     WRITE(DAC0_SYNC, LOW);
   }
 
-  delayMicroseconds(2U);
+  DELAY_US(2);
   spiSend(SPI_CHAN_DAC, externalDac_buf, COUNT(externalDac_buf));
 }
 
diff --git a/Marlin/src/inc/Conditionals_LCD.h b/Marlin/src/inc/Conditionals_LCD.h
index 67bb5a738fe5070f7690c0241958cc73441e30d7..a9e4fd7333ffb39424a0315358964150a51cd363 100644
--- a/Marlin/src/inc/Conditionals_LCD.h
+++ b/Marlin/src/inc/Conditionals_LCD.h
@@ -120,13 +120,13 @@
 
   #define REPRAP_DISCOUNT_FULL_GRAPHIC_SMART_CONTROLLER
   #ifndef ST7920_DELAY_1
-    #define ST7920_DELAY_1 DELAY_2_NOP
+    #define ST7920_DELAY_1 DELAY_NS(125)
   #endif
   #ifndef ST7920_DELAY_2
-    #define ST7920_DELAY_2 DELAY_2_NOP
+    #define ST7920_DELAY_2 DELAY_NS(125)
   #endif
   #ifndef ST7920_DELAY_3
-    #define ST7920_DELAY_3 DELAY_2_NOP
+    #define ST7920_DELAY_3 DELAY_NS(125)
   #endif
 
 #elif ENABLED(MKS_12864OLED)
diff --git a/Marlin/src/lcd/dogm/ultralcd_st7920_u8glib_rrd_AVR.cpp b/Marlin/src/lcd/dogm/ultralcd_st7920_u8glib_rrd_AVR.cpp
index 91692eb622a9dbd92dc0a95bebc703874bc34dbb..cae764e68c3fff6b70f3445005624dba51443b04 100644
--- a/Marlin/src/lcd/dogm/ultralcd_st7920_u8glib_rrd_AVR.cpp
+++ b/Marlin/src/lcd/dogm/ultralcd_st7920_u8glib_rrd_AVR.cpp
@@ -24,6 +24,7 @@
 // file u8g_dev_st7920_128x64_HAL.cpp for the HAL version.
 
 #include "../../inc/MarlinConfig.h"
+#include "../../HAL/Delay.h"
 
 #if ENABLED(U8GLIB_ST7920)
 
@@ -46,30 +47,30 @@
 #pragma GCC optimize (3)
 
 // If you want you can define your own set of delays in Configuration.h
-//#define ST7920_DELAY_1 DELAY_0_NOP
-//#define ST7920_DELAY_2 DELAY_0_NOP
-//#define ST7920_DELAY_3 DELAY_0_NOP
+//#define ST7920_DELAY_1 DELAY_NS(0)
+//#define ST7920_DELAY_2 DELAY_NS(0)
+//#define ST7920_DELAY_3 DELAY_NS(0)
 
 #if F_CPU >= 20000000
-  #define CPU_ST7920_DELAY_1 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_2 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_3 DELAY_1_NOP
+  #define CPU_ST7920_DELAY_1 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_2 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_3 DELAY_NS(50)
 #elif MB(3DRAG) || MB(K8200) || MB(K8400) || MB(SILVER_GATE)
-  #define CPU_ST7920_DELAY_1 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_2 DELAY_3_NOP
-  #define CPU_ST7920_DELAY_3 DELAY_0_NOP
+  #define CPU_ST7920_DELAY_1 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_2 DELAY_NS(188)
+  #define CPU_ST7920_DELAY_3 DELAY_NS(0)
 #elif MB(MINIRAMBO)
-  #define CPU_ST7920_DELAY_1 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_2 DELAY_4_NOP
-  #define CPU_ST7920_DELAY_3 DELAY_0_NOP
+  #define CPU_ST7920_DELAY_1 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_2 DELAY_NS(250)
+  #define CPU_ST7920_DELAY_3 DELAY_NS(0)
 #elif MB(RAMBO)
-  #define CPU_ST7920_DELAY_1 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_2 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_3 DELAY_0_NOP
+  #define CPU_ST7920_DELAY_1 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_2 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_3 DELAY_NS(0)
 #elif F_CPU == 16000000
-  #define CPU_ST7920_DELAY_1 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_2 DELAY_0_NOP
-  #define CPU_ST7920_DELAY_3 DELAY_1_NOP
+  #define CPU_ST7920_DELAY_1 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_2 DELAY_NS(0)
+  #define CPU_ST7920_DELAY_3 DELAY_NS(63)
 #else
   #error "No valid condition for delays in 'ultralcd_st7920_u8glib_rrd.h'"
 #endif
@@ -101,8 +102,8 @@ static void ST7920_SWSPI_SND_8BIT(uint8_t val) {
   ST7920_SND_BIT; // 8
 }
 
-#if defined(DOGM_SPI_DELAY_US) && DOGM_SPI_DELAY_US > 0
-  #define U8G_DELAY() delayMicroseconds(DOGM_SPI_DELAY_US)
+#if DOGM_SPI_DELAY_US > 0
+  #define U8G_DELAY() DELAY_US(DOGM_SPI_DELAY_US)
 #else
   #define U8G_DELAY() u8g_10MicroDelay()
 #endif
diff --git a/Marlin/src/module/stepper.cpp b/Marlin/src/module/stepper.cpp
index 1c1234c6422de5030a64036981712f199be91c22..d62081d8548ed280e4a43aa52182cd107b4b4df0 100644
--- a/Marlin/src/module/stepper.cpp
+++ b/Marlin/src/module/stepper.cpp
@@ -41,15 +41,18 @@
  * along with Grbl.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-/* The timer calculations of this module informed by the 'RepRap cartesian firmware' by Zack Smith
-   and Philipp Tiefenbacher. */
+/**
+ * Timer calculations informed by the 'RepRap cartesian firmware' by Zack Smith
+ * and Philipp Tiefenbacher.
+ */
 
-/* Jerk controlled movements planner added by Eduardo José Tagle in April
-   2018, Equations based on Synthethos TinyG2 sources, but the fixed-point
-   implementation is a complete new one, as we are running the ISR with a
-   variable period.
-   Also implemented the Bézier velocity curve evaluation in ARM assembler,
-   to avoid impacting ISR speed. */
+/**
+ * Jerk controlled movements planner added Apr 2018 by Eduardo José Tagle.
+ * Equations based on Synthethos TinyG2 sources, but the fixed-point
+ * implementation is new, as we are running the ISR with a variable period.
+ * Also implemented the Bézier velocity curve evaluation in ARM assembler,
+ * to avoid impacting ISR speed.
+ */
 
 #include "stepper.h"
 
@@ -67,6 +70,7 @@
 #include "../gcode/queue.h"
 #include "../sd/cardreader.h"
 #include "../Marlin.h"
+#include "../HAL/Delay.h"
 
 #if MB(ALLIGATOR)
   #include "../feature/dac/dac_dac084s085.h"
@@ -1471,7 +1475,7 @@ void Stepper::isr() {
       while (EXTRA_CYCLES_XYZE > (uint32_t)(HAL_timer_get_count(PULSE_TIMER_NUM) - pulse_start) * (PULSE_TIMER_PRESCALE)) { /* nada */ }
       pulse_start = HAL_timer_get_count(PULSE_TIMER_NUM);
     #elif EXTRA_CYCLES_XYZE > 0
-      DELAY_NOPS(EXTRA_CYCLES_XYZE);
+      DELAY_NS(EXTRA_CYCLES_XYZE * NANOSECONDS_PER_CYCLE);
     #endif
 
     #if HAS_X_STEP
@@ -1506,7 +1510,7 @@ void Stepper::isr() {
     #if EXTRA_CYCLES_XYZE > 20
       if (i) while (EXTRA_CYCLES_XYZE > (uint32_t)(HAL_timer_get_count(PULSE_TIMER_NUM) - pulse_start) * (PULSE_TIMER_PRESCALE)) { /* nada */ }
     #elif EXTRA_CYCLES_XYZE > 0
-      if (i) DELAY_NOPS(EXTRA_CYCLES_XYZE);
+      if (i) DELAY_NS(EXTRA_CYCLES_XYZE * NANOSECONDS_PER_CYCLE);
     #endif
 
   } // steps_loop
@@ -1739,7 +1743,7 @@ void Stepper::isr() {
         while (EXTRA_CYCLES_E > (hal_timer_t)(HAL_timer_get_count(PULSE_TIMER_NUM) - pulse_start) * (PULSE_TIMER_PRESCALE)) { /* nada */ }
         pulse_start = HAL_timer_get_count(PULSE_TIMER_NUM);
       #elif EXTRA_CYCLES_E > 0
-        DELAY_NOPS(EXTRA_CYCLES_E);
+        DELAY_NS(EXTRA_CYCLES_E * NANOSECONDS_PER_CYCLE);
       #endif
 
       switch (LA_active_extruder) {
@@ -1762,7 +1766,7 @@ void Stepper::isr() {
       #if EXTRA_CYCLES_E > 20
         if (e_steps) while (EXTRA_CYCLES_E > (hal_timer_t)(HAL_timer_get_count(PULSE_TIMER_NUM) - pulse_start) * (PULSE_TIMER_PRESCALE)) { /* nada */ }
       #elif EXTRA_CYCLES_E > 0
-        if (e_steps) DELAY_NOPS(EXTRA_CYCLES_E);
+        if (e_steps) DELAY_NS(EXTRA_CYCLES_E * NANOSECONDS_PER_CYCLE);
       #endif
 
     } // e_steps
@@ -2146,13 +2150,13 @@ void Stepper::report_positions() {
   #else
     #define _SAVE_START NOOP
     #if EXTRA_CYCLES_BABYSTEP > 0
-      #define _PULSE_WAIT DELAY_NOPS(EXTRA_CYCLES_BABYSTEP)
+      #define _PULSE_WAIT DELAY_NS(EXTRA_CYCLES_BABYSTEP * NANOSECONDS_PER_CYCLE)
     #elif STEP_PULSE_CYCLES > 0
       #define _PULSE_WAIT NOOP
     #elif ENABLED(DELTA)
-      #define _PULSE_WAIT delayMicroseconds(2);
+      #define _PULSE_WAIT DELAY_US(2);
     #else
-      #define _PULSE_WAIT delayMicroseconds(4);
+      #define _PULSE_WAIT DELAY_US(4);
     #endif
   #endif
 
diff --git a/Marlin/src/module/temperature.cpp b/Marlin/src/module/temperature.cpp
index 0060050243e0c76d907e30df7a11ae8b7ddddb25..46f3dc58c35dd170077e31dc232946545385f1e4 100644
--- a/Marlin/src/module/temperature.cpp
+++ b/Marlin/src/module/temperature.cpp
@@ -30,6 +30,7 @@
 #include "../lcd/ultralcd.h"
 #include "planner.h"
 #include "../core/language.h"
+#include "../HAL/Delay.h"
 
 #if ENABLED(HEATER_0_USES_MAX6675)
   #include "../libs/private_spi.h"
@@ -681,14 +682,14 @@ float Temperature::get_pid_output(const int8_t e) {
         #if ENABLED(PID_EXTRUSION_SCALING)
           cTerm[HOTEND_INDEX] = 0;
           if (_HOTEND_TEST) {
-            long e_position = stepper.position(E_AXIS);
+            const long e_position = stepper.position(E_AXIS);
             if (e_position > last_e_position) {
               lpq[lpq_ptr] = e_position - last_e_position;
               last_e_position = e_position;
             }
-            else {
+            else
               lpq[lpq_ptr] = 0;
-            }
+
             if (++lpq_ptr >= lpq_len) lpq_ptr = 0;
             cTerm[HOTEND_INDEX] = (lpq[lpq_ptr] * planner.steps_to_mm[E_AXIS]) * PID_PARAM(Kc, HOTEND_INDEX);
             pid_output += cTerm[HOTEND_INDEX];
@@ -1629,7 +1630,7 @@ void Temperature::disable_all_heaters() {
 
     WRITE(MAX6675_SS, 0); // enable TT_MAX6675
 
-    DELAY_100NS;          // Ensure 100ns delay
+    DELAY_NS(100);       // Ensure 100ns delay
 
     // Read a big-endian temperature value
     max6675_temp = 0;
diff --git a/Marlin/src/pins/pins_ANET_10.h b/Marlin/src/pins/pins_ANET_10.h
index eee100b120d780667a9124a0842639da5033b47c..18a52c4e8c92f9bb63f3a135b3508b9678be08dc 100644
--- a/Marlin/src/pins/pins_ANET_10.h
+++ b/Marlin/src/pins/pins_ANET_10.h
@@ -177,13 +177,13 @@
     #define BTN_EN2          10
     #define BTN_ENC          16
     #ifndef ST7920_DELAY_1
-      #define ST7920_DELAY_1 DELAY_0_NOP
+      #define ST7920_DELAY_1 DELAY_NS(0)
     #endif
     #ifndef ST7920_DELAY_2
-      #define ST7920_DELAY_2 DELAY_1_NOP
+      #define ST7920_DELAY_2 DELAY_NS(63)
     #endif
     #ifndef ST7920_DELAY_3
-      #define ST7920_DELAY_3 DELAY_2_NOP
+      #define ST7920_DELAY_3 DELAY_NS(125)
     #endif
     #define STD_ENCODER_PULSES_PER_STEP 4
     #define STD_ENCODER_STEPS_PER_MENU_ITEM 1
diff --git a/Marlin/src/pins/pins_MELZI_CREALITY.h b/Marlin/src/pins/pins_MELZI_CREALITY.h
index b42075b76d8bc6548f2bee6128162328dfef81f3..8bfd3b860a22ece5f53672a1fded2fd628c117a1 100644
--- a/Marlin/src/pins/pins_MELZI_CREALITY.h
+++ b/Marlin/src/pins/pins_MELZI_CREALITY.h
@@ -55,13 +55,13 @@
 
 // Alter timing for graphical display
 #ifndef ST7920_DELAY_1
-  #define ST7920_DELAY_1 DELAY_2_NOP
+  #define ST7920_DELAY_1 DELAY_NS(125)
 #endif
 #ifndef ST7920_DELAY_2
-  #define ST7920_DELAY_2 DELAY_2_NOP
+  #define ST7920_DELAY_2 DELAY_NS(125)
 #endif
 #ifndef ST7920_DELAY_3
-  #define ST7920_DELAY_3 DELAY_2_NOP
+  #define ST7920_DELAY_3 DELAY_NS(125)
 #endif
 
 #if ENABLED(MINIPANEL)
diff --git a/Marlin/src/pins/pins_MELZI_MALYAN.h b/Marlin/src/pins/pins_MELZI_MALYAN.h
index 3888b537d11e7935cf8fe215bb7bfe2364cb3d07..a81526f065d17e1768aa1c85c017970c23a7635d 100644
--- a/Marlin/src/pins/pins_MELZI_MALYAN.h
+++ b/Marlin/src/pins/pins_MELZI_MALYAN.h
@@ -44,11 +44,11 @@
 
 // Alter timing for graphical display
 #ifndef ST7920_DELAY_1
-  #define ST7920_DELAY_1 DELAY_2_NOP
+  #define ST7920_DELAY_1 DELAY_NS(125)
 #endif
 #ifndef ST7920_DELAY_2
-  #define ST7920_DELAY_2 DELAY_2_NOP
+  #define ST7920_DELAY_2 DELAY_NS(125)
 #endif
 #ifndef ST7920_DELAY_3
-  #define ST7920_DELAY_3 DELAY_2_NOP
+  #define ST7920_DELAY_3 DELAY_NS(125)
 #endif
diff --git a/Marlin/src/pins/pins_MELZI_TRONXY.h b/Marlin/src/pins/pins_MELZI_TRONXY.h
index 7c66c50c6007a76f7a490ae71cc5fbccca8cb41c..0da7934302c3daf6f66273442a0f775af4547482 100644
--- a/Marlin/src/pins/pins_MELZI_TRONXY.h
+++ b/Marlin/src/pins/pins_MELZI_TRONXY.h
@@ -51,11 +51,11 @@
 #define BTN_ENC         26
 
 #ifndef ST7920_DELAY_1
-  #define ST7920_DELAY_1 DELAY_0_NOP
+  #define ST7920_DELAY_1 DELAY_NS(0)
 #endif
 #ifndef ST7920_DELAY_2
-  #define ST7920_DELAY_2 DELAY_2_NOP
+  #define ST7920_DELAY_2 DELAY_NS(125)
 #endif
 #ifndef ST7920_DELAY_3
-  #define ST7920_DELAY_3 DELAY_0_NOP
+  #define ST7920_DELAY_3 DELAY_NS(0)
 #endif
diff --git a/Marlin/src/pins/pins_PRINTRBOARD_REVF.h b/Marlin/src/pins/pins_PRINTRBOARD_REVF.h
index bf3a023c4de269d3859843a1a9e2c180381e85e3..359a8b85db3438697287f16f446c7eb067a64685 100644
--- a/Marlin/src/pins/pins_PRINTRBOARD_REVF.h
+++ b/Marlin/src/pins/pins_PRINTRBOARD_REVF.h
@@ -244,13 +244,13 @@
 
     // increase delays
     #ifndef ST7920_DELAY_1
-      #define ST7920_DELAY_1 DELAY_5_NOP
+      #define ST7920_DELAY_1 DELAY_NS(313)
     #endif
     #ifndef ST7920_DELAY_2
-      #define ST7920_DELAY_2 DELAY_5_NOP
+      #define ST7920_DELAY_2 DELAY_NS(313)
     #endif
     #ifndef ST7920_DELAY_3
-      #define ST7920_DELAY_3 DELAY_5_NOP
+      #define ST7920_DELAY_3 DELAY_NS(313)
     #endif
 
   #else
diff --git a/Marlin/src/pins/pins_SANGUINOLOLU_11.h b/Marlin/src/pins/pins_SANGUINOLOLU_11.h
index c27eab1c8afe009d1902e5e57fa924e04609618c..ed4a521e7ebb647400bc355a1e97a94470b2375e 100644
--- a/Marlin/src/pins/pins_SANGUINOLOLU_11.h
+++ b/Marlin/src/pins/pins_SANGUINOLOLU_11.h
@@ -239,13 +239,13 @@
     #define BTN_EN2             30
 
     #ifndef ST7920_DELAY_1
-      #define ST7920_DELAY_1 DELAY_0_NOP
+      #define ST7920_DELAY_1 DELAY_NS(0)
     #endif
     #ifndef ST7920_DELAY_2
-      #define ST7920_DELAY_2 DELAY_3_NOP
+      #define ST7920_DELAY_2 DELAY_NS(188)
     #endif
     #ifndef ST7920_DELAY_3
-      #define ST7920_DELAY_3 DELAY_0_NOP
+      #define ST7920_DELAY_3 DELAY_NS(0)
     #endif
 
   #elif ENABLED(ZONESTAR_LCD) // For the Tronxy Melzi boards
diff --git a/Marlin/src/pins/pins_STM32F4.h b/Marlin/src/pins/pins_STM32F4.h
index fe5b6ca63114b47e39b785edb7410c70f6def9b0..addc6a3eac46da97abab439cc0689a5dd425312c 100644
--- a/Marlin/src/pins/pins_STM32F4.h
+++ b/Marlin/src/pins/pins_STM32F4.h
@@ -176,19 +176,14 @@
 //
 // ST7920 Delays
 //
-
-#define STM_NOP __asm__("nop\n\t")
-#define STM_DELAY_SHORT { STM_NOP; STM_NOP; STM_NOP; STM_NOP; }
-#define STM_DELAY_LONG  { STM_DELAY_SHORT; STM_DELAY_SHORT; STM_NOP; STM_NOP; }
-
 #ifndef ST7920_DELAY_1
-  #define ST7920_DELAY_1 { STM_DELAY_SHORT; STM_DELAY_SHORT; }
+  #define ST7920_DELAY_1 DELAY_NS(96)
 #endif
 
 #ifndef ST7920_DELAY_2
-  #define ST7920_DELAY_2 { STM_DELAY_SHORT; }
+  #define ST7920_DELAY_2 DELAY_NS(48)
 #endif
 
 #ifndef ST7920_DELAY_3
-  #define ST7920_DELAY_3 { STM_DELAY_LONG; STM_DELAY_LONG; STM_DELAY_LONG; STM_DELAY_LONG; STM_DELAY_LONG; STM_DELAY_LONG; }
+  #define ST7920_DELAY_3 DELAY_NS(715)
 #endif