diff --git a/arch/arm/cpu/armv7/sunxi/dram.c b/arch/arm/cpu/armv7/sunxi/dram.c
index 30483913b3544d3926c8779101bf3314d817e958..be6750d880cb905b9d7f7c0893a4743f2e0d63ad 100644
--- a/arch/arm/cpu/armv7/sunxi/dram.c
+++ b/arch/arm/cpu/armv7/sunxi/dram.c
@@ -136,6 +136,14 @@ static void mctl_itm_enable(void)
 	clrbits_le32(&dram->ccr, DRAM_CCR_ITM_OFF);
 }
 
+static void mctl_itm_reset(void)
+{
+	mctl_itm_disable();
+	udelay(1); /* ITM reset needs a bit of delay */
+	mctl_itm_enable();
+	udelay(1);
+}
+
 static void mctl_enable_dll0(u32 phase)
 {
 	struct sunxi_dram_reg *dram = (struct sunxi_dram_reg *)SUNXI_DRAMC_BASE;
@@ -356,6 +364,37 @@ static void mctl_setup_dram_clock(u32 clk, u32 mbus_clk)
 	udelay(22);
 }
 
+/*
+ * The data from rslrX and rdgrX registers (X=rank) is stored
+ * in a single 32-bit value using the following format:
+ *   bits [31:26] - DQS gating system latency for byte lane 3
+ *   bits [25:24] - DQS gating phase select for byte lane 3
+ *   bits [23:18] - DQS gating system latency for byte lane 2
+ *   bits [17:16] - DQS gating phase select for byte lane 2
+ *   bits [15:10] - DQS gating system latency for byte lane 1
+ *   bits [ 9:8 ] - DQS gating phase select for byte lane 1
+ *   bits [ 7:2 ] - DQS gating system latency for byte lane 0
+ *   bits [ 1:0 ] - DQS gating phase select for byte lane 0
+ */
+static void mctl_set_dqs_gating_delay(int rank, u32 dqs_gating_delay)
+{
+	struct sunxi_dram_reg *dram = (struct sunxi_dram_reg *)SUNXI_DRAMC_BASE;
+	u32 lane, number_of_lanes = mctl_get_number_of_lanes();
+	/* rank0 gating system latency (3 bits per lane: cycles) */
+	u32 slr = readl(rank == 0 ? &dram->rslr0 : &dram->rslr1);
+	/* rank0 gating phase select (2 bits per lane: 90, 180, 270, 360) */
+	u32 dgr = readl(rank == 0 ? &dram->rdgr0 : &dram->rdgr1);
+	for (lane = 0; lane < number_of_lanes; lane++) {
+		u32 tmp = dqs_gating_delay >> (lane * 8);
+		slr &= ~(7 << (lane * 3));
+		slr |= ((tmp >> 2) & 7) << (lane * 3);
+		dgr &= ~(3 << (lane * 2));
+		dgr |= (tmp & 3) << (lane * 2);
+	}
+	writel(slr, rank == 0 ? &dram->rslr0 : &dram->rslr1);
+	writel(dgr, rank == 0 ? &dram->rdgr0 : &dram->rdgr1);
+}
+
 static int dramc_scan_readpipe(void)
 {
 	struct sunxi_dram_reg *dram = (struct sunxi_dram_reg *)SUNXI_DRAMC_BASE;
@@ -618,7 +657,7 @@ unsigned long dramc_init(struct dram_para *para)
 	writel(para->emr2, &dram->emr2);
 	writel(para->emr3, &dram->emr3);
 
-	/* set DQS window mode */
+	/* disable drift compensation and set passive DQS window mode */
 	clrsetbits_le32(&dram->ccr, DRAM_CCR_DQS_DRIFT_COMP, DRAM_CCR_DQS_GATE);
 
 #ifdef CONFIG_SUN7I
@@ -631,11 +670,25 @@ unsigned long dramc_init(struct dram_para *para)
 
 	/* scan read pipe value */
 	mctl_itm_enable();
+
+	/* Hardware DQS gate training */
 	ret_val = dramc_scan_readpipe();
 
 	if (ret_val < 0)
 		return 0;
 
+	/* allow to override the DQS training results with a custom delay */
+	if (para->dqs_gating_delay)
+		mctl_set_dqs_gating_delay(0, para->dqs_gating_delay);
+
+	/* set the DQS gating window type */
+	if (para->active_windowing)
+		clrbits_le32(&dram->ccr, DRAM_CCR_DQS_GATE);
+	else
+		setbits_le32(&dram->ccr, DRAM_CCR_DQS_GATE);
+
+	mctl_itm_reset();
+
 	/* configure all host port */
 	mctl_configure_hostport();
 
diff --git a/arch/arm/include/asm/arch-sunxi/dram.h b/arch/arm/include/asm/arch-sunxi/dram.h
index 71301dbf33f085ab9bda0c853887da485386f945..1945f75441f4dce6cb0609527edf028e4512cd15 100644
--- a/arch/arm/include/asm/arch-sunxi/dram.h
+++ b/arch/arm/include/asm/arch-sunxi/dram.h
@@ -88,6 +88,8 @@ struct dram_para {
 	u32 emr1;
 	u32 emr2;
 	u32 emr3;
+	u32 dqs_gating_delay;
+	u32 active_windowing;
 };
 
 #define DRAM_CCR_COMMAND_RATE_1T (0x1 << 5)