diff --git a/arch/arm/cpu/armv8/cache.S b/arch/arm/cpu/armv8/cache.S
index d8462365006288d6820415ba66003f2f26fddf3b..ab8c08917ad7c5ba1b9e48799b629c3f9f0f5286 100644
--- a/arch/arm/cpu/armv8/cache.S
+++ b/arch/arm/cpu/armv8/cache.S
@@ -112,7 +112,7 @@ ENDPROC(__asm_flush_dcache_all)
 
 ENTRY(__asm_invalidate_dcache_all)
 	mov	x16, lr
-	mov	x0, #0xffff
+	mov	x0, #0x1
 	bl	__asm_dcache_all
 	mov	lr, x16
 	ret
diff --git a/arch/arm/cpu/armv8/cache_v8.c b/arch/arm/cpu/armv8/cache_v8.c
index 6bde1cf6a00e01f97f258fd38733de8db179152a..b1ea8227cb68841a3a87680d27bf3f57206e2da4 100644
--- a/arch/arm/cpu/armv8/cache_v8.c
+++ b/arch/arm/cpu/armv8/cache_v8.c
@@ -59,15 +59,15 @@ static void mmu_setup(void)
 	el = current_el();
 	if (el == 1) {
 		set_ttbr_tcr_mair(el, gd->arch.tlb_addr,
-				  TCR_FLAGS | TCR_EL1_IPS_BITS,
+				  TCR_EL1_RSVD | TCR_FLAGS | TCR_EL1_IPS_BITS,
 				  MEMORY_ATTRIBUTES);
 	} else if (el == 2) {
 		set_ttbr_tcr_mair(el, gd->arch.tlb_addr,
-				  TCR_FLAGS | TCR_EL2_IPS_BITS,
+				  TCR_EL2_RSVD | TCR_FLAGS | TCR_EL2_IPS_BITS,
 				  MEMORY_ATTRIBUTES);
 	} else {
 		set_ttbr_tcr_mair(el, gd->arch.tlb_addr,
-				  TCR_FLAGS | TCR_EL3_IPS_BITS,
+				  TCR_EL3_RSVD | TCR_FLAGS | TCR_EL3_IPS_BITS,
 				  MEMORY_ATTRIBUTES);
 	}
 	/* enable the mmu */
diff --git a/arch/arm/include/asm/armv8/mmu.h b/arch/arm/include/asm/armv8/mmu.h
index 0c928d40e7ebe6e35a2a51f3745dd923fbe3e4b4..587ee39909917d272d788d1cc81b7f68cae96faa 100644
--- a/arch/arm/include/asm/armv8/mmu.h
+++ b/arch/arm/include/asm/armv8/mmu.h
@@ -103,13 +103,17 @@
 #define TCR_EL2_IPS_BITS	(3 << 16)	/* 42 bits physical address */
 #define TCR_EL3_IPS_BITS	(3 << 16)	/* 42 bits physical address */
 
-/* PTWs cacheable, inner/outer WBWA and non-shareable */
+/* PTWs cacheable, inner/outer WBWA and inner shareable */
 #define TCR_FLAGS		(TCR_TG0_64K |		\
-				TCR_SHARED_NON |	\
+				TCR_SHARED_INNER |	\
 				TCR_ORGN_WBWA |		\
 				TCR_IRGN_WBWA |		\
 				TCR_T0SZ(VA_BITS))
 
+#define TCR_EL1_RSVD		(1U << 31)	/* 1U: 1 << 31 overflows int */
+#define TCR_EL2_RSVD		(1U << 31 | 1U << 23)
+#define TCR_EL3_RSVD		(1U << 31 | 1U << 23)
+
 #ifndef __ASSEMBLY__
 
 void set_pgtable_section(u64 *page_table, u64 index,
diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S
index afd4f102dc87d4c0bcc5d9f2198f121d66f9f62f..4c3a94af572c58b4938885c0258b8ddd9ce13053 100644
--- a/arch/arm/lib/crt0.S
+++ b/arch/arm/lib/crt0.S
@@ -25,7 +25,8 @@
  *    the GD ('global data') structure, both located in some readily
  *    available RAM (SRAM, locked cache...). In this context, VARIABLE
  *    global data, initialized or not (BSS), are UNAVAILABLE; only
- *    CONSTANT initialized data are available.
+ *    CONSTANT initialized data are available. GD should be zeroed
+ *    before board_init_f() is called.
  *
  * 2. Call board_init_f(). This function prepares the hardware for
  *    execution from system RAM (DRAM, DDR...) As system RAM may not
@@ -34,24 +35,29 @@
  *    data include the relocation destination, the future stack, and
  *    the future GD location.
  *
- * (the following applies only to non-SPL builds)
- *
  * 3. Set up intermediate environment where the stack and GD are the
  *    ones allocated by board_init_f() in system RAM, but BSS and
  *    initialized non-const data are still not available.
  *
- * 4. Call relocate_code(). This function relocates U-Boot from its
- *    current location into the relocation destination computed by
- *    board_init_f().
+ * 4a.For U-Boot proper (not SPL), call relocate_code(). This function
+ *    relocates U-Boot from its current location into the relocation
+ *    destination computed by board_init_f().
+ *
+ * 4b.For SPL, board_init_f() just returns (to crt0). There is no
+ *    code relocation in SPL.
  *
  * 5. Set up final environment for calling board_init_r(). This
  *    environment has BSS (initialized to 0), initialized non-const
  *    data (initialized to their intended value), and stack in system
- *    RAM. GD has retained values set by board_init_f(). Some CPUs
- *    have some work left to do at this point regarding memory, so
- *    call c_runtime_cpu_setup.
+ *    RAM (for SPL moving the stack and GD into RAM is optional - see
+ *    CONFIG_SPL_STACK_R). GD has retained values set by board_init_f().
+ *
+ * 6. For U-Boot proper (not SPL), some CPUs have some work left to do
+ *    at this point regarding memory, so call c_runtime_cpu_setup.
+ *
+ * 7. Branch to board_init_r().
  *
- * 6. Branch to board_init_r().
+ * For more information see 'Board Initialisation Flow' in README.
  */
 
 /*
diff --git a/arch/arm/lib/crt0_64.S b/arch/arm/lib/crt0_64.S
index 98a906ee111c6110bf0b5aa2b839bc2143ddd892..8b34e04dadae398179d756b108d4838880b193e3 100644
--- a/arch/arm/lib/crt0_64.S
+++ b/arch/arm/lib/crt0_64.S
@@ -27,7 +27,8 @@
  *    the GD ('global data') structure, both located in some readily
  *    available RAM (SRAM, locked cache...). In this context, VARIABLE
  *    global data, initialized or not (BSS), are UNAVAILABLE; only
- *    CONSTANT initialized data are available.
+ *    CONSTANT initialized data are available. GD should be zeroed
+ *    before board_init_f() is called.
  *
  * 2. Call board_init_f(). This function prepares the hardware for
  *    execution from system RAM (DRAM, DDR...) As system RAM may not
@@ -36,24 +37,31 @@
  *    data include the relocation destination, the future stack, and
  *    the future GD location.
  *
- * (the following applies only to non-SPL builds)
- *
  * 3. Set up intermediate environment where the stack and GD are the
  *    ones allocated by board_init_f() in system RAM, but BSS and
  *    initialized non-const data are still not available.
  *
- * 4. Call relocate_code(). This function relocates U-Boot from its
- *    current location into the relocation destination computed by
- *    board_init_f().
+ * 4a.For U-Boot proper (not SPL), call relocate_code(). This function
+ *    relocates U-Boot from its current location into the relocation
+ *    destination computed by board_init_f().
+ *
+ * 4b.For SPL, board_init_f() just returns (to crt0). There is no
+ *    code relocation in SPL.
  *
  * 5. Set up final environment for calling board_init_r(). This
  *    environment has BSS (initialized to 0), initialized non-const
  *    data (initialized to their intended value), and stack in system
- *    RAM. GD has retained values set by board_init_f(). Some CPUs
- *    have some work left to do at this point regarding memory, so
- *    call c_runtime_cpu_setup.
+ *    RAM (for SPL moving the stack and GD into RAM is optional - see
+ *    CONFIG_SPL_STACK_R). GD has retained values set by board_init_f().
+ *
+ * TODO: For SPL, implement stack relocation on AArch64.
  *
- * 6. Branch to board_init_r().
+ * 6. For U-Boot proper (not SPL), some CPUs have some work left to do
+ *    at this point regarding memory, so call c_runtime_cpu_setup.
+ *
+ * 7. Branch to board_init_r().
+ *
+ * For more information see 'Board Initialisation Flow' in README.
  */
 
 ENTRY(_main)
@@ -106,6 +114,8 @@ relocation_return:
  */
 	bl	c_runtime_cpu_setup		/* still call old routine */
 
+/* TODO: For SPL, call spl_relocate_stack_gd() to allocate the relocated stack */
+
 /*
  * Clear BSS section
  */
diff --git a/arch/arm/lib/gic_64.S b/arch/arm/lib/gic_64.S
index a3e18f7713e571ead9d9f81c6242e486e1c092da..62d0022408bce01ebe02f3a4a91feda2145e4688 100644
--- a/arch/arm/lib/gic_64.S
+++ b/arch/arm/lib/gic_64.S
@@ -46,11 +46,19 @@ ENTRY(gic_init_secure)
 	ldr	w9, [x0, GICD_TYPER]
 	and	w10, w9, #0x1f		/* ITLinesNumber */
 	cbz	w10, 1f			/* No SPIs */
-	add	x11, x0, (GICD_IGROUPRn + 4)
+	add	x11, x0, GICD_IGROUPRn
 	mov	w9, #~0			/* Config SPIs as Grp1 */
+	str	w9, [x11], #0x4
 0:	str	w9, [x11], #0x4
 	sub	w10, w10, #0x1
 	cbnz	w10, 0b
+
+	ldr	x1, =GICC_BASE		/* GICC_CTLR */
+	mov	w0, #3			/* EnableGrp0 | EnableGrp1 */
+	str	w0, [x1]
+
+	mov	w0, #1 << 7		/* allow NS access to GICC_PMR */
+	str	w0, [x1, #4]		/* GICC_PMR */
 #endif
 1:
 	ret