diff --git a/drivers/mmc/bcm2835_sdhost.c b/drivers/mmc/bcm2835_sdhost.c
index 96428333b0abb2d28f3d845998fa2d44a70ce9ef..1ce019af579fb5b1e06bb04ac816733678bb51cf 100644
--- a/drivers/mmc/bcm2835_sdhost.c
+++ b/drivers/mmc/bcm2835_sdhost.c
@@ -163,7 +163,6 @@ struct bcm2835_host {
 	int			clock;		/* Current clock speed */
 	unsigned int		max_clk;	/* Max possible freq */
 	unsigned int		blocks;		/* remaining PIO blocks */
-	int			irq;		/* Device IRQ */
 
 	u32			ns_per_fifo_word;
 
@@ -173,14 +172,7 @@ struct bcm2835_host {
 
 	struct mmc_cmd	*cmd;		/* Current command */
 	struct mmc_data		*data;		/* Current data request */
-	bool			data_complete:1;/* Data finished before cmd */
 	bool			use_busy:1;	/* Wait for busy interrupt */
-	bool			wait_data_complete:1;	/* Wait for data */
-
-	/* for threaded irq handler */
-	bool			irq_block;
-	bool			irq_busy;
-	bool			irq_data;
 
 	struct udevice		*dev;
 	struct mmc		*mmc;
@@ -240,17 +232,9 @@ static void bcm2835_reset_internal(struct bcm2835_host *host)
 	writel(host->cdiv, host->ioaddr + SDCDIV);
 }
 
-static int bcm2835_finish_command(struct bcm2835_host *host);
-
-static void bcm2835_wait_transfer_complete(struct bcm2835_host *host)
+static int bcm2835_wait_transfer_complete(struct bcm2835_host *host)
 {
-	int timediff;
-	u32 alternate_idle;
-
-	alternate_idle = (host->data->flags & MMC_DATA_READ) ?
-		SDEDM_FSM_READWAIT : SDEDM_FSM_WRITESTART1;
-
-	timediff = 0;
+	int timediff = 0;
 
 	while (1) {
 		u32 edm, fsm;
@@ -261,7 +245,10 @@ static void bcm2835_wait_transfer_complete(struct bcm2835_host *host)
 		if ((fsm == SDEDM_FSM_IDENTMODE) ||
 		    (fsm == SDEDM_FSM_DATAMODE))
 			break;
-		if (fsm == alternate_idle) {
+
+		if ((fsm == SDEDM_FSM_READWAIT) ||
+		    (fsm == SDEDM_FSM_WRITESTART1) ||
+		    (fsm == SDEDM_FSM_READDATA)) {
 			writel(edm | SDEDM_FORCE_DATA_MODE,
 			       host->ioaddr + SDEDM);
 			break;
@@ -273,9 +260,11 @@ static void bcm2835_wait_transfer_complete(struct bcm2835_host *host)
 				"wait_transfer_complete - still waiting after %d retries\n",
 				timediff);
 			bcm2835_dumpregs(host);
-			return;
+			return -ETIMEDOUT;
 		}
 	}
+
+	return 0;
 }
 
 static int bcm2835_transfer_block_pio(struct bcm2835_host *host, bool is_read)
@@ -322,6 +311,9 @@ static int bcm2835_transfer_block_pio(struct bcm2835_host *host, bool is_read)
 			      fsm_state != SDEDM_FSM_READCRC)) ||
 			    (!is_read &&
 			     (fsm_state != SDEDM_FSM_WRITEDATA &&
+			      fsm_state != SDEDM_FSM_WRITEWAIT1 &&
+			      fsm_state != SDEDM_FSM_WRITEWAIT2 &&
+			      fsm_state != SDEDM_FSM_WRITECRC &&
 			      fsm_state != SDEDM_FSM_WRITESTART1 &&
 			      fsm_state != SDEDM_FSM_WRITESTART2))) {
 				hsts = readl(host->ioaddr + SDHSTS);
@@ -358,9 +350,8 @@ static int bcm2835_transfer_pio(struct bcm2835_host *host)
 
 	is_read = (host->data->flags & MMC_DATA_READ) != 0;
 	ret = bcm2835_transfer_block_pio(host, is_read);
-
-	if (host->wait_data_complete)
-		bcm2835_wait_transfer_complete(host);
+	if (ret)
+		return ret;
 
 	sdhsts = readl(host->ioaddr + SDHSTS);
 	if (sdhsts & (SDHSTS_CRC16_ERROR |
@@ -379,21 +370,8 @@ static int bcm2835_transfer_pio(struct bcm2835_host *host)
 	return ret;
 }
 
-static void bcm2835_set_transfer_irqs(struct bcm2835_host *host)
-{
-	u32 all_irqs = SDHCFG_DATA_IRPT_EN | SDHCFG_BLOCK_IRPT_EN |
-		SDHCFG_BUSY_IRPT_EN;
-
-	host->hcfg = (host->hcfg & ~all_irqs) |
-		SDHCFG_DATA_IRPT_EN |
-		SDHCFG_BUSY_IRPT_EN;
-
-	writel(host->hcfg, host->ioaddr + SDHCFG);
-}
-
-static
-void bcm2835_prepare_data(struct bcm2835_host *host, struct mmc_cmd *cmd,
-			  struct mmc_data *data)
+static void bcm2835_prepare_data(struct bcm2835_host *host, struct mmc_cmd *cmd,
+				 struct mmc_data *data)
 {
 	WARN_ON(host->data);
 
@@ -401,14 +379,9 @@ void bcm2835_prepare_data(struct bcm2835_host *host, struct mmc_cmd *cmd,
 	if (!data)
 		return;
 
-	host->wait_data_complete = cmd->cmdidx != MMC_CMD_READ_MULTIPLE_BLOCK;
-	host->data_complete = false;
-
 	/* Use PIO */
 	host->blocks = data->blocks;
 
-	bcm2835_set_transfer_irqs(host);
-
 	writel(data->blocksize, host->ioaddr + SDHBCT);
 	writel(data->blocks, host->ioaddr + SDHBLC);
 }
@@ -483,36 +456,6 @@ static int bcm2835_send_command(struct bcm2835_host *host, struct mmc_cmd *cmd,
 	return 0;
 }
 
-static int bcm2835_transfer_complete(struct bcm2835_host *host)
-{
-	int ret = 0;
-
-	WARN_ON(!host->data_complete);
-
-	host->data = NULL;
-
-	return ret;
-}
-
-static void bcm2835_finish_data(struct bcm2835_host *host)
-{
-	host->hcfg &= ~(SDHCFG_DATA_IRPT_EN | SDHCFG_BLOCK_IRPT_EN);
-	writel(host->hcfg, host->ioaddr + SDHCFG);
-
-	host->data_complete = true;
-
-	if (host->cmd) {
-		/* Data managed to finish before the
-		 * command completed. Make sure we do
-		 * things in the proper order.
-		 */
-		dev_dbg(dev, "Finished early - HSTS %08x\n",
-			readl(host->ioaddr + SDHSTS));
-	} else {
-		bcm2835_transfer_complete(host);
-	}
-}
-
 static int bcm2835_finish_command(struct bcm2835_host *host)
 {
 	struct mmc_cmd *cmd = host->cmd;
@@ -562,8 +505,6 @@ static int bcm2835_finish_command(struct bcm2835_host *host)
 
 	/* Processed actual command. */
 	host->cmd = NULL;
-	if (host->data && host->data_complete)
-		ret = bcm2835_transfer_complete(host);
 
 	return ret;
 }
@@ -608,159 +549,44 @@ static int bcm2835_check_data_error(struct bcm2835_host *host, u32 intmask)
 	return ret;
 }
 
-static void bcm2835_busy_irq(struct bcm2835_host *host)
-{
-	if (WARN_ON(!host->cmd)) {
-		bcm2835_dumpregs(host);
-		return;
-	}
-
-	if (WARN_ON(!host->use_busy)) {
-		bcm2835_dumpregs(host);
-		return;
-	}
-	host->use_busy = false;
-
-	bcm2835_finish_command(host);
-}
-
-static void bcm2835_data_irq(struct bcm2835_host *host, u32 intmask)
+static int bcm2835_transmit(struct bcm2835_host *host)
 {
+	u32 intmask = readl(host->ioaddr + SDHSTS);
 	int ret;
 
-	/*
-	 * There are no dedicated data/space available interrupt
-	 * status bits, so it is necessary to use the single shared
-	 * data/space available FIFO status bits. It is therefore not
-	 * an error to get here when there is no data transfer in
-	 * progress.
-	 */
-	if (!host->data)
-		return;
-
+	/* Check for errors */
 	ret = bcm2835_check_data_error(host, intmask);
 	if (ret)
-		goto finished;
-
-	if (host->data->flags & MMC_DATA_WRITE) {
-		/* Use the block interrupt for writes after the first block */
-		host->hcfg &= ~(SDHCFG_DATA_IRPT_EN);
-		host->hcfg |= SDHCFG_BLOCK_IRPT_EN;
-		writel(host->hcfg, host->ioaddr + SDHCFG);
-		bcm2835_transfer_pio(host);
-	} else {
-		bcm2835_transfer_pio(host);
-		host->blocks--;
-		if ((host->blocks == 0))
-			goto finished;
-	}
-	return;
+		return ret;
 
-finished:
-	host->hcfg &= ~(SDHCFG_DATA_IRPT_EN | SDHCFG_BLOCK_IRPT_EN);
-	writel(host->hcfg, host->ioaddr + SDHCFG);
-}
-
-static void bcm2835_data_threaded_irq(struct bcm2835_host *host)
-{
-	if (!host->data)
-		return;
-	if ((host->blocks == 0))
-		bcm2835_finish_data(host);
-}
-
-static void bcm2835_block_irq(struct bcm2835_host *host)
-{
-	if (WARN_ON(!host->data)) {
-		bcm2835_dumpregs(host);
-		return;
-	}
-
-	WARN_ON(!host->blocks);
-	if ((--host->blocks == 0))
-		bcm2835_finish_data(host);
-	else
-		bcm2835_transfer_pio(host);
-}
+	ret = bcm2835_check_cmd_error(host, intmask);
+	if (ret)
+		return ret;
 
-static irqreturn_t bcm2835_irq(int irq, void *dev_id)
-{
-	irqreturn_t result = IRQ_NONE;
-	struct bcm2835_host *host = dev_id;
-	u32 intmask;
-
-	intmask = readl(host->ioaddr + SDHSTS);
-
-	writel(SDHSTS_BUSY_IRPT |
-	       SDHSTS_BLOCK_IRPT |
-	       SDHSTS_SDIO_IRPT |
-	       SDHSTS_DATA_FLAG,
-	       host->ioaddr + SDHSTS);
-
-	if (intmask & SDHSTS_BLOCK_IRPT) {
-		bcm2835_check_data_error(host, intmask);
-		host->irq_block = true;
-		result = IRQ_WAKE_THREAD;
+	/* Handle wait for busy end */
+	if (host->use_busy && (intmask & SDHSTS_BUSY_IRPT)) {
+		writel(SDHSTS_BUSY_IRPT, host->ioaddr + SDHSTS);
+		host->use_busy = false;
+		bcm2835_finish_command(host);
 	}
 
-	if (intmask & SDHSTS_BUSY_IRPT) {
-		if (!bcm2835_check_cmd_error(host, intmask)) {
-			host->irq_busy = true;
-			result = IRQ_WAKE_THREAD;
-		} else {
-			result = IRQ_HANDLED;
+	/* Handle PIO data transfer */
+	if (host->data) {
+		ret = bcm2835_transfer_pio(host);
+		if (ret)
+			return ret;
+		host->blocks--;
+		if (host->blocks == 0) {
+			/* Wait for command to complete for real */
+			ret = bcm2835_wait_transfer_complete(host);
+			if (ret)
+				return ret;
+			/* Transfer complete */
+			host->data = NULL;
 		}
 	}
 
-	/* There is no true data interrupt status bit, so it is
-	 * necessary to qualify the data flag with the interrupt
-	 * enable bit.
-	 */
-	if ((intmask & SDHSTS_DATA_FLAG) &&
-	    (host->hcfg & SDHCFG_DATA_IRPT_EN)) {
-		bcm2835_data_irq(host, intmask);
-		host->irq_data = true;
-		result = IRQ_WAKE_THREAD;
-	}
-
-	return result;
-}
-
-static irqreturn_t bcm2835_threaded_irq(int irq, void *dev_id)
-{
-	struct bcm2835_host *host = dev_id;
-
-	if (host->irq_block) {
-		host->irq_block = false;
-		bcm2835_block_irq(host);
-	}
-
-	if (host->irq_busy) {
-		host->irq_busy = false;
-		bcm2835_busy_irq(host);
-	}
-
-	if (host->irq_data) {
-		host->irq_data = false;
-		bcm2835_data_threaded_irq(host);
-	}
-
-	return IRQ_HANDLED;
-}
-
-static void bcm2835_irq_poll(struct bcm2835_host *host)
-{
-	u32 intmask;
-
-	while (1) {
-		intmask = readl(host->ioaddr + SDHSTS);
-		if (intmask & (SDHSTS_BUSY_IRPT | SDHSTS_BLOCK_IRPT |
-			       SDHSTS_SDIO_IRPT | SDHSTS_DATA_FLAG)) {
-			bcm2835_irq(0, host);
-			bcm2835_threaded_irq(0, host);
-			return;
-		}
-	}
+	return 0;
 }
 
 static void bcm2835_set_clock(struct bcm2835_host *host, unsigned int clock)
@@ -864,8 +690,11 @@ static int bcm2835_send_cmd(struct udevice *dev, struct mmc_cmd *cmd,
 	}
 
 	/* Wait for completion of busy signal or data transfer */
-	while (host->use_busy || host->data)
-		bcm2835_irq_poll(host);
+	while (host->use_busy || host->data) {
+		ret = bcm2835_transmit(host);
+		if (ret)
+			break;
+	}
 
 	return ret;
 }
diff --git a/include/configs/rpi.h b/include/configs/rpi.h
index 69a22e17009a6ab7b31e288502efbb58a3669cb1..649a425bcdeaa20a06199875de378ebf49fdb8a4 100644
--- a/include/configs/rpi.h
+++ b/include/configs/rpi.h
@@ -95,39 +95,50 @@
  *
  * I suspect address 0 is used as the SMP pen on the RPi2, so avoid this.
  *
- * fdt_addr_r simply shouldn't overlap anything else. However, the RPi's
- *   binary firmware loads a DT to address 0x100, so we choose this address to
- *   match it. This allows custom boot scripts to pass this DT on to Linux
- *   simply by not over-writing the data at this address. When using U-Boot,
- *   U-Boot (and scripts it executes) typicaly ignore the DT loaded by the FW
- *   and loads its own DT from disk (triggered by boot.scr or extlinux.conf).
+ * Older versions of the boot firmware place the firmware-loaded DTB at 0x100,
+ * newer versions place it in high memory. So prevent U-Boot from doing its own
+ * DTB + initrd relocation so that we won't accidentally relocate the initrd
+ * over the firmware-loaded DTB and generally try to lay out things starting
+ * from the bottom of RAM.
  *
- * pxefile_addr_r can be pretty much anywhere that doesn't conflict with
- *   something else. Put it low in memory to avoid conflicts.
+ * kernel_addr_r has different constraints on ARM and Aarch64.  For 32-bit ARM,
+ * it must be within the first 128M of RAM in order for the kernel's
+ * CONFIG_AUTO_ZRELADDR option to work. The kernel itself will be decompressed
+ * to 0x8000 but the decompressor clobbers 0x4000-0x8000 as well. The
+ * decompressor also likes to relocate itself to right past the end of the
+ * decompressed kernel, so in total the sum of the compressed and and
+ * decompressed kernel needs to be reserved.
  *
- * kernel_addr_r must be within the first 128M of RAM in order for the
- *   kernel's CONFIG_AUTO_ZRELADDR option to work. Since the kernel will
- *   decompress itself to 0x8000 after the start of RAM, kernel_addr_r
- *   should not overlap that area, or the kernel will have to copy itself
- *   somewhere else before decompression. Similarly, the address of any other
- *   data passed to the kernel shouldn't overlap the start of RAM. Pushing
- *   this up to 16M allows for a sizable kernel to be decompressed below the
- *   compressed load address.
+ *   For Aarch64, the kernel image is uncompressed and must be loaded at
+ *   text_offset bytes (specified in the header of the Image) into a 2MB
+ *   boundary. The 'booti' command relocates the image if necessary. Linux uses
+ *   a default text_offset of 0x80000.  In summary, loading at 0x80000
+ *   satisfies all these constraints and reserving memory up to 0x02400000
+ *   permits fairly large (roughly 36M) kernels.
  *
- * scriptaddr can be pretty much anywhere that doesn't conflict with something
- *   else. Choosing 32M allows for the compressed kernel to be up to 16M.
+ * scriptaddr and pxefile_addr_r can be pretty much anywhere that doesn't
+ * conflict with something else. Reserving 1M for each of them at
+ * 0x02400000-0x02500000 and 0x02500000-0x02600000 should be plenty.
  *
- * ramdisk_addr_r simply shouldn't overlap anything else. Choosing 33M allows
- *   for any boot script to be up to 1M, which is hopefully plenty.
+ * On ARM, both the DTB and any possible initrd must be loaded such that they
+ * fit inside the lowmem mapping in Linux. In practice, this usually means not
+ * more than ~700M away from the start of the kernel image but this number can
+ * be larger OR smaller depending on e.g. the 'vmalloc=xxxM' command line
+ * parameter given to the kernel. So reserving memory from low to high
+ * satisfies this constraint again. Reserving 1M at 0x02600000-0x02700000 for
+ * the DTB leaves the rest of the free RAM to the initrd starting at 0x02700000.
+ * Even with the smallest possible CPU-GPU memory split of the CPU getting
+ * only 64M, the remaining 25M starting at 0x02700000 should allow quite
+ * large initrds before they start colliding with U-Boot.
  */
 #define ENV_MEM_LAYOUT_SETTINGS \
 	"fdt_high=ffffffff\0" \
 	"initrd_high=ffffffff\0" \
-	"fdt_addr_r=0x00000100\0" \
-	"pxefile_addr_r=0x00100000\0" \
-	"kernel_addr_r=0x01000000\0" \
-	"scriptaddr=0x02000000\0" \
-	"ramdisk_addr_r=0x02100000\0" \
+	"kernel_addr_r=0x00080000\0" \
+	"scriptaddr=0x02400000\0" \
+	"pxefile_addr_r=0x02500000\0" \
+	"fdt_addr_r=0x02600000\0" \
+	"ramdisk_addr_r=0x02700000\0"
 
 #define BOOT_TARGET_DEVICES(func) \
 	func(MMC, mmc, 0) \