diff --git a/cMIPS/docs/cMIPS.pdf b/cMIPS/docs/cMIPS.pdf
index 8092a007ecab134b226874eee489e3cdfc1d8bf9..132dcc6716f71839a57c6520fa453512337bedb6 100644
Binary files a/cMIPS/docs/cMIPS.pdf and b/cMIPS/docs/cMIPS.pdf differ
diff --git a/cMIPS/include/cMIPSio.c b/cMIPS/include/cMIPSio.c
index 5554749aa652ebc53cb54ddbb2752a9d8e9c3ced..c1f74aac4eee700bc3ba85524a948bdb49e877d7 100644
--- a/cMIPS/include/cMIPSio.c
+++ b/cMIPS/include/cMIPSio.c
@@ -26,8 +26,9 @@ void print(int n) {
 // write a character to VHDL simulator's standard output
 void to_stdout(char c) {
   int *IO = (int *)IO_STDOUT_ADDR;
-
-  *IO = c; // prints only after receiving a '\0' or a '\n' (line-feed, 0x0a)
+  
+  // the line is printed only after a '\0' or a '\n' (line-feed, 0x0a) is written
+  *IO = (unsigned char)c;  // cast: value stored is never negative (no sign extension of c)
 }
 
 // read a character from VHDL simulator's standard input
@@ -91,8 +92,9 @@ void dumpRAM(void) {
 // system statistics -- read system counters
 //=======================================================================
 void readStats(sStats *s) {
-  int *IO = (int *)IO_STATS_ADDR;
 #if 0
+  int *IO = (int *)IO_STATS_ADDR;
+
   s->dc_ref    = *(IO+0);
   s->dc_rd_hit = *(IO+1);
   s->dc_wr_hit = *(IO+2);
diff --git a/cMIPS/tests/jr_2.expected b/cMIPS/tests/jr_2.expected
index 1c70bb742e984c02ccb0a4f688f19ddba74474f2..09972f8846e0b03bff8ff1f4fe69a02aa3bf4fba 100644
--- a/cMIPS/tests/jr_2.expected
+++ b/cMIPS/tests/jr_2.expected
@@ -7,3 +7,12 @@
 00000054
 00000060
 0000006c
+0000000c
+00000018
+00000024
+00000030
+0000003c
+00000048
+00000054
+00000060
+0000006c
diff --git a/cMIPS/tests/jr_2.s b/cMIPS/tests/jr_2.s
index 4a853a64b82d8f8d14cfc33b9033fe6f070233da..b5f9208e279f8090eaf569c7935e6bc19c6ebbb7 100644
--- a/cMIPS/tests/jr_2.s
+++ b/cMIPS/tests/jr_2.s
@@ -6,25 +6,60 @@
 	.ent _start
 _start: la $16, x_IO_BASE_ADDR
 	la $15,(x_DATA_BASE_ADDR+0x10)
+
+	##
+	## let's check stalls for add r1 ; jr r1
+	##
+
 	la $5, snd
-	li $3,1
-	li $4,5
-	addi  $29,$0,100
-	move  $8,$zero
-snd:	#sw   $31, 0($16) # $31 <- 0,snd+4
-	add  $8,$8,$3    # $8  <-  1, 7,13,19,25,31,
-	add  $8,$8,$4    # $8  <-  6,12,18,24,30,36,
-	add  $9,$8,$8    # $9  <- 12,24,36,48,60,72,
-	sw   $9, 4($16)
-	slt  $28,$9,$29
-        beq  $28,$0,.L1
+	li $3, 1
+	li $4, 5
+	addi  $29, $0, 100
+	move  $8, $zero
+snd:	#sw   $31, 0($16)  # $31 <- 0,snd+4
+	add  $8, $8, $3    # $8  <-  1, 7,13,19,25,31,
+	add  $8, $8, $4    # $8  <-  6,12,18,24,30,36,
+	add  $9, $8, $8    # $9  <- 12,24,36,48,60,72,
+	sw   $9, 0($16)
+	slt  $28, $9, $29
+        beq  $28, $0, trd
 	nop
-	add  $9,$0,$5
+	add  $9, $0, $5
 	jr   $9
 	nop
+
+	##
+	## now let's check stalls for lw r1 ; jr r1
+	##
+	
+trd:	la   $10, loop	# $10 <- address of loop
+	la   $11, addr	# $11 <- address of the word that will hold it
+	sw   $10, 0($11)	# addr <- address of loop
+	move $8, $zero	# restart the running sum
+	li   $3, 1
+	li   $4, 5
+	
+loop:	
+	add  $8, $8, $3    # $8  <-  1, 7,13,19,25,31,
+	add  $8, $8, $4    # $8  <-  6,12,18,24,30,36,
+	add  $9, $8, $8    # $9  <- 12,24,36,48,60,72,
+	sw   $9, 0($16)	# write $9 to x_IO_BASE_ADDR
+	slt  $28, $9, $29	# $28 <- ($9 < $29), $29 was set to 100 above
+        beq  $28, $0, .L1	# leave once $9 >= $29
+	nop
+	la   $11, addr	# $11 <- address of the saved loop address
+	lw   $9, 0($11)	# $9 <- address of loop, read back from memory
+	jr   $9		# jr on the register just loaded (lw r1 ; jr r1)
+	nop
+
 .L1:
 end:	nop
 	nop
 	wait
 	nop
 	.end _start
+
+	.data
+	.align 4
+	.space 128
+addr:	.word  0 
diff --git a/cMIPS/tests/lwFWDsw.s b/cMIPS/tests/lwFWDsw.s
index d6745c3ced471a26f50d784b8870bbfd89205cc7..242839113f5d8277a3724b135836a354fdcc709b 100644
--- a/cMIPS/tests/lwFWDsw.s
+++ b/cMIPS/tests/lwFWDsw.s
@@ -4,20 +4,22 @@
 	.set noreorder
 	.globl _start
 	.ent _start
+
 _start:	nop
-	la  $15, x_DATA_BASE_ADDR + 0x10
-	la  $16, x_IO_BASE_ADDR
-	addi  $3,$0,-10
-	ori   $5,$0,4
-        addi  $9,$0,10
+	la    $15, x_DATA_BASE_ADDR
+	la    $16, x_IO_BASE_ADDR
+	addi  $3, $0, -10
+	ori   $5, $0, 4
+        addi  $9, $0, 10
 	nop
-snd:	sw   $3, 4($15)
-	addi $3,$3,1
-	lw   $4, 4($15)
-	sw   $4, 0($16)
-	add  $15,$15,$5
-	slt  $8,$3,$9
-        bne  $8,$0,snd
+
+snd:	sw   $3, 4($15)		# mem[i+1] <= count
+	addi $3, $3, 1		# count ++
+	lw   $4, 4($15)		# $4 <= mem[i+1] (value stored before the increment)
+	sw   $4, 0($16)		# print $4
+	add  $15, $15, $5	# i++ ($5 = 4, one word)
+	slt  $8, $3, $9		# count < 10 ?  (count starts at -10)
+        bne  $8, $0, snd	#    yes, continue
         nop
         wait
         nop
diff --git a/cMIPS/tests/lwFWDsw2.s b/cMIPS/tests/lwFWDsw2.s
index 2c3d3784bfd2b06160d13c2ce20562192493bbdc..72119a0bceccf200ad0331c4b8b189956eee853b 100644
--- a/cMIPS/tests/lwFWDsw2.s
+++ b/cMIPS/tests/lwFWDsw2.s
@@ -5,12 +5,12 @@
 	.globl _start
 	.ent _start
 _start:	nop
-	la    $17, (x_DATA_BASE_ADDR) # base address of RAM
+	la    $17, x_DATA_BASE_ADDR # base address of RAM
 	addiu $15, $17, 4*4       # $15 <- &RAM[4]
 	la    $16, x_IO_BASE_ADDR # address to print out results
-	addi  $3,$0,-10           # value to print = -10
-	addi  $5,$0,4             # scan from RAM[4]..RAM[24]
-        addi  $9,$0,10            # stop when done 20 loops = +10
+	addi  $3, $0, -10         # value to print = -10
+	addi  $5, $0, 4           # scan from RAM[4]..RAM[24]
+        addi  $9, $0, 10          # stop when done 20 loops = +10
 	sw    $15, 0($17)         # save pointer to RAM[0]
 	sw    $15, 0($16)         #  and print it out
 	nop
diff --git a/cMIPS/tests/lwsw.s b/cMIPS/tests/lwsw.s
index fa7cd5181a17cb578a92f59c47bf3eed9a25096b..b6040230defe241ce8b3d0c7cad30a5655869dd3 100644
--- a/cMIPS/tests/lwsw.s
+++ b/cMIPS/tests/lwsw.s
@@ -5,17 +5,21 @@
 	.ent _start
 _start:	la    $15, (x_DATA_BASE_ADDR+0x10)
 	la    $16, x_IO_BASE_ADDR
-	addi  $3,$0,-10
-	ori   $5,$0,4
+	addi  $3, $0, -10
+	ori   $5, $0, 4
 	nop
+	
 snd:	sw   $3, 4($15)
-	addi $3,$3,1
+	addi $3, $3, 1
 	lw   $4, 4($15)
-	add  $15,$15,$5
+	add  $15, $15, $5
 	sw   $4, 0($16)
-	bne  $3,$0,snd
+	bne  $3, $0, snd
 	nop
 	wait
 	nop
 	nop
 	.end _start
+
+	# expected output: fffffff6 fffffff7 fffffff8 fffffff9 fffffffa fffffffb fffffffc fffffffd fffffffe ffffffff
+
diff --git a/cMIPS/tests/lwswIncr.s b/cMIPS/tests/lwswIncr.s
index 229577bfac4d22f7b30b525a14a11ef26159aa50..d97431733f8e29ca1f77216214a6338473b91c1c 100644
--- a/cMIPS/tests/lwswIncr.s
+++ b/cMIPS/tests/lwswIncr.s
@@ -3,22 +3,22 @@
 	.align 2
 	.globl _start
 	.ent _start
-_start: la  $15, x_IO_BASE_ADDR
-	la  $16, x_IO_BASE_ADDR
-	la  $14, x_DATA_BASE_ADDR
-	addi  $3,$0,-16
-	ori   $5,$0,2
-	la   $29,(x_IO_BASE_ADDR+0x40)
+_start: la    $15, 0		# start
+	la    $29, 0x40		# end
+	la    $16, x_IO_BASE_ADDR
+	la    $14, x_DATA_BASE_ADDR
+	addi  $3, $0, -16
+	addi  $5, $0, 2
 	nop
-	nop
-snd:	add  $3,$5,$3
-	sw   $3, 0($14)
-	addi $14,$14,4
-	lw   $3, -4($14)
-	addi $15,$15,4
-	sw   $3, ($16)
-	slt  $30,$15,$29
-	bne  $30,$0,snd
+
+snd:	add  $3, $5, $3		# count += 2 ($5 = 2)
+	sw   $3, 0($14)		# mem[i] <= count
+	addi $14, $14, 4	# i++
+	lw   $3, -4($14)	# $3 <= mem[i-1] (the word just stored)
+	addi $15, $15, 4	# offset += 4
+	sw   $3, 0($16)		# print count
+	slt  $30, $15, $29	# offset < end (0x40) ?
+	bne  $30, $0, snd	#   yes, continue
 	nop
 	wait
 	nop
@@ -26,5 +26,3 @@ snd:	add  $3,$5,$3
 	
 
 # fffffff2 fffffff4 fffffff6 fffffff8 fffffffa fffffffc fffffffe 00000000 00000002 00000004 00000006 00000008 0000000a 0000000c 0000000e 00000010
-
-	
\ No newline at end of file
diff --git a/cMIPS/tests/swlw.s b/cMIPS/tests/swlw.s
index 081056452fdc423c7553449d9e13c1c4374a7c6d..fa9428c4a7f402767e5b0d12fe0e76a8d5af9d6d 100644
--- a/cMIPS/tests/swlw.s
+++ b/cMIPS/tests/swlw.s
@@ -6,19 +6,19 @@
 	.ent _start
 _start: la    $15, (x_DATA_BASE_ADDR+0x10)
 	la    $16, x_IO_BASE_ADDR
-	addi  $3,$0,10
-	ori   $5,$0,2
-        addi  $29,$0,800
-	sw    $5, -4($15)
+	addi  $3, $0, 10
+	ori   $5, $0, 2		# count = 2
+        addi  $29, $0, 800
+	sw    $5, -4($15)	# mem[i-1] <= count
 	nop
-snd:	add  $3,$5,$3
-	sw   $3, 4($15)
-	lw   $4, -4($15)
-	lw   $9, 4($15)
-	add  $5,$5,$5   #  2, 4, 8,16,32,64,128,256,512,1024
-	sw   $9, 0($16) # 10,12,16,24,40,72,136,264,520,1032
-        slt  $28,$9,$29
-        bne  $28,$0,snd
+snd:	add  $3, $5, $3		# $3 <= $3 + count (running sum, starts at 10)
+	sw   $3, 4($15)		# mem[i+1] <= $3
+	lw   $4, -4($15)	# $4 <= mem[i-1]
+	lw   $9, 4($15)		# $9 <= mem[i+1] (the sum just stored)
+	add  $5, $5, $5		# count *= 2 : 4,8,16,32,64,128,256,512,1024
+	sw   $9, 0($16)		# print: 12,16,24,40,72,136,264,520,1032
+        slt  $28, $9, $29	# $9 less than 800?
+        bne  $28, $0, snd	#   yes, continue
 	nop
 	nop
 	nop
diff --git a/cMIPS/vhdl/core.vhd b/cMIPS/vhdl/core.vhd
index 7aafb4df5241ebb32c01841e464028e3a80ee607..4b1ca9481fc59287dd97020bb57ddf7666c795f7 100644
--- a/cMIPS/vhdl/core.vhd
+++ b/cMIPS/vhdl/core.vhd
@@ -304,6 +304,7 @@ architecture rtl of core is
   signal br_target, br_addend, br_tgt_pl4, br_tgt_displ, j_target : reg32;
   signal RF_PCincd, RF_instruction : reg32;
   signal eq_fwd_A,eq_fwd_B : reg32;
+  signal dbg_jr_stall: integer;         -- debugging only
   
   -- register fetch/read and instruction decode --  
   component reg_IF_RF is
@@ -523,12 +524,12 @@ architecture rtl of core is
     ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--39
     ('0','0',SB,   '1','1','0',opADD,"001","00", '1', "00",cNOP,"11"),--sb=40
     ('0','0',SH,   '1','1','0',opADD,"001","00", '1', "00",cNOP,"11"),--sh=41
-    ('1','1',NIL,  '1','1','0',opNOP,"001","00", '0', "00",cNOP,"00"),--swl=42
+    ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--swl=42
     ('0','0',SW,   '1','1','0',opADD,"001","00", '1', "00",cNOP,"11"),--sw=43
     ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--44
     ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--45
-    ('1','1',NIL,  '1','1','0',opNOP,"001","00", '0', "00",cNOP,"00"),--swr=46
-    ('1','1',NIL,  '1','1','0',opNOP,"001","00", '0', "00",cNOP,"00"),--cache=47
+    ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--swr=46
+    ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--cache=47
     ('0','1',LL,   '0','1','0',opADD,"000","01", '1', "00",cNOP,"11"),--ll=48
     ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--lwc1=49
     ('1','1',NIL,  '1','0','0',opNOP,"001","00", '0', "00",cNOP,"00"),--lwc2=50
@@ -907,19 +908,34 @@ begin
   j_target <= RF_PCincd(31 downto 28) & RF_instruction(25 downto 0) & b"00";
 
-  RF_JR_STALL: process (funct_word,a_rs,EX_a_c,MM_a_c,EX_wreg,MM_wreg)
+  -- Raises jr_stall for a jump-register (funct_word.PCsel = "11") whose
+  -- source a_rs matches EX_a_c or MM_a_c; dbg_jr_stall records which case
+  -- fired (1 = load-delay slot, 2 = ALU hazard, 3 = 2nd load-delay slot,
+  -- 0 = no stall).  Case 3 reads MM_aVal, so MM_aVal must be in the
+  -- sensitivity list or simulation keeps a stale jr_stall when it changes.
+  RF_JR_STALL: process (funct_word,a_rs,EX_a_c,MM_a_c,EX_wreg,MM_wreg,MM_aVal)
+    variable i_dbg_jr_stall : integer := 0;  -- debug only
   begin
     if ( (funct_word.PCsel = b"11")and          -- load-delay slot
             (EX_a_c /= a_rs)and(EX_wreg = '0')and
             (MM_a_c =  a_rs)and(MM_wreg = '0')and(MM_a_c /= b"00000") ) then
       jr_stall <= '1';
+      i_dbg_jr_stall := 1;
     elsif ( (funct_word.PCsel = b"11")and       -- ALU hazard
          (EX_a_c =  a_rs)and(EX_wreg = '0')and(EX_a_c /= b"00000") ) then
       jr_stall <= '1';
+      i_dbg_jr_stall := 2;
+    elsif ( (funct_word.PCsel = b"11")and       -- 2nd load-delay slot
+            (MM_a_c = a_rs)and(MM_wreg = '0')and(MM_a_c /= b"00000") and
+            (MM_aVal = '0') ) then
+      jr_stall <= '1';
+      i_dbg_jr_stall := 3;
     else
       jr_stall <= '0';
-    end if; 
+      i_dbg_jr_stall := 0;
+    end if;
+    dbg_jr_stall <= i_dbg_jr_stall;
   end process RF_JR_STALL;
-
+  
   
   RF_SW_STALL: process (ctrl_word,a_rs,EX_a_c,EX_wreg,EX_is_load)
     variable is_store : boolean := false;
diff --git a/cMIPS/vhdl/tb_cMIPS.vhd b/cMIPS/vhdl/tb_cMIPS.vhd
index bbb4f6ed973b37c78f3118fabfd2c2ad7177eda9..93ff89519885a721aa51d719114c61f57e909681 100644
--- a/cMIPS/vhdl/tb_cMIPS.vhd
+++ b/cMIPS/vhdl/tb_cMIPS.vhd
@@ -632,17 +632,16 @@ entity inst_addr_decode is              -- CPU side triggers access
         cpu_i_aVal  : in  std_logic;    -- CPU instr addr valid (act=0)
         addr        : in  reg32;        -- CPU address
         aVal        : out std_logic);   -- decoded address in range (act=0)
-  constant LO_ADDR  : integer := 0;
-  constant HI_ADDR  : integer := log2_ceil(INST_MEM_SZ);
 end entity inst_addr_decode;
 
 architecture behavioral of inst_addr_decode is
+  constant HI_ADDR : integer := HI_SEL_BITS;  -- highest address bit compared
+  constant LO_ADDR : integer := log2_ceil(INST_BASE_ADDR + INST_MEM_SZ);  -- lowest address bit compared
+  constant PREFIX : std_logic_vector(HI_ADDR downto LO_ADDR) := (others=>'0');  -- addr(HI_ADDR downto LO_ADDR) must be all zeros to select
 begin
 
   aVal <= '0' when ( cpu_i_aVal = '0' and rst = '1'
-                     and (addr(HI_SEL_BITS downto LO_SEL_BITS)
-                          =
-                          x_INST_BASE_ADDR(HI_SEL_BITS downto LO_SEL_BITS)) )
+                     and (addr(HI_ADDR downto LO_ADDR) = PREFIX) )
           else '1';
   
 end architecture behavioral;