diff --git a/cMIPS/tests/ll_sc.expected b/cMIPS/tests/ll_sc.expected
index 623e40a4d09333c359175d4b43df96a68e280554..62dee1bece51849f9c574b556bddba6b95de6b3e 100644
--- a/cMIPS/tests/ll_sc.expected
+++ b/cMIPS/tests/ll_sc.expected
@@ -15,3 +15,9 @@
 00000001
 00000001
 00000000
+
+ffffffff
+000000ff
+
+ffffffff
+ok
diff --git a/cMIPS/tests/ll_sc.s b/cMIPS/tests/ll_sc.s
index 2611de3371bfa63b499f9e202f56174faf5eeb88..b23653f345b677bc8943c79a966dee39fffab64a 100644
--- a/cMIPS/tests/ll_sc.s
+++ b/cMIPS/tests/ll_sc.s
@@ -3,9 +3,8 @@
 	.text
 	.align 2
 	.set noreorder
-	.global _start
-	.global _exit
-	.global exit
+	.global _start, _exit, exit
+	
 	.ent _start
 _start: nop
 	li   $k0,0x10000002     # RESET_STATUS, kernel mode, all else disabled
@@ -71,8 +70,8 @@ L:	ll    $t1, 0($t0)     # load-linked
 
 fwd:	addi $t2, $t1, 1   # increment value read by LL
 	sc   $t2, 0($t0)   # try to store, checking for atomicity
-	sw   $t2, 0($15)   # prints 0000.0001 if SC succeeds
 	addiu $t0,$t0,4    # use a new address in each round
+	sw   $t2, 0($15)   # prints 0000.0001 if SC succeeds
 	beq  $t2, $zero, L # if not atomic (0), try again, does not print 4
 	sw   $zero, 0($t0) # store zero to new address
 
@@ -84,12 +83,81 @@ fwd:	addi $t2, $t1, 1   # increment value read by LL
 	lw   $t2, 0($t0)   # print untouched location = 0000.0000
 	sw   $t2, 0($15)
 	nop
-	j exit
+
+	##
+	## do a SC to the same address as ll -- must succeed
+	##
+
+test1:	li   $30, '\n'     	# print a blank line to separate tests
+        sw   $30, x_IO_ADDR_RANGE($15)
+
+	la  $t0, x_DATA_BASE_ADDR	# $t0 = address of data[0]
+	li  $a0, 0xffffffff
+	sw  $a0, 0($t0)		# data[0] = 0xffffffff
+
+	ll  $a1, 0($t0)		# load-linked from data[0]
+	nop 
+	sw  $a1, 0($15)		# print value read by ll (ffffffff)
+	nop
+	li  $a2, 256-1		# value to be stored = 0xff
+	sc  $a2, 0($t0)		# same address -- must succeed
+	
+	beq $a2, $zero, error	# sc left 0 in $a2: store failed, test fails
 	nop
+	lw  $a3, 0($t0)		# read back data[0]
+	sw  $a3, 0($15) 	# print out 0xff	
+
+	
+	##
+	## try to sc to a different address from ll -- must fail
+	##
+	
+test2:	li   $30, '\n'     	# print a blank line to separate tests
+        sw   $30, x_IO_ADDR_RANGE($15)
+	
+	la  $t0, x_DATA_BASE_ADDR
+	li  $a0, 0xffffffff	# store -1 to data[0]
+	li  $a1, 0x88442211	# store 88442211 to data[1]
+	sw  $a0, 0($t0)
+	sw  $a1, 4($t0) 	# address to store_c != addr to load_l
+	
+	ll  $a1, 0($t0)	 	# load-linked from data[0]
+	li  $a2, 4096-1 	# attempt to write 0x0fff to data[1]
+	sw  $a1, 0($15)		# display data[0]
+	nop
+	sc  $a2, 4($t0)	 	# different address from ll -- must fail
+
+	beq $a2, $zero, fail_ok	# $a2 == 0: sc reported failure, as it must
+	nop
+
+succ_nok: 			# should never come here: sc reported success
+	lw  $a3, 4($t0)		# read back data[1]
+	sw  $a3, 0($15)		# print out the value now held in data[1]
+	j   error		# $a2 holds the sc status (non-zero), not the data,
+	nop			#   so reaching this point is always a failure
+
+	
+fail_ok:			# sc ought to have failed, which is good
+	li   $30, 'o'		# print "ok\n", one character per store
+        sw   $30, x_IO_ADDR_RANGE($15)
+        li   $30, 'k'
+	sw   $30, x_IO_ADDR_RANGE($15)	
+        li   $30, '\n'              # newline that ends the "ok" line
+	j exit
+        sw   $30, x_IO_ADDR_RANGE($15)	# placed after the jump: delay slot (.set noreorder)
+	
+		
+error:  li   $30, 'e'		# print "error\n", one character per store
+        sw   $30, x_IO_ADDR_RANGE($15)
+        li   $30, 'r'
+        sw   $30, x_IO_ADDR_RANGE($15)	# 'r'
+        sw   $30, x_IO_ADDR_RANGE($15)	# 'r'
+        li   $31, 'o'		# $31 is used as a second scratch register here
+        sw   $31, x_IO_ADDR_RANGE($15)
+        sw   $30, x_IO_ADDR_RANGE($15)	# $30 still holds 'r'
+        li   $31, '\n'              # newline that ends the "error" line
+	j exit
+        sw   $31, x_IO_ADDR_RANGE($15)	# placed after the jump: delay slot (.set noreorder)
 	
 	.end main
 
-#	.data
-#	.align 2
-#area:	.space 64,0
-	
\ No newline at end of file
diff --git a/cMIPS/vhdl/core.vhd b/cMIPS/vhdl/core.vhd
index 28e6f9b5743c428bf2737b07e2ec0b9097a7a8d6..cd8c84112a0bbd5ecbb6d3965fe3ca1ab297fe03 100644
--- a/cMIPS/vhdl/core.vhd
+++ b/cMIPS/vhdl/core.vhd
@@ -151,7 +151,7 @@ architecture rtl of core is
   signal trap_instr,EX_trap_instr: instr_type;
   signal RF_PC,EX_PC,MM_PC,WB_PC, LLaddr: reg32;
   signal ll_sc_bit, MM_LLbit,WB_LLbit: std_logic;
-  signal LL_update,LL_SC_abort,LL_SC_differ: std_logic;
+  signal LL_update, LL_SC_abort, LL_SC_abort_d, LL_SC_differ: std_logic;
   signal EX_trapped, MM_trapped, EX_ovfl,MM_ex_ovfl, trap_taken: boolean;
   signal int_req, MM_int_req: reg8;
   signal EX_nmi,MM_nmi : std_logic;
@@ -294,7 +294,7 @@ architecture rtl of core is
   signal PCsel : reg2;
   signal excp_PCsel : reg3;
 
-  signal rom_stall, iaVal, if_stalled, stalled : std_logic;
+  signal rom_stall, iaVal, if_stalled, mem_stall, pipe_stall : std_logic;
   signal ram_stall, daVal, mm_stalled : std_logic;
   signal br_target, br_addend, br_tgt_pl4, br_tgt_displ, j_target : reg32;
   signal RF_PCincd, RF_instruction : reg32;
@@ -709,7 +709,7 @@ begin
 
   -- INSTR_FETCH_STATE_MACHINE: instruction-bus control
   U_ifetch_stalled: FFD port map (clk => phi2, rst => rst, set => '1',
-                                  D => stalled, Q => if_stalled);
+                                  D => mem_stall, Q => if_stalled);
 
   -- iaVal <= '1' when ((phi0 = '1' and if_stalled = '0')) else '0';
   
@@ -718,37 +718,29 @@ begin
   
   rom_stall <= not(iaVal) and not(i_wait);
 
-  stalled <= ram_stall or rom_stall;
-  not_stalled <= not(stalled);
+  mem_stall   <= ram_stall or rom_stall;
+  not_stalled <= not(mem_stall);
 
   -- end INSTR_FETCH_STATE_MACHINE --------------------------
   
  
   -- PROGRAM COUNTER AND INSTRUCTION FETCH ------------------
 
-  PCload <= '1' when ( (rom_stall = '1') or (ram_stall = '1') or
-                       (jr_stall = '1')  or (br_stall = '1')  or
-                       (sw_stall = '1')  or (tr_stall = '1')  or
-                       (exception_stall = '1') )
-            else '0';
-  IF_RF_ld <= '1' when ( (rom_stall = '1') or (ram_stall = '1') or
-                         (jr_stall = '1')  or (br_stall = '1')  or
-                         (sw_stall = '1')  or (tr_stall = '1')  or
-                         (exception_stall = '1') )
-              else '0';
-  RF_EX_ld <= rom_stall or ram_stall; -- or exception_stall;
-  EX_MM_ld <= rom_stall or ram_stall;
-  MM_WB_ld <= rom_stall or ram_stall;
-
-  
-  excp_IF_RF_ld <= '1' when ( (rom_stall = '1') or (ram_stall = '1') or
-                              (jr_stall = '1')  or (br_stall = '1')  or
-                              (sw_stall = '1')  or (tr_stall = '1')  or
-                              (exception_stall = '1') )
-                   else '0';
-  excp_RF_EX_ld <= rom_stall or ram_stall; -- or exception_stall;
-  excp_EX_MM_ld <= rom_stall or ram_stall;
-  excp_MM_WB_ld <= rom_stall or ram_stall;
+  pipe_stall <= rom_stall or ram_stall or jr_stall or br_stall or
+                sw_stall  or tr_stall  or exception_stall;
+  -- pipe_stall collects every stall source; it drives PC and IF/RF only.
+  -- The later pipeline registers are driven by mem_stall alone.
+  PCload   <= '1' when pipe_stall = '1' else '0';
+  IF_RF_ld <= '1' when pipe_stall = '1' else '0';
+  RF_EX_ld <= mem_stall; -- or exception_stall;
+  EX_MM_ld <= mem_stall;
+  MM_WB_ld <= mem_stall;
+  -- NOTE(review): the *_ld signals are '1' while stalled -- presumably
+  -- active-low load enables; confirm against the pipeline-register entities.
+  excp_IF_RF_ld <= '1' when pipe_stall = '1' else '0';
+  excp_RF_EX_ld <= mem_stall; -- or exception_stall;
+  excp_EX_MM_ld <= mem_stall;
+  excp_MM_WB_ld <= mem_stall;
 
 
   with PCsel select
@@ -945,44 +937,63 @@ begin
 
   RF_FORWARDING_BRANCH: process (a_rs,a_rt,EX_wreg,EX_a_c,MM_wreg,MM_a_c,
                                  MM_aVal,MM_result,MM_cop0_val,MM_mfc0,
-                                 regs_A,regs_B,is_branch)
+                                 regs_A,regs_B,is_branch,
+                                 EX_exception, LL_SC_abort)
+    variable rs_stall, rt_stall : boolean;  -- stall request per branch operand
   begin
-    br_stall <= '0';
 
     if ( (is_branch = '1') and          -- forward_A
          (EX_wreg = '0') and (EX_a_c = a_rs) and (EX_a_c /= b"00000") ) then
-      br_stall <= '1';
-      eq_fwd_A <= regs_A;
-    elsif ( (MM_wreg = '0') and (MM_a_c = a_rs) and (MM_a_c /= b"00000") ) then
+      if EX_exception = exSC then       -- SC in EX: forward its status word
+        eq_fwd_A <= x"0000000" & b"000" & not(LL_SC_abort);  -- 1 = SC succeeds
+        rs_stall := FALSE;
+      else                              -- any other producer in EX: stall
+        eq_fwd_A <= regs_A;
+        rs_stall := TRUE;
+      end if;
+    elsif ( (MM_wreg = '0') and (MM_a_c = a_rs) and (MM_a_c /= b"00000") ) then
       if ( (MM_aVal = '0') and (is_branch = '1') ) then   -- LW load-delay slot
-        br_stall <= '1';
+        rs_stall := TRUE;
         eq_fwd_A <= regs_A;
       elsif MM_mfc0 then          -- non-LW
         eq_fwd_A <= MM_cop0_val;
+        rs_stall := FALSE;
       else
-        eq_fwd_A <= MM_result; 
+        eq_fwd_A <= MM_result;
+        rs_stall := FALSE;
       end if;
     else
       eq_fwd_A <= regs_A;
+      rs_stall := FALSE;
     end if;
 
     if ( (is_branch = '1') and          -- forward_B
          (EX_wreg = '0') and (EX_a_c = a_rt) and (EX_a_c /= b"00000") ) then
-      br_stall <= '1';
-      eq_fwd_B <= regs_B;
+      if EX_exception = exSC then       -- SC in EX: forward its status word
+        eq_fwd_B <= x"0000000" & b"000" & not(LL_SC_abort);  -- drive B, not A
+        rt_stall := FALSE;              -- rt path owns rt_stall
+      else                              -- any other producer in EX: stall
+        eq_fwd_B <= regs_B;
+        rt_stall := TRUE;
+      end if;
     elsif ( (MM_wreg = '0') and (MM_a_c = a_rt) and (MM_a_c /= b"00000") ) then
       if ( (MM_aVal = '0') and (is_branch = '1') ) then   -- LW load-delay slot
-        br_stall <= '1';
+        rt_stall := TRUE;
         eq_fwd_B <= regs_B;
       elsif MM_mfc0 then          -- non-LW
         eq_fwd_B <= MM_cop0_val;
+        rt_stall := FALSE;
       else
         eq_fwd_B <= MM_result;
+        rt_stall := FALSE;
       end if;
     else
       eq_fwd_B <= regs_B;
+      rt_stall := FALSE;
     end if;
 
+    br_stall <= BOOL2SL(rs_stall or rt_stall);  -- stall if either operand waits
+  
   end process RF_FORWARDING_BRANCH;
 
   
@@ -1361,11 +1372,13 @@ begin
 
   EX_wrmem_cond <= EX_wrmem
                    or BOOL2SL(abort_ref)  -- abort write if exception in MEM
+                   or LL_SC_abort         -- SC is to be killed
                    or ( BOOL2SL(nullify) and not(MM_is_delayslot) );
                                           -- abort memWrite if exception in EX
 
   EX_aVal_cond <= EX_aVal
                   or BOOL2SL(abort_ref)  -- abort ref if exception in MEM
+                  or LL_SC_abort         -- SC is to be killed
                   or ( BOOL2SL(nullify) and not(MM_is_delayslot) );
                                          -- abort memRef if previous excep in EX
 
@@ -1389,15 +1402,15 @@ begin
 
   -- DATA_BUS_STATE_MACHINE: data-bus control
   U_dmem_stalled: FFD port map (clk => phi2, rst => rst, set => '1',
-                                D => stalled, Q => mm_stalled);
+                                D => mem_stall, Q => mm_stalled);
 
-  d_aVal <= MM_aVal or LL_SC_abort;  -- interface signal/port
-  daVal  <= MM_aVal or LL_SC_abort;  -- internal signal
+  d_aVal <= MM_aVal;  -- interface signal/port
+  daVal  <= MM_aVal;  -- internal signal
   
   ram_stall <= not(daVal) and not(d_wait);
   -- end DATA_BUS_STATE_MACHINE -------------------------------------
  
-  wr <= MM_wrmem or LL_SC_abort;                -- abort write if SC fails
+  wr <= MM_wrmem;                -- no SC gating here: LL_SC_abort is OR-ed into EX_wrmem_cond
 
   
   rd_data_raw <= data_inp when (MM_wrmem = '1' and MM_aVal = '0') else
@@ -2353,16 +2366,19 @@ begin
     port map (clk, rst, BadVAddr_update, BadVAddr_inp, BadVAddr);
 
 
-  -- LLaddr & LLbit ------------------------------
+  -- LLaddr & LLbit --------------------------------------------------
+  -- check address of SC at stage EX, in time to kill memory reference
+  
   LL_update <= '0' when (update = '1' and update_reg = cop0reg_LLAddr)
                else '1';
 
-  COP0_LLaddr: register32 generic map(x"00000000")
-    port map (clk, rst, LL_update, MM_result, LLaddr);
+  COP0_LLaddr: register32 generic map(x"00000000")      -- update at MM
+    port map (clk, rst, LL_update, MM_v_addr, LLaddr);
 
-  LL_SC_differ <= '0' when (MM_v_addr = LLaddr) else '1';
+  LL_SC_differ <= '0' when (v_addr = LLaddr) else '1';  -- check at EX
 
-  LL_SC_abort  <= (LL_SC_differ or not(MM_LLbit)) when (is_exception = exSC)
+  LL_SC_abort  <= (LL_SC_differ or not(ll_sc_bit))
+                  when (EX_exception = exSC) --  and pipe_stall = '0')
                   else '0';
   
   COP0_LLbit: process(rst,clk)
@@ -2374,15 +2390,16 @@ begin
         when exERET =>
           ll_sc_bit <= '0';            -- break SC -> LL
         when exLL =>
-          ll_sc_bit <= not LL_update;  -- update only if instr is a LL
+          ll_sc_bit <= not LL_update;  -- update only if instr is an LL
         when others =>
           null;
       end case;
-
     end if;
   end process COP0_LLbit;
 
-  MM_llbit <= ll_sc_bit and not(LL_SC_abort);  
+  U_DELAY_LL_SC_ABORT: FFD port map (clk, rst, '1', LL_SC_abort, LL_SC_abort_d);
+
+  MM_llbit <= ll_sc_bit and not(LL_SC_abort_d);
 
   
   -- MMU-TLB ===============================================================