LWZUX - Load Word and Zero with Update Indexed | PowerPC Instruction Set Reference

Instruction Syntax

Mnemonic	Format	Flags
lwzux	rD,rA,rB	-

Instruction Encoding

Field	Bits	Description
Primary Opcode	0-5	011111 (0x1F)
rD	6-10	Destination register
rA	11-15	Source register A
rB	16-20	Source register B
XO	21-30	55 (Extended opcode)
Rc	31	Reserved (0)

Operation

EA ← (rA) + (rB)
rD ← MEM(EA, 4)
rA ← EA

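A word (32 bits) is loaded from the effective address and placed in register rD; on 64-bit implementations the upper 32 bits of rD are cleared (the "zero" in the mnemonic). The effective address is computed by adding the contents of registers rA and rB. The effective address, i.e. the address the word was loaded from, is then written back into register rA. In other words, the base register is advanced by (rB) and the load uses the advanced address (pre-update), so after the instruction rA points at the word that was just read.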

Note: The instruction form is invalid if rA=0 or if rA=rD. The update form requires a real base register, and rA must differ from rD because both registers are written by the instruction. This is the most flexible addressing mode for word loads, combining indexed addressing with automatic pointer update. It is useful for data structure traversal and array processing where the stride is held in a register and can change at run time.

Affected Registers

rD - Loaded with the word read from memory.
rA - Updated with the effective address used by the load.

For more information on memory addressing see Section 2.1.6, "Effective Address Calculation," in the PowerPC Microprocessor Family: The Programming Environments manual.

Examples

Operating System - Process Table Management

# Traverse process control blocks with variable sizes
# Registers prepared for the loop that follows:
#   r3 = address of process_table (cursor advanced by lwzux)
#   r4 = address of pcb_sizes
#   r5 = value stored at num_processes (loop counter)
lis r3, process_table@ha
addi r3, r3, process_table@l
lis r4, pcb_sizes@ha
addi r4, r4, pcb_sizes@l
# A D-form load with rA=0 uses a literal zero base, so "num_processes(r0)"
# can only reach a symbol that fits in the signed 16-bit displacement.
# Form the full 32-bit address with @ha/@l instead.
lis r5, num_processes@ha
lwz r5, num_processes@l(r5) # Number of active processes

# Process Control Block fields: [pid, state, priority, memory_base, ...]
#
# lwzux rD, rA, rB computes EA = rA + rB, loads the word at EA into rD and
# writes EA back to rA.  The pointer therefore moves BEFORE each load, and
# afterwards r3 holds the address of the word that was just read.
#
# NOTE(review): bl overwrites LR, and under the usual PowerPC ABIs r3-r12 are
# volatile across calls while r13 is reserved.  This example presumably
# assumes context_switch preserves every register used here - confirm.
# NOTE(review): r3 is 12 or 16 bytes past the pid word when the next iteration
# adds the next pcb_sizes entry, so the table entries must be relative to
# that position - verify against whatever produces pcb_sizes.
process_scheduler_loop:
    # Load PCB size for this process (variable due to different process types)
    lwz r6, 0(r4)           # r6 = stride taken from the pcb_sizes table

    # Walk the PCB fields; every lwzux advances r3 first, then loads
    lwzux r7, r3, r6        # r3 += r6; r7 = process ID
    li r8, 4                # Word stride (the original "lwz r8, 4" is not a load-immediate)
    lwzux r9, r3, r8        # r3 += 4; r9 = process state
    lwzux r10, r3, r8       # r3 += 4; r10 = priority
    lwzux r11, r3, r8       # r3 += 4; r11 = memory base address

    # Scheduling decision based on state (priority in r10 is loaded but not
    # examined in this snippet)
    cmpwi r9, PROCESS_READY # Check if process is ready
    bne skip_scheduling     # Skip if not ready

    # Update process time slice and scheduling counters
    lwzux r12, r3, r8       # r3 += 4; r12 = time slice remaining
    subi r13, r12, 1        # Decrement time slice
    cmpwi r13, 0            # Check if time slice expired
    bgt update_timeslice    # Continue if time remaining

    # Time slice expired - perform context switch
    bl context_switch       # Switch to next process
    li r13, DEFAULT_TIMESLICE # Reset time slice

update_timeslice:
    stw r13, 0(r3)          # r3 still addresses the time slice word just read

skip_scheduling:
    addi r4, r4, 4          # Next PCB size
    subi r5, r5, 1          # Decrement process counter
    cmpwi r5, 0
    bne process_scheduler_loop # Continue scheduling

Database Engine - Index B-Tree Traversal

# Traverse B-tree index with variable node sizes
# After this setup:
#   r3 = address of btree_root
#   r5 = word stored at search_key (the key to look for)
#   r6 = address of node_sizes
#   r4 = high-adjusted half of search_key's address (scratch)
lis r4, search_key@ha
lis r3, btree_root@ha
lis r6, node_sizes@ha
lwz r5, search_key@l(r4)   # Key to search for
addi r3, r3, btree_root@l
addi r6, r6, node_sizes@l

# B-tree node structure: [num_keys, key1, ptr1, key2, ptr2, ..., ptr_n+1]
# ptr_i sits directly after key_i: it is followed when search_key < key_i and
# read as the data pointer when search_key == key_i.  The trailing ptr_n+1 is
# followed when search_key is greater than every key.  A zero pointer means
# there is no child to descend into.
#
# lwzux advances r3 by the index register first and then loads from the new
# address, so after each load r3 addresses the word just read.
#
# NOTE(review): r6 is never advanced in this loop, so every node uses the same
# node_sizes entry, and that entry must be the byte offset from the node
# pointer to num_keys - confirm against the node layout.
# NOTE(review): r13 is reserved and r14/r15 are callee-saved under the usual
# PowerPC ABIs, and bl overwrites LR - confirm the surrounding conventions.
btree_search_loop:
    # Load node size for variable-size nodes (different for leaf/internal)
    lwz r7, 0(r6)           # r7 = offset taken from the node_sizes table

    # Load number of keys in current node
    lwzux r8, r3, r7        # r3 += r7; r8 = num_keys

    # Search through keys in current node
    li r9, 0                # Key index
    li r10, 4               # Word stride used as the lwzux index register

key_search_loop:
    cmpw r9, r8             # Compare key index with num_keys
    bge key_not_found       # Branch if searched all keys

    lwzux r11, r3, r10      # r3 += 4; r11 = current key
    cmpw r5, r11            # Compare search key with current key (signed)
    beq key_found           # Branch if exact match
    blt follow_left_ptr     # Follow left pointer if search key < current key

    # search_key > current_key, continue to next key
    lwzux r12, r3, r10      # r3 += 4; step over this key's pointer (value unused)
    addi r9, r9, 1          # Increment key index
    b key_search_loop       # Continue searching

follow_left_ptr:
    # Follow pointer to child node
    lwzux r13, r3, r10      # r3 += 4; r13 = child pointer left of this key
    cmpwi r13, 0            # Null child: nothing below this key
    beq search_failed       # Do not descend through a null pointer
    mr r3, r13              # Update current node pointer
    b btree_search_loop     # Continue search in child node

key_found:
    # Key found - load associated data pointer
    lwzux r14, r3, r10      # r3 += 4; r14 = pointer stored after the matching key
    # Process found record
    bl process_record       # Process record at r14
    b search_complete

key_not_found:
    # Key not found in current node
    # Follow rightmost pointer for internal nodes
    lwzux r15, r3, r10      # r3 += 4; r15 = rightmost pointer
    cmpwi r15, 0            # Check if null pointer (leaf node)
    beq search_failed       # Key not found
    mr r3, r15              # Follow rightmost pointer
    b btree_search_loop     # Continue search

search_failed:
    # Key not found in B-tree
    li r3, -1               # Return error code

search_complete:

Graphics Engine - Polygon Mesh Processing

# Process polygon mesh with variable vertex counts per face
lis r3, mesh_data@ha
addi r3, r3, mesh_data@l
lis r4, face_sizes@ha
addi r4, r4, face_sizes@l
# A D-form load with rA=0 uses a literal zero base, so "num_faces(r0)" can
# only reach a symbol within the signed 16-bit displacement.  Use @ha/@l.
lis r5, num_faces@ha
lwz r5, num_faces@l(r5)    # Number of faces in mesh

# Face structure: [vertex_count, v1_index, v2_index, ..., vn_index, material_id]
# lwzux advances r3 by the index register first, then loads from the new
# address; afterwards r3 addresses the word just read.
mesh_processing_loop:
    # Load face size (varies based on polygon type: triangle=3, quad=4, n-gon=n)
    lwz r6, 0(r4)          # r6 = stride taken from the face_sizes table

    # Load vertex count for this face
    lwzux r7, r3, r6       # r3 += r6; r7 = vertex_count

    # Initialize face processing.
    # Two vertex indices are consumed below before the per-vertex loop, and a
    # fan over n vertices has n-2 triangles, so the counter starts at 2.
    # Starting at 0 would make the loop read two words past the index list.
    li r8, 2               # Vertices already consumed when the loop starts
    lis r10, zero_double@ha # r10 is free here: it is reloaded just below
    lfd f10, zero_double@l(r10) # Face area accumulator (zero_double presumably holds 0.0)

    # Load first vertex index for area calculation
    li r9, 4               # Word stride used as the lwzux index register
    lwzux r10, r3, r9      # r3 += 4; r10 = first vertex index
    lwzux r11, r3, r9      # r3 += 4; r11 = second vertex index
    # Calculate face area using cross product method.
    # Each pass forms the triangle of vertex indices (r10, r11, r12) and adds
    # its area to f10; r11 then takes the value of r12 for the next pass.
vertex_loop:
    cmpw r8, r7            # Check if processed all vertices
    bge face_area_complete # Complete if done with vertices

    lwzux r12, r3, r9      # r3 += r9; r12 = next vertex index

    # Load vertex coordinates for area calculation
    lis r13, vertex_array@ha
    addi r13, r13, vertex_array@l

    # Load vertex positions.  The shift by 4 gives a 16-byte slot per vertex;
    # x, y and z are read from offsets 0, 4 and 8 of that slot.
    slwi r14, r10, 4       # v1 byte offset (index * 16)
    lfsx f1, r13, r14      # Load v1.x
    addi r15, r14, 4
    lfsx f2, r13, r15      # Load v1.y
    addi r16, r15, 4
    lfsx f3, r13, r16      # Load v1.z

    slwi r17, r11, 4       # v2 byte offset
    lfsx f4, r13, r17      # Load v2.x
    addi r18, r17, 4
    lfsx f5, r13, r18      # Load v2.y
    addi r19, r18, 4
    lfsx f6, r13, r19      # Load v2.z

    slwi r20, r12, 4       # v3 byte offset
    lfsx f7, r13, r20      # Load v3.x
    addi r21, r20, 4
    lfsx f8, r13, r21      # Load v3.y
    addi r22, r21, 4
    lfsx f9, r13, r22      # Load v3.z

    # Calculate triangle area using cross product: 0.5 * ||(v2-v1) × (v3-v1)||
    fsub f11, f4, f1       # a.x = v2.x - v1.x
    fsub f12, f5, f2       # a.y = v2.y - v1.y
    fsub f13, f6, f3       # a.z = v2.z - v1.z

    fsub f14, f7, f1       # b.x = v3.x - v1.x
    fsub f15, f8, f2       # b.y = v3.y - v1.y
    fsub f16, f9, f3       # b.z = v3.z - v1.z

    # Cross product a × b.
    # The first product of each component is a plain fmul: the original fed
    # f0 into fmsub as if it were zero, but f0 is never initialised here.
    # fnmsub frD,frA,frC,frB = frB - frA*frC, which gives the stated sign
    # (fmsub would produce the negated component).
    fmul f17, f12, f16       # a.y*b.z
    fnmsub f17, f13, f15, f17 # a.y*b.z - a.z*b.y = cross.x

    fmul f18, f13, f14       # a.z*b.x
    fnmsub f18, f11, f16, f18 # a.z*b.x - a.x*b.z = cross.y

    fmul f19, f11, f15       # a.x*b.y
    fnmsub f19, f12, f14, f19 # a.x*b.y - a.y*b.x = cross.z

    # Calculate magnitude: ||cross|| = √(x² + y² + z²)
    fmul f20, f17, f17       # cross.x² (no dependence on f0)
    fmadd f20, f18, f18, f20 # + cross.y²
    fmadd f20, f19, f19, f20 # + cross.z²
    # NOTE(review): fsqrt is an optional instruction in the PowerPC
    # architecture - confirm the target core implements it.
    fsqrt f21, f20         # ||cross||

    # Triangle area = 0.5 * ||cross||
    # "half_constant(r0)" would use a literal zero base and only reach a
    # symbol within the 16-bit displacement; r14 is dead here and serves as
    # the address scratch register.
    lis r14, half_constant@ha
    lfd f22, half_constant@l(r14) # 0.5
    fmul f23, f21, f22     # Triangle area
    fadd f10, f10, f23     # Add to face area

    # Move to next triangle in fan
    mr r11, r12            # v2 = v3 for next iteration
    addi r8, r8, 1         # Increment vertex counter
    b vertex_loop          # Continue with next vertex

face_area_complete:
    # Store calculated face area into face_areas[r5 - 1]; r5 counts down, so
    # the areas are written from the highest index to the lowest.
    lis r23, face_areas@ha
    addi r23, r23, face_areas@l
    # "sub" takes three registers, so "sub r24, r5, 1" would subtract the
    # contents of r1 rather than the constant 1.  subi is the immediate form.
    subi r24, r5, 1        # Calculate face index (reverse counter)
    slwi r25, r24, 3       # Convert to double offset (8 bytes each)
    stfdx f10, r23, r25    # Store face area

    # Load material ID for this face
    lwzux r26, r3, r9      # r3 += r9; r26 = material_id

    # Process material properties
    # NOTE(review): r3, r4, r5 and r9 are used after this call and bl
    # overwrites LR; process_material presumably preserves them - confirm.
    bl process_material    # Apply material properties

    addi r4, r4, 4         # Next face size
    subi r5, r5, 1         # Decrement face counter
    cmpwi r5, 0
    bne mesh_processing_loop # Continue mesh processing

Compiler Optimization - Loop Unrolling Analysis

# Analyze loop structures for optimization opportunities
lis r3, basic_blocks@ha
addi r3, r3, basic_blocks@l
lis r4, block_sizes@ha
addi r4, r4, block_sizes@l
# A D-form load with rA=0 uses a literal zero base, so "num_blocks(r0)" can
# only reach a symbol within the signed 16-bit displacement.  Use @ha/@l.
lis r5, num_blocks@ha
lwz r5, num_blocks@l(r5)   # Number of basic blocks

# Basic block structure: [block_id, instruction_count, instr1, instr2, ..., exit_targets]
# lwzux advances r3 by the index register first, then loads from the new
# address; afterwards r3 addresses the word just read.
loop_analysis_loop:
    # Load basic block size (varies based on number of instructions)
    lwz r6, 0(r4)          # r6 = stride taken from the block_sizes table

    # Load basic block header
    lwzux r7, r3, r6       # r3 += r6; r7 = block_id
    li r8, 4               # Word stride used as the lwzux index register
    lwzux r9, r3, r8       # r3 += 4; r9 = instruction_count

    # Reset the per-block counters
    li r10, 0              # Instruction index
    li r11, 0              # Loop instruction counter
    li r12, 0              # Memory operation counter
    li r13, 0              # Branch counter
    
instruction_analysis_loop:
    cmpw r10, r9           # Check if analyzed all instructions
    bge block_analysis_complete # Complete if done

    lwzux r14, r3, r8      # r3 += r8; r14 = next instruction word

    # Classify instruction type
    srwi r15, r14, 26      # Extract primary opcode (bits 0-5)

    # Check for loop-relevant instructions
    cmpwi r15, 0x20        # lwz
    beq memory_operation
    cmpwi r15, 0x24        # stw
    beq memory_operation
    cmpwi r15, 0x10        # bc (conditional branch)
    beq branch_operation
    cmpwi r15, 0x12        # b (unconditional branch)
    beq branch_operation

    # Check for arithmetic operations (good for unrolling)
    cmpwi r15, 0x1F        # Extended opcodes
    bne continue_analysis

    # Extract extended opcode for detailed analysis.
    # The extended opcode sits above the Rc bit (bit 31), so masking the low
    # ten bits (andi. with 0x3FF) yields XO*2+Rc and never matches the values
    # below.  For the arithmetic (XO-form) instructions tested here the opcode
    # is bits 22-30 and bit 21 is OE; taking the 9-bit field makes the
    # overflow-recording variants (addo, subfo, mullwo) match as well.
    extrwi r16, r14, 9, 22 # r16 = bits 22-30, right-justified
    cmpwi r16, 266         # add
    beq arithmetic_operation
    cmpwi r16, 40          # subf
    beq arithmetic_operation
    cmpwi r16, 235         # mullw
    beq arithmetic_operation

    b continue_analysis

# Per-class handlers for the instruction word in r14.  Each bumps its counter
# and rejoins at continue_analysis.
# NOTE(review): the bl calls overwrite LR and the loop keeps live values in
# r3 and r8-r14 across them; the helpers presumably preserve these - confirm.
memory_operation:
    addi r12, r12, 1       # Increment memory operation counter

    # Analyze addressing mode for loop optimization potential.
    # rA occupies bits 11-15 of the instruction word; masking with 0x1F
    # (andi.) would pick up the low five displacement bits instead.
    extrwi r17, r14, 5, 11 # r17 = rA field, right-justified
    cmpwi r17, 0           # rA=0: base is a literal zero (simple addressing)
    beq simple_addressing

    # Complex addressing - check for induction variables
    bl analyze_induction_variable # Analyze for loop variables

simple_addressing:
    b continue_analysis

branch_operation:
    addi r13, r13, 1       # Increment branch counter

    # Check if this is a loop back-edge
    bl analyze_loop_branch # Determine if loop-closing branch

    b continue_analysis

arithmetic_operation:
    addi r11, r11, 1       # Increment loop instruction counter

    # Analyze for loop-carried dependencies
    bl analyze_dependencies # Check data dependencies

continue_analysis:
    addi r10, r10, 1       # Next instruction
    b instruction_analysis_loop # Continue analysis

block_analysis_complete:
    # Calculate optimization metrics
    # Unroll factor = min(max_unroll, instructions/memory_ops), where
    # "instructions" is the r11 counter
    cmpwi r12, 0           # Check for divide by zero
    beq no_memory_ops
    divw r18, r11, r12     # instructions/memory_ops ratio
    b calculate_unroll_factor

no_memory_ops:
    li r18, MAX_UNROLL     # Default unroll factor

calculate_unroll_factor:
    # "max_unroll_factor(r0)" would use a literal zero base and only reach a
    # symbol within the signed 16-bit displacement.  Use @ha/@l.
    lis r19, max_unroll_factor@ha
    lwz r19, max_unroll_factor@l(r19)
    cmpw r18, r19          # Compare with maximum allowed
    ble store_unroll_factor
    mr r18, r19            # Clamp to maximum

store_unroll_factor:
    # Store optimization recommendation as four consecutive words at
    # optimization_data + block_id * 16:
    #   +0 unroll factor, +4 loop instruction count,
    #   +8 memory operation count, +12 branch count
    lis r20, optimization_data@ha
    addi r20, r20, optimization_data@l
    slwi r21, r7, 4        # block_id * 16 (4 words per block)
    stwx r18, r20, r21     # Store unroll factor

    addi r22, r21, 4
    stwx r11, r20, r22     # Store loop instruction count
    addi r23, r22, 4
    stwx r12, r20, r23     # Store memory operation count
    addi r24, r23, 4
    stwx r13, r20, r24     # Store branch count

    addi r4, r4, 4         # Next block size
    subi r5, r5, 1         # Decrement block counter
    cmpwi r5, 0
    bne loop_analysis_loop # Continue analysis

Network Stack - Protocol Header Processing

# Process network packets with variable header lengths
lis r3, packet_buffer@ha
addi r3, r3, packet_buffer@l
lis r4, header_lengths@ha
addi r4, r4, header_lengths@l
# A D-form load with rA=0 uses a literal zero base, so "num_packets(r0)" can
# only reach a symbol within the signed 16-bit displacement.  Use @ha/@l.
lis r5, num_packets@ha
lwz r5, num_packets@l(r5)  # Number of packets to process

# Packet structure: [header_type, header_data..., payload_length, payload...]
# lwzux advances r3 by the index register first, then loads from the new
# address; afterwards r3 addresses the word just read.
packet_processing_loop:
    # Load header length for current packet type
    lwz r6, 0(r4)          # r6 = stride taken from the header_lengths table

    # Load packet header with variable length advancement
    lwzux r7, r3, r6       # r3 += r6; r7 = header_type

    # Dispatch on protocol type; anything unrecognised goes to unknown_protocol
    cmpwi r7, ETHERNET_TYPE
    beq process_ethernet
    cmpwi r7, IP_TYPE
    beq process_ip
    cmpwi r7, TCP_TYPE
    beq process_tcp
    cmpwi r7, UDP_TYPE
    beq process_udp
    b unknown_protocol

process_ethernet:
    # Process Ethernet header (14 bytes)
    # Each lwzux below first adds r8 to r3 and then loads the word at the new r3.
    # NOTE(review): four word loads cover 16 bytes, so the last load also picks
    # up the 2 bytes that follow a 14-byte header - presumably the buffer pads
    # the header to a word boundary; confirm.
    li r8, 4               # Word stride used as the lwzux index register
    lwzux r9, r3, r8       # r3 += 4; r9 = destination MAC (first 4 bytes)
    lwzux r10, r3, r8      # r3 += 4; r10 = destination MAC (last 2 bytes) + source MAC (first 2 bytes)
    lwzux r11, r3, r8      # r3 += 4; r11 = source MAC (last 4 bytes)
    lwzux r12, r3, r8      # r3 += 4; r12 = word containing the EtherType
    
    # Validate Ethernet frame
    bl validate_ethernet_frame
    b continue_processing

process_ip:
    # Process IP header (20+ bytes, variable with options).
    # Each lwzux first adds r8 to r3 and then loads the word at the new r3.
    li r8, 4
    lwzux r13, r3, r8      # r3 += 4; r13 = version/IHL/ToS/Total Length

    # Extract header length.
    # In this word version is the top nibble and IHL the next one, so the
    # version/IHL byte is reached with a shift of 24.  A shift of 8 would
    # mask four bits out of the Total Length field instead.
    srwi r14, r13, 24      # r14 = version/IHL byte
    andi. r15, r14, 0x0F   # Extract IHL (Internet Header Length, in words)
    slwi r16, r15, 2       # Convert to bytes (IHL * 4)

    # Continue loading the fixed 20-byte part of the IP header
    lwzux r17, r3, r8      # r3 += 4; r17 = ID/Flags/Fragment Offset
    lwzux r18, r3, r8      # r3 += 4; r18 = TTL/Protocol/Checksum
    lwzux r19, r3, r8      # r3 += 4; r19 = source IP
    lwzux r20, r3, r8      # r3 += 4; r20 = destination IP

    # Skip IP options if present
    subi r21, r16, 20      # Calculate options length
    cmpwi r21, 0           # Check if options present
    ble no_ip_options
    add r3, r3, r21        # Skip options (r3 then addresses the last option word)

no_ip_options:
    bl process_ip_packet
    b continue_processing

process_tcp:
    # Process TCP header (20+ bytes, variable with options).
    # The fixed part is five words: ports, sequence number, acknowledgment
    # number, data offset/flags/window, checksum/urgent pointer.  Sequence and
    # acknowledgment numbers are one 32-bit word each; loading two words for
    # each would read the data offset from the wrong word and consume 28 bytes
    # instead of 20.  They stay in r23 and r25; r24 and r26 are not written.
    # Each lwzux first adds r8 to r3 and then loads the word at the new r3.
    li r8, 4
    lwzux r22, r3, r8      # r3 += 4; r22 = source/dest ports
    lwzux r23, r3, r8      # r3 += 4; r23 = sequence number
    lwzux r25, r3, r8      # r3 += 4; r25 = acknowledgment number
    lwzux r27, r3, r8      # r3 += 4; r27 = data offset/flags/window

    # Extract TCP header length
    srwi r28, r27, 28      # Extract data offset (upper 4 bits)
    slwi r29, r28, 2       # Convert to bytes (offset * 4)

    # Continue with TCP header
    lwzux r30, r3, r8      # r3 += 4; r30 = checksum/urgent pointer

    # Skip TCP options if present
    subi r31, r29, 20      # Calculate options length
    cmpwi r31, 0           # Check if options present
    ble no_tcp_options
    add r3, r3, r31        # Skip options (r3 then addresses the last option word)

no_tcp_options:
    bl process_tcp_segment
    b continue_processing

process_udp:
    # Process UDP header (8 bytes fixed), read as two words.
    # Each lwzux first adds r8 to r3 and then loads the word at the new r3.
    li r8, 4
    lwzux r9, r3, r8       # r3 += 4; r9 = source/dest ports
    lwzux r10, r3, r8      # r3 += 4; r10 = length/checksum
    
    bl process_udp_datagram
    b continue_processing

unknown_protocol:
    # Handle unknown protocol, then fall through to continue_processing
    bl handle_unknown_protocol

continue_processing:
    # Load payload length
    li r8, 4
    lwzux r11, r3, r8      # r3 += 4; r11 = payload length
    
    # Process payload
    # NOTE(review): r3, r4, r5 and r11 are used after this call and bl
    # overwrites LR; process_packet_payload presumably preserves them - confirm.
    bl process_packet_payload # Process payload data
    
    # Skip to next packet
    add r3, r3, r11        # Skip payload data (r11 presumably counts bytes - confirm)
    
    addi r4, r4, 4         # Next header length
    subi r5, r5, 1         # Decrement packet counter
    cmpwi r5, 0
    bne packet_processing_loop # Continue packet processing

Related Instructions

lwz, lwzu, lwzx, stwux, lfdux, lbzux

Back to Index