LHAUX - Load Halfword Algebraic with Update Indexed | PowerPC Instruction Set Reference

Instruction Syntax

Mnemonic	Format	Flags
lhaux	rD,rA,rB	-

Instruction Encoding

Field	Bits	Description
Primary Opcode	0-5	011111 (0x1F)
rD	6-10	Destination register
rA	11-15	Source register A
rB	16-20	Source register B
XO	21-30	375 (Extended opcode)
Rc	31	Reserved (0)

Operation

EA ← (rA) + (rB)
rD ← EXTS(MEM(EA, 2))
rA ← EA

A halfword (16 bits) is loaded from memory, sign-extended to 32 bits, and placed in register rD. The effective address is computed by adding the contents of registers rA and rB. After the load, the effective address is stored back into register rA.

Note: This instruction cannot be used with rA=0. The update form requires a valid base register. This is the most advanced addressing mode for signed halfword loads, combining indexed addressing with automatic pointer advancement and sign extension. Essential for processing signed 16-bit data structures with dynamic stride patterns.

Affected Registers

rA - Updated with the effective address after the load operation.

For more information on memory addressing see Section 2.1.6, "Effective Address Calculation," in the PowerPC Microprocessor Family: The Programming Environments manual.

Examples

Advanced Audio Processing - Multi-Channel Convolution

# Process multi-channel audio with variable channel spacing
lis r3, multichannel_audio@ha
addi r3, r3, multichannel_audio@l
lis r4, channel_strides@ha
addi r4, r4, channel_strides@l
lwz r5, num_samples(r0)     # Number of samples per channel
lwz r6, num_channels(r0)    # Number of audio channels

# Audio convolution with impulse response for room acoustics
multichannel_convolution_loop:
    li r7, 0                # Channel index
    
channel_loop:
    # Load channel stride (varies for different sample rates/formats)
    lwz r8, 0(r4)           # Load stride for this channel
    
    # Load audio sample with dynamic channel advancement
    lhaux r9, r3, r8        # Load signed sample and advance by stride
    
    # Apply convolution with room impulse response
    lis r10, impulse_response@ha
    addi r10, r10, impulse_response@l
    lwz r11, impulse_length(r0) # Length of impulse response
    
    li r12, 0               # Convolution accumulator
    li r13, 0               # Impulse sample index
    mr r14, r3              # Current audio position for convolution
    
convolution_loop:
    cmpw r13, r11           # Check if processed entire impulse
    bge convolution_done
    
    # Load impulse coefficient
    slwi r15, r13, 1        # Convert index to byte offset
    lhax r16, r10, r15      # Load signed impulse coefficient
    
    # Load delayed audio sample
    mullw r17, r13, r8      # Calculate delay offset
    sub r18, r14, r17       # Calculate delayed sample address
    
    # Bounds check for audio buffer
    lis r19, multichannel_audio@ha
    addi r19, r19, multichannel_audio@l
    cmpw r18, r19           # Check if within buffer
    blt skip_convolution    # Skip if out of bounds
    
    lha r20, 0(r18)         # Load delayed audio sample
    
    # Convolution: accumulate impulse * delayed_sample
    mullw r21, r16, r20     # impulse_coeff * delayed_sample
    add r12, r12, r21       # Add to convolution sum

skip_convolution:
    addi r13, r13, 1        # Next impulse sample
    b convolution_loop      # Continue convolution

convolution_done:
    # Scale convolution result
    srawi r22, r12, 12      # Scale down (12-bit fractional)
    
    # Apply dynamic range compression
    abs r23, r22            # |convolved_sample|
    lwz r24, compression_threshold(r0)
    cmpw r23, r24           # Compare with threshold
    ble no_compression      # Skip if below threshold
    
    # Apply soft compression: sign(x) * (threshold + (|x| - threshold) * ratio)
    sub r25, r23, r24       # |x| - threshold
    lwz r26, compression_ratio(r0) # Compression ratio (0-256)
    mullw r27, r25, r26     # (|x| - threshold) * ratio
    srawi r28, r27, 8       # Scale ratio
    add r29, r24, r28       # threshold + compressed_amount
    
    # Restore sign
    cmpwi r22, 0            # Check original sign
    bge positive_sample
    neg r29, r29            # Apply negative sign

positive_sample:
    mr r22, r29             # Use compressed sample

no_compression:
    # Store processed sample back to buffer
    sth r22, 0(r3)          # Store convolved sample
    
    addi r4, r4, 4          # Next channel stride
    addi r7, r7, 1          # Next channel
    cmpw r7, r6             # Check if done with all channels
    blt channel_loop        # Continue channels
    
    subi r5, r5, 1          # Decrement sample counter
    cmpwi r5, 0
    bne multichannel_convolution_loop # Continue processing

Scientific Computing - Sparse Matrix Compressed Storage

# Process sparse matrix in Compressed Sparse Row (CSR) format
lis r3, csr_values@ha
addi r3, r3, csr_values@l
lis r4, row_offsets@ha
addi r4, r4, row_offsets@l
lis r5, column_indices@ha
addi r5, r5, column_indices@l
lwz r6, num_rows(r0)       # Number of matrix rows

# Sparse matrix-vector multiplication: y = A * x
sparse_matvec_loop:
    li r7, 0                # Current row index
    
row_processing_loop:
    # Load row start and end offsets
    slwi r8, r7, 2          # Row index to byte offset
    lwzx r9, r4, r8         # Load row start offset
    addi r10, r8, 4         # Next row offset address
    lwzx r11, r4, r10       # Load row end offset
    
    # Initialize row result
    li r12, 0               # Row accumulator
    
    # Process non-zero elements in this row
    mr r13, r9              # Current element index
    
element_loop:
    cmpw r13, r11           # Check if done with row
    bge row_complete
    
    # Load sparse matrix value with dynamic advancement
    slwi r14, r13, 1        # Convert element index to byte offset
    lhaux r15, r3, r14      # Load signed matrix value and advance
    
    # Load corresponding column index
    slwi r16, r13, 1        # Convert to byte offset for column
    lhax r17, r5, r16       # Load column index (signed)
    
    # Load vector element x[column]
    lis r18, input_vector@ha
    addi r18, r18, input_vector@l
    slwi r19, r17, 1        # Convert column to byte offset
    lhax r20, r18, r19      # Load x[column] (signed)
    
    # Multiply-accumulate: row_sum += matrix_value * x[column]
    mullw r21, r15, r20     # matrix_value * x[column]
    add r12, r12, r21       # Add to row accumulator
    
    addi r13, r13, 1        # Next element in row
    b element_loop          # Continue row processing

row_complete:
    # Store row result in output vector
    lis r22, output_vector@ha
    addi r22, r22, output_vector@l
    slwi r23, r7, 1         # Convert row to byte offset
    sthx r12, r22, r23      # Store y[row] = row_sum
    
    addi r7, r7, 1          # Next row
    cmpw r7, r6             # Check if done with all rows
    blt row_processing_loop # Continue matrix processing

Computer Graphics - Mesh Deformation

# Deform 3D mesh vertices using signed displacement vectors
lis r3, vertex_positions@ha
addi r3, r3, vertex_positions@l
lis r4, deformation_vectors@ha
addi r4, r4, deformation_vectors@l
lis r5, vertex_weights@ha
addi r5, r5, vertex_weights@l
lwz r6, num_vertices(r0)   # Number of vertices

# Each vertex: [x, y, z] (signed 16-bit coordinates)
# Each deformation: [dx, dy, dz] (signed 16-bit displacements)
mesh_deformation_loop:
    li r7, 0                # Vertex index
    
vertex_deformation_loop:
    # Calculate vertex data addresses
    slwi r8, r7, 3          # Vertex index * 8 (4 coordinates * 2 bytes)
    slwi r9, r7, 3          # Same for deformation vectors
    slwi r10, r7, 1         # Weight index * 2 bytes
    
    # Load vertex weight for this deformation
    lhax r11, r5, r10       # Load signed weight (0-32767)
    
    # Load and deform X coordinate
    lhaux r12, r3, r8       # Load vertex X and advance to next coordinate
    li r13, 2               # Standard advance for next coordinate
    lhaux r14, r4, r9       # Load deformation dX and advance
    
    # Apply weighted deformation: new_x = old_x + (dx * weight / 32768)
    mullw r15, r14, r11     # dx * weight
    srawi r16, r15, 15      # Scale by 32768 (15-bit shift)
    add r17, r12, r16       # new_x = old_x + scaled_dx
    
    # Clamp to valid coordinate range
    cmpwi r17, -32768       # Check lower bound
    bge check_x_upper
    li r17, -32768          # Clamp to minimum

check_x_upper:
    cmpwi r17, 32767        # Check upper bound
    ble store_new_x
    li r17, 32767           # Clamp to maximum

store_new_x:
    sth r17, -2(r3)         # Store new X coordinate
    
    # Load and deform Y coordinate
    lhaux r18, r3, r13      # Load vertex Y and advance
    lhaux r19, r4, r13      # Load deformation dY and advance
    
    # Apply weighted deformation for Y
    mullw r20, r19, r11     # dy * weight
    srawi r21, r20, 15      # Scale by 32768
    add r22, r18, r21       # new_y = old_y + scaled_dy
    
    # Clamp Y coordinate
    cmpwi r22, -32768       # Check lower bound
    bge check_y_upper
    li r22, -32768          # Clamp to minimum

check_y_upper:
    cmpwi r22, 32767        # Check upper bound
    ble store_new_y
    li r22, 32767           # Clamp to maximum

store_new_y:
    sth r22, -2(r3)         # Store new Y coordinate
    
    # Load and deform Z coordinate
    lhaux r23, r3, r13      # Load vertex Z and advance
    lhaux r24, r4, r13      # Load deformation dZ and advance
    
    # Apply weighted deformation for Z
    mullw r25, r24, r11     # dz * weight
    srawi r26, r25, 15      # Scale by 32768
    add r27, r23, r26       # new_z = old_z + scaled_dz
    
    # Clamp Z coordinate
    cmpwi r27, -32768       # Check lower bound
    bge check_z_upper
    li r27, -32768          # Clamp to minimum

check_z_upper:
    cmpwi r27, 32767        # Check upper bound
    ble store_new_z
    li r27, 32767           # Clamp to maximum

store_new_z:
    sth r27, -2(r3)         # Store new Z coordinate
    
    # Calculate surface normal update (simplified)
    # For deformed mesh, recalculate normals based on adjacent vertices
    bl recalculate_vertex_normal # Update normal for this vertex
    
    addi r7, r7, 1          # Next vertex
    cmpw r7, r6             # Check if done with all vertices
    blt vertex_deformation_loop # Continue vertex processing

Signal Processing - Adaptive Filter

# Implement adaptive LMS (Least Mean Squares) filter
lis r3, input_signal@ha
addi r3, r3, input_signal@l
lis r4, adaptive_coeffs@ha
addi r4, r4, adaptive_coeffs@l
lis r5, step_sizes@ha
addi r5, r5, step_sizes@l
lwz r6, signal_length(r0)  # Number of signal samples
lwz r7, filter_order(r0)   # Number of adaptive filter taps

adaptive_filter_loop:
    # Calculate filter output using current coefficients
    li r8, 0                # Filter output accumulator
    li r9, 0                # Tap index
    mr r10, r3              # Current signal position
    mr r11, r4              # Current coefficient position
    
filter_convolution:
    cmpw r9, r7             # Check if done with all taps
    bge calculate_error
    
    # Load signal sample and coefficient with adaptive step
    slwi r12, r9, 1         # Convert tap index to byte offset
    lhaux r13, r10, r12     # Load signal sample and advance
    lhaux r14, r11, r12     # Load coefficient and advance
    
    # Multiply and accumulate
    mullw r15, r13, r14     # sample * coefficient
    add r8, r8, r15         # Add to filter output
    
    addi r9, r9, 1          # Next tap
    b filter_convolution    # Continue convolution

calculate_error:
    # Load desired response and calculate error
    lis r16, desired_signal@ha
    addi r16, r16, desired_signal@l
    sub r17, r6, 1          # Calculate current sample index
    slwi r18, r17, 1        # Convert to byte offset
    lhax r19, r16, r18      # Load desired response
    
    # Calculate error: e = desired - actual
    srawi r20, r8, 8        # Scale filter output
    sub r21, r19, r20       # error = desired - actual
    
    # Update adaptive coefficients using LMS algorithm
    # coeff_new = coeff_old + step_size * error * input
    li r22, 0               # Tap index for coefficient update
    mr r23, r3              # Reset signal position
    mr r24, r4              # Reset coefficient position
    mr r25, r5              # Reset step size position
    
coefficient_update_loop:
    cmpw r22, r7            # Check if updated all coefficients
    bge adaptation_complete
    
    # Load current input sample, coefficient, and step size
    slwi r26, r22, 1        # Convert index to byte offset
    lhaux r27, r23, r26     # Load input sample and advance
    lhaux r28, r24, r26     # Load current coefficient and advance
    lhaux r29, r25, r26     # Load step size and advance
    
    # Calculate coefficient update: delta = step_size * error * input
    mullw r30, r29, r21     # step_size * error
    mullw r31, r30, r27     # * input
    srawi r0, r31, 12       # Scale down (12-bit fractional)
    
    # Update coefficient: new_coeff = old_coeff + delta
    add r1, r28, r0         # new_coefficient
    
    # Clip coefficient to prevent overflow
    cmpwi r1, 32767         # Check upper bound
    ble check_coeff_lower
    li r1, 32767            # Clip to maximum

check_coeff_lower:
    cmpwi r1, -32768        # Check lower bound
    bge store_new_coeff
    li r1, -32768           # Clip to minimum

store_new_coeff:
    sth r1, -2(r24)         # Store updated coefficient
    
    addi r22, r22, 1        # Next coefficient
    b coefficient_update_loop # Continue coefficient updates

adaptation_complete:
    # Calculate convergence metric: |error|
    abs r2, r21             # |error|
    lwz r3, convergence_threshold(r0)
    cmpw r2, r3             # Check if converged
    blt filter_converged    # Branch if converged
    
    # Continue adaptation
    b next_sample

filter_converged:
    # Filter has converged - could switch to faster non-adaptive mode
    bl switch_to_fixed_mode

next_sample:
    # Advance to next signal sample
    addi r3, r3, 2          # Next input sample
    subi r6, r6, 1          # Decrement sample counter
    cmpwi r6, 0
    bne adaptive_filter_loop # Continue adaptive filtering

Bioinformatics - DNA Sequence Analysis

# Analyze DNA sequences with variable scoring matrices
lis r3, dna_sequence1@ha
addi r3, r3, dna_sequence1@l
lis r4, dna_sequence2@ha
addi r4, r4, dna_sequence2@l
lis r5, scoring_matrix@ha
addi r5, r5, scoring_matrix@l
lwz r6, sequence1_length(r0) # Length of first sequence
lwz r7, sequence2_length(r0) # Length of second sequence

# DNA bases encoded as: A=0, T=1, G=2, C=3 (signed values for gap penalties)
# Scoring matrix: 4x4 matrix of alignment scores

sequence_alignment_loop:
    li r8, 0                # Position in sequence 1
    
seq1_loop:
    li r9, 0                # Position in sequence 2
    
seq2_loop:
    # Load nucleotides from both sequences
    slwi r10, r8, 1         # Convert position to byte offset
    lhaux r11, r3, r10      # Load nucleotide from sequence 1 and advance
    
    slwi r12, r9, 1         # Convert position to byte offset
    lhaux r13, r4, r12      # Load nucleotide from sequence 2 and advance
    
    # Handle gap penalties (negative values indicate gaps)
    cmpwi r11, 0            # Check for gap in sequence 1
    blt gap_penalty_seq1
    cmpwi r13, 0            # Check for gap in sequence 2
    blt gap_penalty_seq2
    
    # Normal nucleotide comparison
    # Calculate scoring matrix index: score = matrix[base1][base2]
    slwi r14, r11, 3        # base1 * 8 (4 bases * 2 bytes per entry)
    slwi r15, r13, 1        # base2 * 2 bytes
    add r16, r14, r15       # Matrix offset
    lhaux r17, r5, r16      # Load alignment score and advance
    b score_loaded

gap_penalty_seq1:
    # Gap in sequence 1
    abs r18, r11            # |gap_length|
    lwz r19, gap_penalty(r0) # Gap penalty per position
    mullw r17, r18, r19     # gap_length * penalty
    neg r17, r17            # Negative penalty
    b score_loaded

gap_penalty_seq2:
    # Gap in sequence 2
    abs r18, r13            # |gap_length|
    lwz r19, gap_penalty(r0) # Gap penalty per position
    mullw r17, r18, r19     # gap_length * penalty
    neg r17, r17            # Negative penalty

score_loaded:
    # Store alignment score
    lis r20, alignment_scores@ha
    addi r20, r20, alignment_scores@l
    mullw r21, r8, r7       # seq1_pos * seq2_length
    add r22, r21, r9        # + seq2_pos
    slwi r23, r22, 1        # Convert to byte offset
    sthx r17, r20, r23      # Store alignment score
    
    # Update optimal alignment path (dynamic programming)
    bl update_alignment_path # Update traceback matrix
    
    addi r9, r9, 1          # Next position in sequence 2
    cmpw r9, r7             # Check if done with sequence 2
    blt seq2_loop           # Continue sequence 2
    
    addi r8, r8, 1          # Next position in sequence 1
    cmpw r8, r6             # Check if done with sequence 1
    blt seq1_loop           # Continue sequence 1

# Find optimal alignment score
bl find_optimal_alignment   # Traceback to find best alignment

Related Instructions

lha, lhau, lhax, sthux, lhzux, lwzux

Back to Index