LFSU - Load Floating-Point Single with Update | PowerPC Instruction Set Reference

Instruction Syntax

Mnemonic	Format	Flags
lfsu	frD,d(rA)	-

Instruction Encoding

Field	Bits	Description
Primary Opcode	0-5	110001 (0x31)
frD	6-10	Destination floating-point register
rA	11-15	Source register A
d	16-31	16-bit signed displacement

Operation

EA ← (rA) + EXTS(d)
frD ← DOUBLE(MEM(EA, 4))
rA ← EA

A single-precision floating-point value (32 bits) is loaded from memory, converted to double-precision format, and placed in floating-point register frD. The effective address is computed by adding the sign-extended displacement to the contents of register rA. After the load, the effective address is stored back into register rA.

Note: rA=0 is an invalid instruction form for lfsu. The update form needs a real base register because the effective address is written back into rA (unlike lfs, where rA=0 simply means a base of zero). The loaded single-precision value is automatically converted to double-precision before being stored in the FPR. The effective address should be word-aligned (divisible by 4) for optimal performance.

Affected Registers

frD - Loaded with the single-precision value from memory, converted to double-precision format.
rA - Updated with the effective address after the load operation.

For more information on floating-point operations see Section 2.1.4, "Floating-Point Status and Control Register (FPSCR)," in the PowerPC Microprocessor Family: The Programming Environments manual.

Examples

Audio Sample Processing

# Process 32-bit float audio samples in place, one sample per iteration:
# compression -> high-pass -> reverb -> output gain.
# lfsu loads the next sample and advances the buffer pointer in one instruction.
# NOTE(review): operands of the form symbol(r0) use rA=0 ("no base register"),
# so each symbol must resolve to an address that fits the signed 16-bit
# displacement - confirm against the linker layout.
lis r3, audio_buffer@ha
addi r3, r3, audio_buffer@l
mr r12, r3                  # Remember buffer start for the reverb bounds check
lwz r4, num_samples(r0)     # Number of audio samples
cmpwi r4, 0
ble audio_done              # Empty buffer: nothing to process
subi r3, r3, 4              # Pre-adjust so the first lfsu lands on sample 0

# Load reverb impulse response
lis r5, reverb_impulse@ha
addi r5, r5, reverb_impulse@l
lwz r6, impulse_length(r0)  # Length of impulse response (in coefficients)

audio_process_loop:
    lfsu f1, 4(r3)          # f1 = sample at r3+4; r3 now points at that sample

    # Apply dynamic range compression
    lfs f2, compression_threshold(r0)
    fcmpu cr0, f1, f2       # Compare with threshold
    ble no_compression      # Skip unless input > threshold (also taken for NaN)

    # Apply compression: output = threshold + (input - threshold) * ratio
    lfs f3, compression_ratio(r0)  # Load compression ratio (0.0 - 1.0)
    fsub f4, f1, f2         # (input - threshold)
    fmul f5, f4, f3         # (input - threshold) * ratio
    fadd f1, f2, f5         # threshold + compressed_amount

no_compression:
    # Apply high-pass filter for clarity
    lfs f6, prev_sample(r0) # Load previous sample
    lfs f7, hp_coefficient(r0) # High-pass filter coefficient
    fsub f8, f1, f6         # Current - previous
    fmul f9, f8, f7         # Apply filter coefficient
    stfs f1, prev_sample(r0) # Store current as previous for next iteration

    # Apply reverb (direct convolution with the impulse response).
    # Simplified: samples behind r3 have already been overwritten with
    # processed output; a real implementation would keep a separate
    # history (circular) buffer.
    fmr f10, f9             # Start with filtered sample
    subi r7, r5, 4          # Impulse pointer, pre-adjusted for lfsu
    li r8, 0                # Impulse (tap) index k
    sub r9, r3, r12         # Byte offset of the current sample from buffer start

reverb_loop:
    cmpw r8, r6             # Processed the entire impulse response?
    bge reverb_done

    slwi r10, r8, 2         # Delay of tap k as a byte offset
    cmplw r10, r9           # Would sample[n - k] lie before the buffer start?
    bgt reverb_done         # Yes, and so would every later tap: stop here

    lfsu f11, 4(r7)         # Load impulse coefficient k and advance
    sub r11, r3, r10        # Address of sample[n - k]
    lfs f12, 0(r11)         # Load delayed sample
    fmadd f10, f11, f12, f10 # result += impulse[k] * sample[n - k]

    addi r8, r8, 1          # Next impulse coefficient
    b reverb_loop

reverb_done:
    # Apply final gain and store processed sample
    lfs f13, output_gain(r0) # Load output gain
    fmul f14, f10, f13      # Apply gain
    stfs f14, 0(r3)         # r3 still points at the sample just loaded

    subi r4, r4, 1          # Decrement sample counter
    cmpwi r4, 0
    bne audio_process_loop  # Continue processing

audio_done:

3D Graphics Vertex Processing

# Transform an array of (x, y, z, w) float vertices in place by a 4x4 matrix,
# then apply the perspective divide. Each vertex occupies 16 bytes; the matrix
# is read as 16 consecutive floats in row-major order (m[row][col] at
# (row*4 + col)*4 bytes from mvp_matrix).
lis r3, vertex_array@ha
addi r3, r3, vertex_array@l
lwz r4, num_vertices(r0)    # Number of vertices
cmpwi r4, 0
ble vertex_done             # Empty array: nothing to transform
subi r3, r3, 4              # Pre-adjust so the first lfsu lands on vertex 0's x

# Load transformation matrix (4x4 model-view-projection matrix)
lis r5, mvp_matrix@ha
addi r5, r5, mvp_matrix@l

vertex_transform_loop:
    # Load vertex position (x, y, z, w)
    lfsu f1, 4(r3)          # Load x and advance
    lfsu f2, 4(r3)          # Load y and advance
    lfsu f3, 4(r3)          # Load z and advance
    lfsu f4, 4(r3)          # Load w and advance; r3 now points AT w

    # Matrix transformation: result = matrix * vertex
    # Row 0: result.x = m[0]*x + m[1]*y + m[2]*z + m[3]*w
    lfs f5, 0(r5)           # m[0][0]
    lfs f6, 4(r5)           # m[0][1]
    lfs f7, 8(r5)           # m[0][2]
    lfs f8, 12(r5)          # m[0][3]

    fmul f9, f5, f1         # m[0][0] * x
    fmadd f9, f6, f2, f9    # + m[0][1] * y
    fmadd f9, f7, f3, f9    # + m[0][2] * z
    fmadd f9, f8, f4, f9    # + m[0][3] * w = result.x

    # Row 1: result.y
    lfs f5, 16(r5)          # m[1][0]
    lfs f6, 20(r5)          # m[1][1]
    lfs f7, 24(r5)          # m[1][2]
    lfs f8, 28(r5)          # m[1][3]

    fmul f10, f5, f1        # m[1][0] * x
    fmadd f10, f6, f2, f10  # + m[1][1] * y
    fmadd f10, f7, f3, f10  # + m[1][2] * z
    fmadd f10, f8, f4, f10  # + m[1][3] * w = result.y

    # Row 2: result.z
    lfs f5, 32(r5)          # m[2][0]
    lfs f6, 36(r5)          # m[2][1]
    lfs f7, 40(r5)          # m[2][2]
    lfs f8, 44(r5)          # m[2][3]

    fmul f11, f5, f1        # m[2][0] * x
    fmadd f11, f6, f2, f11  # + m[2][1] * y
    fmadd f11, f7, f3, f11  # + m[2][2] * z
    fmadd f11, f8, f4, f11  # + m[2][3] * w = result.z

    # Row 3: result.w
    lfs f5, 48(r5)          # m[3][0]
    lfs f6, 52(r5)          # m[3][1]
    lfs f7, 56(r5)          # m[3][2]
    lfs f8, 60(r5)          # m[3][3]

    fmul f12, f5, f1        # m[3][0] * x
    fmadd f12, f6, f2, f12  # + m[3][1] * y
    fmadd f12, f7, f3, f12  # + m[3][2] * z
    fmadd f12, f8, f4, f12  # + m[3][3] * w = result.w

    # Perspective divide (x/w, y/w, z/w)
    # NOTE(review): no guard against result.w == 0 - confirm callers clip first.
    fdiv f13, f9, f12       # x/w
    fdiv f14, f10, f12      # y/w
    fdiv f15, f11, f12      # z/w

    # Store transformed vertex (overwrite original).
    # r3 addresses w after the four update-form loads, so x, y and z sit at
    # -12, -8 and -4 and w at 0.
    stfs f13, -12(r3)       # Store transformed x
    stfs f14, -8(r3)        # Store transformed y
    stfs f15, -4(r3)        # Store transformed z
    stfs f12, 0(r3)         # Store w (for clipping tests)

    subi r4, r4, 1          # Decrement vertex counter
    cmpwi r4, 0
    bne vertex_transform_loop # Continue processing vertices

vertex_done:

Real-Time Signal Processing - IIR Filter

# Second-order IIR filter applied in place to a buffer of float samples:
#   y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2]
# filter_coeffs is read as five consecutive floats: b0, b1, b2, a1, a2
# (2nd order Butterworth low-pass coefficients).

# Coefficients stay resident in f10-f14 for the whole run
lis r5, filter_coeffs@ha
addi r5, r5, filter_coeffs@l
lfs f14, 16(r5)             # a2
lfs f13, 12(r5)             # a1
lfs f12, 8(r5)              # b2
lfs f11, 4(r5)              # b1
lfs f10, 0(r5)              # b0

# Filter history starts out at zero
lfs f20, zero_constant(r0)  # x[n-1] = 0
fmr f21, f20                # x[n-2] = 0
fmr f22, f20                # y[n-1] = 0
fmr f23, f20                # y[n-2] = 0

# Sample cursor: parked one word before the first sample so that the
# update-form load below lands on sample 0
lwz r4, signal_length(r0)   # Samples remaining
lis r3, signal_input@ha
addi r3, r3, signal_input@l
addi r3, r3, -4

iir_next_sample:
    lfsu f1, 4(r3)          # x[n]; r3 now addresses this sample

    # Accumulate the difference equation term by term (order matters for
    # floating-point rounding, so it follows the formula left to right)
    fmul f2, f10, f1        # b0 * x[n]
    fmadd f2, f11, f20, f2  # + b1 * x[n-1]
    fmadd f2, f12, f21, f2  # + b2 * x[n-2]
    fnmsub f2, f13, f22, f2 # - a1 * y[n-1]   (fnmsub: f2 - f13*f22)
    fnmsub f2, f14, f23, f2 # - a2 * y[n-2]

    # Overwrite the input sample with the filtered value
    stfs f2, 0(r3)

    # Shift the output history, then the input history
    fmr f23, f22            # y[n-2] = y[n-1]
    fmr f22, f2             # y[n-1] = y[n]
    fmr f21, f20            # x[n-2] = x[n-1]
    fmr f20, f1             # x[n-1] = x[n]

    addi r4, r4, -1         # One fewer sample to go
    cmpwi r4, 0
    bne iir_next_sample     # Loop until the counter reaches zero

Machine Learning - Neural Network Inference

# Forward pass through dense neural network layer
# For each output neuron o: out[o] = max(0, bias[o] + sum_i in[i] * W[o][i]),
# followed by a three-pass softmax over the output layer.
# The weight matrix is addressed as row-major floats: W[o][i] at
# (o * input_size + i) * 4 bytes from weight_matrix.
lis r3, input_layer@ha
addi r3, r3, input_layer@l
lwz r4, input_size(r0)      # Number of input neurons
lwz r5, output_size(r0)     # Number of output neurons
lis r6, weight_matrix@ha
addi r6, r6, weight_matrix@l # Weight matrix [output_size x input_size]
lis r7, output_layer@ha
addi r7, r7, output_layer@l
subi r3, r3, 4              # Pre-adjust input pointer
                            # (r3 is not read again below; the inner loop walks r9)

# Process each output neuron
li r8, 0                    # Output neuron index

output_neuron_loop:
    lfs f10, zero_constant(r0) # Initialize accumulator for this neuron
    
    # Reset input pointer for this output neuron
    lis r9, input_layer@ha
    addi r9, r9, input_layer@l
    subi r9, r9, 4          # Pre-adjust for lfsu
    
    mr r10, r4              # Reset input counter
    
    # Calculate weight matrix offset for current output neuron
    mullw r11, r8, r4       # output_index * input_size
    slwi r12, r11, 2        # Convert to byte offset (* 4)
    add r13, r6, r12        # Weight pointer for this output neuron
    subi r13, r13, 4        # Pre-adjust for lfsu

input_neuron_loop:
    # Two update-form loads walk the input vector and the weight row in step
    lfsu f1, 4(r9)          # Load input activation and advance
    lfsu f2, 4(r13)         # Load weight and advance
    
    # Multiply-accumulate: sum += input * weight
    fmadd f10, f1, f2, f10
    
    subi r10, r10, 1        # Decrement input counter
    cmpwi r10, 0
    bne input_neuron_loop   # Continue for all inputs
    
    # Load bias for this output neuron
    lis r14, bias_array@ha
    addi r14, r14, bias_array@l
    slwi r15, r8, 2         # Convert output index to byte offset
    lfsx f3, r14, r15       # Load bias
    
    # Add bias: activation = sum + bias
    fadd f11, f10, f3
    
    # Apply ReLU activation function: max(0, x)
    lfs f4, zero_constant(r0)
    fcmpu cr0, f11, f4      # Compare with 0
    blt use_zero            # Use 0 if negative
    fmr f12, f11            # Use computed value if positive
    b store_activation

use_zero:
    fmr f12, f4             # Use 0 for negative values

store_activation:
    # Store output activation
    slwi r16, r8, 2         # Convert output index to byte offset
    stfsx f12, r7, r16      # Store activation in output layer
    
    addi r8, r8, 1          # Next output neuron
    cmpw r8, r5             # Check if done with all outputs
    blt output_neuron_loop  # Continue for all output neurons

# Apply softmax for classification (optional)
# First pass: find maximum for numerical stability
lfs f20, neg_infinity(r0)  # Start with very negative value
li r8, 0                   # Reset output index

find_max_loop:
    slwi r16, r8, 2
    lfsx f13, r7, r16       # Load activation
    fcmpu cr0, f13, f20     # Compare with current max
    ble not_new_max
    fmr f20, f13            # Update maximum

not_new_max:
    addi r8, r8, 1
    cmpw r8, r5
    blt find_max_loop

# Second pass: compute exp(x - max) and sum
lfs f21, zero_constant(r0) # Sum of exponentials
lis r17, temp_exp@ha
addi r17, r17, temp_exp@l  # Temporary array for exponentials
li r8, 0                   # Reset index
subi r17, r17, 4           # Pre-adjust for stfsu

exp_sum_loop:
    slwi r16, r8, 2
    lfsx f14, r7, r16       # Load activation
    fsub f15, f14, f20      # x - max
    # NOTE(review): compute_exp is not defined in this listing. This call
    # presumes it reads f15, returns in f16, and preserves r5, r7, r8, r17,
    # f20 and f21 - confirm against its definition. bl also overwrites LR.
    bl compute_exp          # Compute exp(x - max) -> result in f16
    stfsu f16, 4(r17)       # Store exponential and advance
    fadd f21, f21, f16      # Add to sum
    
    addi r8, r8, 1
    cmpw r8, r5
    blt exp_sum_loop

# Third pass: divide by sum to get probabilities
lis r17, temp_exp@ha
addi r17, r17, temp_exp@l
subi r17, r17, 4           # Pre-adjust for lfsu
li r8, 0                   # Reset index

softmax_loop:
    lfsu f17, 4(r17)        # Load exponential and advance
    fdiv f18, f17, f21      # exp / sum = probability
    
    slwi r16, r8, 2
    stfsx f18, r7, r16      # Store probability
    
    addi r8, r8, 1
    cmpw r8, r5
    blt softmax_loop

Digital Image Processing - Convolution

# 3x3 convolution over a float image (e.g., edge detection, blur).
# Pixels are addressed as row-major 32-bit floats: (row * width + col) * 4
# bytes from the image base. The one-pixel border is skipped, so every
# kernel tap reads at row/col offsets of -1, 0 or +1 from an interior pixel.

# Base addresses: r3 = source image, r6 = 3x3 kernel, r7 = destination image
lis r3, image_data@ha
lis r6, filter_kernel@ha
lis r7, output_image@ha
addi r3, r3, image_data@l
addi r6, r6, filter_kernel@l
addi r7, r7, output_image@l

# Image dimensions
lwz r4, image_width(r0)     # Width in pixels
lwz r5, image_height(r0)    # Height in pixels

# Interior rows run from 1 up to (but not including) height-1
addi r9, r5, -1             # Row limit
li r8, 1                    # Current row

conv_next_row:
    addi r11, r4, -1        # Column limit = width-1
    li r10, 1               # Current column

conv_next_pixel:
    lfs f10, zero_constant(r0) # Running sum for this output pixel

    # Walk the kernel rows with an offset (-1..1) and a down-counter
    li r13, 3               # Kernel rows remaining
    li r12, -1              # Kernel row offset

conv_kernel_row:
    li r15, 3               # Kernel columns remaining
    li r14, -1              # Kernel column offset

conv_kernel_tap:
    # Kernel coefficient address: ((row_off + 1) * 3 + (col_off + 1)) * 4
    addi r22, r12, 1        # Kernel row index (0-2)
    addi r24, r14, 1        # Kernel column index (0-2)
    mulli r23, r22, 3       # row_index * 3
    add r25, r23, r24       # Flat kernel index
    slwi r26, r25, 2        # Scale to bytes
    add r27, r6, r26        # Address of the coefficient

    # Source pixel address: ((row + row_off) * width + (col + col_off)) * 4
    add r16, r8, r12        # Source row
    add r17, r10, r14       # Source column
    mullw r18, r16, r4      # source_row * width
    add r19, r18, r17       # Flat pixel index
    slwi r20, r19, 2        # Scale to bytes
    add r21, r3, r20        # Address of the source pixel

    # Fetch both operands and accumulate
    lfs f1, 0(r21)          # Pixel value
    lfs f2, 0(r27)          # Kernel coefficient
    fmadd f10, f1, f2, f10  # sum += pixel * coefficient

    addi r14, r14, 1        # Advance kernel column offset
    addi r15, r15, -1       # One fewer kernel column
    cmpwi r15, 0
    bne conv_kernel_tap     # More taps in this kernel row

    addi r12, r12, 1        # Advance kernel row offset
    addi r13, r13, -1       # One fewer kernel row
    cmpwi r13, 0
    bne conv_kernel_row     # More kernel rows

    # Write the result to the same (row, col) position in the output image
    mullw r28, r8, r4       # row * width
    add r29, r28, r10       # + col
    slwi r30, r29, 2        # Scale to bytes
    add r31, r7, r30        # Address of the output pixel
    stfs f10, 0(r31)        # Store convolved pixel

    addi r10, r10, 1        # Next column
    cmpw r10, r11
    blt conv_next_pixel     # Still inside the row

    addi r8, r8, 1          # Next row
    cmpw r8, r9
    blt conv_next_row       # Still inside the image

Financial Modeling - Monte Carlo Simulation

# Monte Carlo simulation for option pricing (European call).
# Each path reads one standard-normal draw with lfsu, computes the terminal
# price S_T = S_0 * exp(drift + sigma*sqrt(T)*Z), and accumulates
# max(S_T - K, 0). The mean payoff is then discounted by exp(-r*T).
# NOTE(review): compute_exp is not defined in this listing. Both calls below
# assume it takes its argument in f4, returns in f5, and preserves r3, r4, f1,
# f10, f20-f26 and f31 - confirm against its definition. bl overwrites LR.
lis r3, random_numbers@ha
addi r3, r3, random_numbers@l
lwz r4, num_simulations(r0) # Number of Monte Carlo paths
cmpwi r4, 0
ble monte_carlo_done        # No paths: avoid a runaway loop and a divide by zero
subi r3, r3, 4              # Pre-adjust pointer

# Load option parameters
lfs f20, spot_price(r0)     # Current stock price
lfs f21, strike_price(r0)   # Option strike price
lfs f22, risk_free_rate(r0) # Risk-free interest rate
lfs f23, volatility(r0)     # Stock volatility
lfs f24, time_to_expiry(r0) # Time to expiration
lfs f25, zero_constant(r0)  # Zero for max calculations
lfs f26, zero_constant(r0)  # Accumulator for option values

# Precalculate constants
# drift = (r - 0.5 * σ²) * T
lfs f27, half_constant(r0)  # 0.5
fmul f28, f23, f23          # σ²
fmul f29, f27, f28          # 0.5 * σ²
fsub f30, f22, f29          # r - 0.5 * σ²
fmul f31, f30, f24          # drift = (r - 0.5 * σ²) * T

# vol_sqrt_t = σ * √T
# fsqrt is an optional PowerPC instruction; not every implementation has it.
fsqrt f0, f24               # √T
fmul f1, f23, f0            # σ * √T

monte_carlo_loop:
    lfsu f2, 4(r3)          # Load random number (standard normal) and advance

    # Calculate stock price at expiration using Black-Scholes formula
    # S_T = S_0 * exp(drift + σ*√T*Z) where Z is standard normal random
    fmul f3, f1, f2         # σ * √T * Z
    fadd f4, f31, f3        # drift + σ * √T * Z
    bl compute_exp          # exp(drift + σ * √T * Z) -> result in f5
    fmul f6, f20, f5        # S_T = S_0 * exp(...)

    # Calculate option payoff (European call option)
    # payoff = max(S_T - K, 0)
    fsub f7, f6, f21        # S_T - K
    fcmpu cr0, f7, f25      # Compare with 0
    blt zero_payoff         # Payoff is 0 if S_T < K
    fmr f8, f7              # Payoff = S_T - K
    b add_payoff

zero_payoff:
    fmr f8, f25             # Payoff = 0

add_payoff:
    fadd f26, f26, f8       # Add to accumulator

    subi r4, r4, 1          # Decrement simulation counter
    cmpwi r4, 0
    bne monte_carlo_loop    # Continue simulation

# Calculate option price
# price = exp(-r*T) * (sum_of_payoffs / num_simulations)
#
# Convert the integer path count to floating point. Storing the word and
# reloading it with lfs would only reinterpret the integer's bits as a float,
# so build the double 2^52 + N in memory (high word 0x43300000, low word N)
# and subtract 2^52.
# Requires temp_simulations(r1) to be an 8-byte, doubleword-aligned slot and
# big-endian word order (high word at the lower address).
lwz r5, num_simulations(r0) # Reload total number of simulations (N > 0)
lis r6, 0x4330              # High word of the double 2^52
li r7, 0
stw r6, temp_simulations(r1)
stw r7, temp_simulations+4(r1)
lfd f9, temp_simulations(r1)   # f9 = 2^52
stw r5, temp_simulations+4(r1) # Low word = N
lfd f10, temp_simulations(r1)  # f10 = 2^52 + N (exact)
fsub f9, f10, f9            # f9 = (double)N
fdiv f10, f26, f9           # Average payoff
fmul f11, f22, f24          # r * T
fneg f4, f11                # -r * T, in the same argument register as above
bl compute_exp              # exp(-r * T) -> result in f5
fmul f14, f5, f10           # Discounted expected payoff = option price

stfs f14, option_price(r0)  # Store calculated option price

monte_carlo_done:

# Calculate additional Greeks (delta, gamma, etc.) if needed
# Delta approximation using finite differences would require additional simulations
# with slightly perturbed spot prices

Scientific Computing - Numerical Integration

# Numerical integration with the composite Simpson's rule over n equal
# intervals (fixed step; no adaptive refinement is performed here).
# function_values is read as n+1 consecutive floats f(x_0) .. f(x_n).
# Simpson's rule requires n to be even and greater than zero; this listing
# does not check that.
lis r3, function_values@ha
addi r3, r3, function_values@l
lwz r4, num_intervals(r0)   # Number of integration intervals (n)
subi r3, r3, 4              # Pre-adjust pointer

# Integration parameters
lfs f20, integration_start(r0) # Lower bound
lfs f21, integration_end(r0)   # Upper bound
lfs f22, zero_constant(r0)     # Integral accumulator

# Calculate step size: h = (b - a) / n
fsub f23, f21, f20          # b - a
# Convert n to floating point. Storing the word and reloading it with lfs
# would only reinterpret the integer's bits as a float, so build the double
# 2^52 + n in memory (high word 0x43300000, low word n) and subtract 2^52.
# Requires temp_intervals(r1) to be an 8-byte, doubleword-aligned slot and
# big-endian word order (high word at the lower address).
lis r5, 0x4330              # High word of the double 2^52
li r6, 0
stw r5, temp_intervals(r1)
stw r6, temp_intervals+4(r1)
lfd f25, temp_intervals(r1) # f25 = 2^52
stw r4, temp_intervals+4(r1) # Low word = n
lfd f24, temp_intervals(r1) # f24 = 2^52 + n (exact)
fsub f24, f24, f25          # f24 = (double)n
fdiv f25, f23, f24          # h = (b - a) / n

# Simpson's rule coefficients
lfs f26, one_constant(r0)   # 1
lfs f27, four_constant(r0)  # 4
lfs f28, two_constant(r0)   # 2
fadd f29, f26, f28          # 3 = 1 + 2, the divisor in h/3

# Simpson's rule: ∫f(x)dx ≈ (h/3)[f(x₀) + 4f(x₁) + 2f(x₂) + 4f(x₃) + ... + f(xₙ)]
li r6, 0                    # Point index i (0 .. n)

integration_loop:
    lfsu f1, 4(r3)          # Load function value f(xᵢ) and advance

    # Determine Simpson's coefficient based on position
    cmpwi r6, 0             # First point?
    beq first_point

    cmpw r6, r4             # Last point?
    beq last_point

    # Check if even or odd index (excluding first and last)
    andi. r7, r6, 1         # Check if odd
    bne odd_point           # Odd indices get coefficient 4

    # Even point (coefficient 2)
    fmul f2, f1, f28        # f(xᵢ) * 2
    b add_to_integral

first_point:
last_point:
    # First and last points get coefficient 1
    fmul f2, f1, f26        # f(xᵢ) * 1
    b add_to_integral

odd_point:
    # Odd points get coefficient 4
    fmul f2, f1, f27        # f(xᵢ) * 4

add_to_integral:
    fadd f22, f22, f2       # Add weighted function value to sum

    addi r6, r6, 1          # Next point
    cmpw r6, r4             # Check if done
    ble integration_loop    # Continue (≤ because we need n+1 points)

# Final result: integral = (h/3) * sum
fdiv f30, f25, f29          # h/3
fmul f31, f30, f22          # (h/3) * sum

stfs f31, integral_result(r0) # Store final integral value

# Error estimation using Richardson extrapolation (optional)
# This would involve computing the integral with half the step size
# and comparing results for adaptive refinement

Related Instructions

lfs, lfsx, lfsux, stfsu, lfdu, lbzu

Back to Index