DCBTST - Data Cache Block Touch for Store Instruction | PowerPC Instruction Set Reference

Instruction Syntax

Mnemonic	Format	Flags
dcbtst	rA,rB	-

Instruction Encoding

Field	Bits	Description
Primary Opcode	0-5	011111 (0x1F)
Reserved	6-10	00000
rA	11-15	Register A (base address)
rB	16-20	Register B (index)
XO	21-30	0011110110 (246)
Reserved	31	0

Operation

if rA = 0 then EA ← (rB)
else EA ← (rA) + (rB)

if block not in cache and EA is cacheable then load to cache for store

The data cache block touch for store instruction loads the cache block containing the effective address into the data cache if it is not already present and the address is cacheable. This instruction indicates that the cache block will be modified soon, allowing the cache to optimize for write operations.

Note: This instruction is a performance hint for write-intensive operations, potentially obtaining exclusive ownership of the cache line.

Affected Registers

None - This instruction does not affect any registers.

For more information on cache management see Section 3.2, "Cache Management Instructions," in the PowerPC Microprocessor Family: The Programming Environments manual.

Examples

Basic Store Prefetch

# Touch the cache block of output_buffer for store, then write its first word
lis     r3, output_buffer@ha        # r3 = high-adjusted upper half of the address
la      r3, output_buffer@l(r3)     # r3 = &output_buffer (la is addi with D(rA) operands)
dcbtst  0, r3                       # Hint: the block containing (r3) is about to be stored to
stw     r4, 0(r3)                   # Write r4 to the first word of the touched block

Buffer Initialization

# Fill a 1024-byte buffer with a 32-bit pattern, touching each cache block
# for store before it is written.
#   r3 = buffer base             r4 = byte offset of the current block
#   r5 = buffer size in bytes    r6 = cache block size (this example uses 32)
#   r7 = address of current block
#   r8 = fill pattern
lis r3, init_buffer@ha
addi r3, r3, init_buffer@l
li r4, 0                    # Start offset
li r5, 1024                 # Buffer size
li r6, 32                   # Cache line size

# li accepts only a signed 16-bit immediate, so a full 32-bit constant has to
# be assembled from two halves. The pattern never changes, so build it once
# here instead of on every loop iteration.
lis r8, 0x1234              # r8 = 0x12340000
ori r8, r8, 0x5678          # r8 = 0x12345678 (pattern to store)

init_loop:
    add r7, r3, r4          # Calculate address of this block
    dcbtst 0, r7            # Prefetch for store

    # Initialize cache line (32 bytes = 8 words)
    stw r8, 0(r7)           # Store pattern
    stw r8, 4(r7)
    stw r8, 8(r7)
    stw r8, 12(r7)
    stw r8, 16(r7)
    stw r8, 20(r7)
    stw r8, 24(r7)
    stw r8, 28(r7)

    add r4, r4, r6          # Next cache line
    cmpw r4, r5             # Check if done (signed compare of offset vs. size)
    blt init_loop           # Continue if more data

Memory Copy Optimization

# Copy 2048 bytes from source_addr to dest_addr one 32-byte block at a time,
# touching the source for load and the destination for store a fixed
# distance ahead of the block being copied.
#   r3 = source base             r4 = destination base
#   r5 = byte offset             r6 = total bytes to copy
#   r7 = prefetch distance       r8 / r9 = current source / destination address
#   r0, r10-r12 = data scratch
# Only r0 and r3-r12 are written. r13 is reserved by the common PowerPC ABIs
# and r14-r31 are nonvolatile, so they must not be used as scratch without
# being saved and restored.
lis r3, source_addr@ha
addi r3, r3, source_addr@l
lis r4, dest_addr@ha
addi r4, r4, dest_addr@l
li r5, 0                    # Offset
li r6, 2048                 # Copy size
li r7, 64                   # Prefetch distance

copy_loop:
    add r8, r3, r5          # Source address
    add r9, r4, r5          # Destination address

    # Indexed form: EA = (rA) + (rB), so the look-ahead address needs no
    # separate add. Near the end of the copy these touches land beyond the
    # 2048-byte region; they are hints only and no data is read or written.
    dcbt r8, r7             # Prefetch source + distance for read
    dcbtst r9, r7           # Prefetch dest + distance for write

    # Copy 32 bytes (one cache line) as two groups of four words
    lwz r0, 0(r8)           # r0 is an ordinary data register here (only rA=0 is special)
    lwz r10, 4(r8)
    lwz r11, 8(r8)
    lwz r12, 12(r8)
    stw r0, 0(r9)
    stw r10, 4(r9)
    stw r11, 8(r9)
    stw r12, 12(r9)

    lwz r0, 16(r8)
    lwz r10, 20(r8)
    lwz r11, 24(r8)
    lwz r12, 28(r8)
    stw r0, 16(r9)
    stw r10, 20(r9)
    stw r11, 24(r9)
    stw r12, 28(r9)

    addi r5, r5, 32         # Next cache line
    cmpw r5, r6             # Check bounds: register-register compare (cmpwi takes an immediate)
    blt copy_loop           # Continue copying

Array Processing with Store Prefetch

# Double each of 512 words from input_array into output_array, touching the
# output for store a fixed byte distance ahead of the current element.
#   r3 = input base      r4 = output base
#   r5 = element index   r6 = element count
#   r7 = look-ahead distance in bytes
lis     r3, input_array@ha
addi    r3, r3, input_array@l       # r3 = &input_array
lis     r4, output_array@ha
addi    r4, r4, output_array@l      # r4 = &output_array
li      r5, 0                       # index = 0
li      r6, 512                     # number of elements
li      r7, 128                     # look-ahead: 128 bytes = 32 four-byte elements

process_loop:
    mulli   r8, r5, 4               # r8 = index * 4 (byte offset of this element)
    add     r10, r4, r8             # r10 = address of output element
    add     r9, r3, r8              # r9  = address of input element
    add     r11, r10, r7            # r11 = output address + look-ahead

    # NOTE(review): this touch runs once per element, so every cache block
    # is hinted repeatedly; confirm whether once per block would suffice.
    dcbtst  0, r11                  # Hint: block at r11 will be stored to

    lwz     r12, 0(r9)              # Fetch input word
    slwi    r12, r12, 1             # Example transform: value * 2
    stw     r12, 0(r10)             # Write result

    addi    r5, r5, 1               # index++
    cmpw    r5, r6                  # index < count ?
    blt     process_loop            # Loop while elements remain

Related Instructions

dcba, dcbf, dcbi, dcbst, dcbt, dcbz

Back to Index