/* SPDX-License-Identifier: MIT */
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

// This shader is to clean LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 256-Dword cleaner shader.

// GFX10.1 : Clear SGPRs, VGPRs and LDS
//   Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot
//   Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD
//   Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS)
//      It takes 2 workgroups to use all of LDS: one on each CU of the WGP
//   Each wave clears SGPRs 0 - 107
//   Each wave clears VGPRs 0 - 63
//   The first wave of the workgroup clears its 64KB of LDS
//   The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
//       before any wave in the workgroup could end.
//       Without this, it is possible not all SGPRs get cleared.


shader main
  asic(GFX10.1)
  type(CS)
  wave_size(32)
// Note: original source code from SQ team

//
// Create 32 waves in a threadgroup (CS waves)
// Each allocates 64 VGPRs
// The workgroup allocates all of LDS (64kbytes)
//
// Takes about 2500 clocks to run.
//   (theoretical fastest = 1024clks vgpr + 640lds = ~1660 clks)
//
// Register roles inside this shader:
//   s0 : input word; compared against 1 to decide whether VGPRs/LDS are cleared
//   s2 : scratch (loop counters, first_wave mask)
//   m0 : index register for the relative writes (v_movreld / s_movreld)
//   v1 : per-lane LDS byte address during the LDS clear
//
// Loop idiom used throughout: s_sub_u32 sets SCC on unsigned borrow, so
// "s_cbranch_scc0 <loop>" keeps looping until the counter wraps below zero.
//
  S_BARRIER
  s_cmp_eq_u32    s0, 1                             // scc = (s0 == 1); per the original note, FW sets COMPUTE_USER_DATA_0 to request VGPR/LDS clearing
  s_cbranch_scc0  label_0023                        // s0 != 1: skip the VGPR and LDS clearing, go straight to the SGPR clear

  s_mov_b32       s2, 0x00000038                    // Loop 64/8=8 times (loop unrolled for performance); 0x38 = 56 is the next m0 base
  s_mov_b32       m0, 0                             // First pass writes v0..v7
  //
  // CLEAR VGPRs
  //
  // Each v_movreld_b32 vN, 0 writes v[N + m0]. m0 takes the values
  // 0, 56, 48, 40, 32, 24, 16, 8 over the 8 passes, covering v0..v63.
  //
label_0005:
  v_movreld_b32   v0, 0
  v_movreld_b32   v1, 0
  v_movreld_b32   v2, 0
  v_movreld_b32   v3, 0
  v_movreld_b32   v4, 0
  v_movreld_b32   v5, 0
  v_movreld_b32   v6, 0
  v_movreld_b32   v7, 0
  s_mov_b32       m0, s2                            // Base index for the next pass
  s_sub_u32       s2, s2, 8                         // Borrow (scc = 1) once s2 was already 0
  s_cbranch_scc0  label_0005
  //
  s_mov_b32       s2, 0x80000000                    // Bit31 is first_wave
  s_and_b32       s2, s2, s0                        // scc = (result != 0). Original note: sgpr0 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set
  // NOTE(review): this point is only reached when s0 == 1 (see s_cmp_eq_u32
  // above) and s0 is not modified in between, so bit 31 of s0 is 0 and the
  // branch below is always taken as written -- the LDS clear never runs.
  // Confirm which SGPR really carries the first_wave bit on this ASIC.
  s_cbranch_scc0  label_0023                        // Not the first wave of the ThreadGroup/WorkGroup: skip the LDS clear
  //
  // CLEAR LDS
  //
  s_mov_b32       exec_lo, 0xffffffff               // Enable every lane
  s_mov_b32       exec_hi, 0xffffffff
  v_mbcnt_lo_u32_b32  v1, exec_hi, 0                // Set V1 to thread-ID; the mask is all ones, so this yields the lane index
  v_mbcnt_hi_u32_b32  v1, exec_lo, v1               // Set V1 to thread-ID (presumably 0..31 here, since the shader is wave32 -- the original comment said 0..63)
  v_mul_u32_u24   v1, 0x00000008, v1                // * 8, so each thread is a double-dword address (8byte)
  s_mov_b32       s2, 0x0000003f                    // 64 loop iterations (63 down to 0, exit on borrow)
  s_mov_b32       m0, 0xffffffff                    // presumably the LDS address limit (no clamping) -- TODO confirm
  // Clear all of LDS space
  // Each FirstWave of WorkGroup clears 64kbyte block
  //
  // ds_write2_b64 offsets are in 8-byte units: the four stores of one pass hit
  // byte offsets +0, +256, +512 and +768 from v1, and v1 then advances by
  // 0x400 (1024) bytes. v[2:3] and v[4:5] were zeroed by the VGPR clear above.

label_001F:
  ds_write2_b64   v1, v[2:3], v[2:3] offset1:32
  ds_write2_b64   v1, v[4:5], v[4:5] offset0:64 offset1:96
  v_add_co_u32    v1, vcc, 0x00000400, v1           // Next 1KB slice; carry-out lands in vcc (vcc is cleared at the end)
  s_sub_u32       s2, s2, 1
  s_cbranch_scc0  label_001F
  // NOTE(review): v1 still holds the last LDS address here and is not zeroed
  // again before s_endpgm.

  //
  // CLEAR SGPRs
  //
  // Each s_movreld_b32 sN, 0 writes s[N + m0]. m0 steps 104, 100, ..., 0, so
  // the passes cover s104..s107 down to s0..s3 (s0 and s2 themselves go last).
  //
label_0023:
  s_mov_b32       m0, 0x00000068                    // Loop 108/4=27 times (loop unrolled for performance); 0x68 = 104
label_sgpr_loop:
  s_movreld_b32   s0, 0
  s_movreld_b32   s1, 0
  s_movreld_b32   s2, 0
  s_movreld_b32   s3, 0
  s_sub_u32       m0, m0, 4                         // Borrow (scc = 1) once m0 was already 0
  s_cbranch_scc0  label_sgpr_loop
  // NOTE(review): m0 is left at 0xfffffffc (0 - 4 wrapped) after this loop.

  //clear vcc
  s_mov_b64       vcc, 0          //clear vcc
  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_lo, 0   //clear  flat scratch lo SGPR
  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_hi, 0   //clear  flat scratch hi SGPR
  s_mov_b64       ttmp0, 0        //Clear ttmp0 and ttmp1
  s_mov_b64       ttmp2, 0        //Clear ttmp2 and ttmp3
  s_mov_b64       ttmp4, 0        //Clear ttmp4 and ttmp5
  s_mov_b64       ttmp6, 0        //Clear ttmp6 and ttmp7
  s_mov_b64       ttmp8, 0        //Clear ttmp8 and ttmp9
  s_mov_b64       ttmp10, 0       //Clear ttmp10 and ttmp11
  s_mov_b64       ttmp12, 0       //Clear ttmp12 and ttmp13
  s_mov_b64       ttmp14, 0       //Clear ttmp14 and ttmp15

  s_endpgm

end