Lines Matching +full:left +full:- +full:aligned
1 /* SPDX-License-Identifier: GPL-2.0 */
4 Copyright (c) 2002 Hewlett-Packard Co/CERN
15 we get to a 16B-aligned address, then loop on 128 B chunks using an
42 // This routine uses only scratch predicate registers (p6 - p15)
43 #define p_scr p6 // default register for same-cycle branches
72 and ptr2 = -(MIN1+1), dest // aligned address
93 (p_y) add cnt = -8, cnt //
96 (p_y) st8 [ptr2] = value,-4 //
100 (p_yy) add cnt = -4, cnt //
103 (p_yy) st4 [ptr2] = value,-2 //
108 (p_y) add cnt = -2, cnt //
112 (p_y) st2 [ptr2] = value,-1 //
120 (p_yy) add cnt = -1, cnt //
130 TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
132 and tmp = -(LINE_SIZE), cnt // compute end of range
134 and cnt = (LINE_SIZE-1), cnt // remainder
136 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
140 (p_scr) add loopcnt = -1, linecnt //
145 add tmp = -1, linecnt // next loop count
194 cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
200 .l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
202 and tmp = -(LINE_SIZE), cnt // compute end of range
204 and cnt = (LINE_SIZE-1), cnt // remainder
206 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
210 (p_scr) add loopcnt = -1, linecnt
215 add tmp = -1, linecnt // next loop count
248 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
259 add loopcnt = -1, loopcnt
267 .l2: // ------------------------------------ // L2A: store 32B in 2 cycles
278 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
285 add cnt = -8, cnt // subtract
290 (p_y) add cnt = -8, cnt // subtract
294 (p_yy) add cnt = -8, cnt // subtract
331 (p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
332 (p_y) add cnt = -1, cnt
339 (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
340 (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (or less) left]
341 (p_yy) add cnt = -4, cnt
345 add ptr3 = -1, ptr3 // last store
348 (p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
349 (p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
350 (p_y) add cnt = -4, cnt
353 (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
354 (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
357 (p_yy) add cnt = -4, cnt
360 (p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes