xref: /qemu/tests/tcg/hexagon/scatter_gather.c (revision 727385c4e13e1a5a985124a20a2370855141111d)
1  /*
2   *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3   *
4   *  This program is free software; you can redistribute it and/or modify
5   *  it under the terms of the GNU General Public License as published by
6   *  the Free Software Foundation; either version 2 of the License, or
7   *  (at your option) any later version.
8   *
9   *  This program is distributed in the hope that it will be useful,
10   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   *  GNU General Public License for more details.
13   *
14   *  You should have received a copy of the GNU General Public License
15   *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16   */
17  
18  /*
19   * This example tests the HVX scatter/gather instructions
20   *
21   * See section 5.13 of the V68 HVX Programmer's Reference
22   *
23   * There are 3 main classes of operations
24   *     _16                 16-bit elements and 16-bit offsets
25   *     _32                 32-bit elements and 32-bit offsets
26   *     _16_32              16-bit elements and 32-bit offsets
27   *
28   * There are also masked and accumulate versions
29   */
30  
31  #include <stdio.h>
32  #include <string.h>
33  #include <stdlib.h>
34  #include <inttypes.h>
35  
/*
 * Vector types handed to the HVX builtins.  A single vector and a vector
 * predicate are both declared as 128 byte vectors of long; a vector pair is
 * 256 bytes.  All three are 128 byte aligned.
 */
typedef long HVX_Vector
    __attribute__((__vector_size__(128), aligned(128)));
typedef long HVX_VectorPair
    __attribute__((__vector_size__(256), aligned(128)));
typedef long HVX_VectorPred
    __attribute__((__vector_size__(128), aligned(128)));
42  
/*
 * Wrappers around the HVX scatter builtins.
 *     MASK    predicate vector (masked forms only)
 *     BASE    address of the scatter buffer, converted to int for the builtin
 *     RGN     region length
 *     OFF     vector (or vector pair) of offsets
 *     VALS    vector of values
 * The _ACC forms use the scatter-accumulate (_add) builtins.
 *
 * Every argument is parenthesized so that an expression argument is
 * evaluated as a unit; in particular the (int) cast applies to the whole
 * BASE expression rather than only to its first operand.
 */
#define VSCATTER_16(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhq_128B((MASK), (int)(BASE), (RGN), \
                                          (OFF), (VALS))
#define VSCATTER_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermwq_128B((MASK), (int)(BASE), (RGN), \
                                          (OFF), (VALS))
#define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhwq_128B((MASK), (int)(BASE), (RGN), \
                                           (OFF), (VALS))
#define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_add_128B((int)(BASE), (RGN), (OFF), \
                                             (VALS))
#define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_add_128B((int)(BASE), (RGN), (OFF), \
                                             (VALS))
#define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_add_128B((int)(BASE), (RGN), (OFF), \
                                              (VALS))
61  
/*
 * Wrappers around the HVX gather builtins.
 *     DSTADDR address the gathered vector is written to
 *     MASK    predicate vector (masked forms only)
 *     BASE    address of the buffer to gather from, converted to int
 *     RGN     region length
 *     OFF     vector (or vector pair) of offsets
 *
 * Every argument is parenthesized so that an expression argument is
 * evaluated as a unit; in particular the (int) cast applies to the whole
 * BASE expression rather than only to its first operand.
 */
#define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermh_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhq_128B((DSTADDR), (MASK), (int)(BASE), \
                                         (RGN), (OFF))
#define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermw_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermwq_128B((DSTADDR), (MASK), (int)(BASE), \
                                         (RGN), (OFF))
#define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhw_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhwq_128B((DSTADDR), (MASK), (int)(BASE), \
                                          (RGN), (OFF))
74  
/*
 * Short names for the remaining HVX builtins used by the test:
 * halfword shuffle and deal, halfword splat of a scalar, and the
 * vector-and-scalar operation used to build a predicate.
 */
#define VSHUFF_H(V)         __builtin_HEXAGON_V6_vshuffh_128B(V)
#define VSPLAT_H(X)         __builtin_HEXAGON_V6_lvsplath_128B(X)
#define VAND_VAL(PRED, VAL) __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
#define VDEAL_H(V)          __builtin_HEXAGON_V6_vdealh_128B(V)
83  
/* number of rows (and columns) of the square matrix */
#define MATRIX_SIZE 64

/* number of elements in each scatter buffer: one full matrix */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/* count of mismatches reported by check_buffer() */
int err;
91  
92  /* fake vtcm - put buffers together and force alignment */
93  static struct {
94      unsigned short vscatter16[SCATTER_BUFFER_SIZE];
95      unsigned short vgather16[MATRIX_SIZE];
96      unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
97      unsigned int   vgather32[MATRIX_SIZE];
98      unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
99      unsigned short vgather16_32[MATRIX_SIZE];
100  } vtcm __attribute__((aligned(0x10000)));
101  
102  /* declare the arrays of reference values */
103  unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
104  unsigned short vgather16_ref[MATRIX_SIZE];
105  unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
106  unsigned int   vgather32_ref[MATRIX_SIZE];
107  unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
108  unsigned short vgather16_32_ref[MATRIX_SIZE];
109  
110  /* declare the arrays of offsets */
111  unsigned short half_offsets[MATRIX_SIZE];
112  unsigned int   word_offsets[MATRIX_SIZE];
113  
114  /* declare the arrays of values */
115  unsigned short half_values[MATRIX_SIZE];
116  unsigned short half_values_acc[MATRIX_SIZE];
117  unsigned short half_values_masked[MATRIX_SIZE];
118  unsigned int   word_values[MATRIX_SIZE];
119  unsigned int   word_values_acc[MATRIX_SIZE];
120  unsigned int   word_values_masked[MATRIX_SIZE];
121  
122  /* declare the arrays of predicates */
123  unsigned short half_predicates[MATRIX_SIZE];
124  unsigned int   word_predicates[MATRIX_SIZE];
125  
126  /* make this big enough for all the intrinsics */
127  const size_t region_len = sizeof(vtcm);
128  
129  /* optionally add sync instructions */
130  #define SYNC_VECTOR 1
131  
132  static void sync_scatter(void *addr)
133  {
134  #if SYNC_VECTOR
135      /*
136       * Do the scatter release followed by a dummy load to complete the
137       * synchronization.  Normally the dummy load would be deferred as
138       * long as possible to minimize stalls.
139       */
140      asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
141      /* use volatile to force the load */
142      volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
143  #endif
144  }
145  
/*
 * Synchronize after a gather into the vector at addr by loading it back.
 * Does nothing when SYNC_VECTOR is 0.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* the volatile destination forces the load to be emitted */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr;
    vDummy = vDummy;
#endif
}
153  
/* set to 1 to have the print_* functions dump the buffers */
#define PRINT_DATA 0

/* byte used to pre-fill the vtcm and reference scatter buffers */
#define FILL_CHAR       '.'
158  
159  /* fill vtcm scratch with ee */
160  void prefill_vtcm_scratch(void)
161  {
162      memset(&vtcm, FILL_CHAR, sizeof(vtcm));
163  }
164  
165  /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
166  void create_offsets_values_preds_16(void)
167  {
168      unsigned short half_element = 0;
169      unsigned short half_element_masked = 0;
170      char letter = 'A';
171      char letter_masked = '@';
172  
173      for (int i = 0; i < MATRIX_SIZE; i++) {
174          half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
175  
176          half_element = 0;
177          half_element_masked = 0;
178          for (int j = 0; j < 2; j++) {
179              half_element |= letter << j * 8;
180              half_element_masked |= letter_masked << j * 8;
181          }
182  
183          half_values[i] = half_element;
184          half_values_acc[i] = ((i % 10) << 8) + (i % 10);
185          half_values_masked[i] = half_element_masked;
186  
187          letter++;
188          /* reset to 'A' */
189          if (letter == 'M') {
190              letter = 'A';
191          }
192  
193          half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
194      }
195  }
196  
197  /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
198  void create_offsets_values_preds_32(void)
199  {
200      unsigned int word_element = 0;
201      unsigned int word_element_masked = 0;
202      char letter = 'A';
203      char letter_masked = '&';
204  
205      for (int i = 0; i < MATRIX_SIZE; i++) {
206          word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
207  
208          word_element = 0;
209          word_element_masked = 0;
210          for (int j = 0; j < 4; j++) {
211              word_element |= letter << j * 8;
212              word_element_masked |= letter_masked << j * 8;
213          }
214  
215          word_values[i] = word_element;
216          word_values_acc[i] = ((i % 10) << 8) + (i % 10);
217          word_values_masked[i] = word_element_masked;
218  
219          letter++;
220          /* reset to 'A' */
221          if (letter == 'M') {
222              letter = 'A';
223          }
224  
225          word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
226      }
227  }
228  
229  /*
230   * create byte offsets to be a diagonal of the matrix with 16 bit elements
231   * and 32 bit offsets
232   */
233  void create_offsets_values_preds_16_32(void)
234  {
235      unsigned short half_element = 0;
236      unsigned short half_element_masked = 0;
237      char letter = 'D';
238      char letter_masked = '$';
239  
240      for (int i = 0; i < MATRIX_SIZE; i++) {
241          word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
242  
243          half_element = 0;
244          half_element_masked = 0;
245          for (int j = 0; j < 2; j++) {
246              half_element |= letter << j * 8;
247              half_element_masked |= letter_masked << j * 8;
248          }
249  
250          half_values[i] = half_element;
251          half_values_acc[i] = ((i % 10) << 8) + (i % 10);
252          half_values_masked[i] = half_element_masked;
253  
254          letter++;
255          /* reset to 'A' */
256          if (letter == 'P') {
257              letter = 'D';
258          }
259  
260          half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
261      }
262  }
263  
264  /* scatter the 16 bit elements using intrinsics */
265  void vector_scatter_16(void)
266  {
267      /* copy the offsets and values to vectors */
268      HVX_Vector offsets = *(HVX_Vector *)half_offsets;
269      HVX_Vector values = *(HVX_Vector *)half_values;
270  
271      VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values);
272  
273      sync_scatter(vtcm.vscatter16);
274  }
275  
276  /* scatter-accumulate the 16 bit elements using intrinsics */
277  void vector_scatter_16_acc(void)
278  {
279      /* copy the offsets and values to vectors */
280      HVX_Vector offsets = *(HVX_Vector *)half_offsets;
281      HVX_Vector values = *(HVX_Vector *)half_values_acc;
282  
283      VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values);
284  
285      sync_scatter(vtcm.vscatter16);
286  }
287  
288  /* scatter the 16 bit elements using intrinsics */
289  void vector_scatter_16_masked(void)
290  {
291      /* copy the offsets and values to vectors */
292      HVX_Vector offsets = *(HVX_Vector *)half_offsets;
293      HVX_Vector values = *(HVX_Vector *)half_values_masked;
294      HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
295      HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
296  
297      VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values);
298  
299      sync_scatter(vtcm.vscatter16);
300  }
301  
302  /* scatter the 32 bit elements using intrinsics */
303  void vector_scatter_32(void)
304  {
305      /* copy the offsets and values to vectors */
306      HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
307      HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
308      HVX_Vector valueslo = *(HVX_Vector *)word_values;
309      HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2];
310  
311      VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo);
312      VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi);
313  
314      sync_scatter(vtcm.vscatter32);
315  }
316  
317  /* scatter-acc the 32 bit elements using intrinsics */
318  void vector_scatter_32_acc(void)
319  {
320      /* copy the offsets and values to vectors */
321      HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
322      HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
323      HVX_Vector valueslo = *(HVX_Vector *)word_values_acc;
324      HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
325  
326      VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo);
327      VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi);
328  
329      sync_scatter(vtcm.vscatter32);
330  }
331  
332  /* scatter the 32 bit elements using intrinsics */
333  void vector_scatter_32_masked(void)
334  {
335      /* copy the offsets and values to vectors */
336      HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
337      HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
338      HVX_Vector valueslo = *(HVX_Vector *)word_values_masked;
339      HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
340      HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
341      HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
342      HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
343      HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
344  
345      VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo,
346                         valueslo);
347      VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi,
348                         valueshi);
349  
350      sync_scatter(vtcm.vscatter16);
351  }
352  
353  /* scatter the 16 bit elements with 32 bit offsets using intrinsics */
354  void vector_scatter_16_32(void)
355  {
356      HVX_VectorPair offsets;
357      HVX_Vector values;
358  
359      /* get the word offsets in a vector pair */
360      offsets = *(HVX_VectorPair *)word_offsets;
361  
362      /* these values need to be shuffled for the scatter */
363      values = *(HVX_Vector *)half_values;
364      values = VSHUFF_H(values);
365  
366      VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values);
367  
368      sync_scatter(vtcm.vscatter16_32);
369  }
370  
371  /* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
372  void vector_scatter_16_32_acc(void)
373  {
374      HVX_VectorPair offsets;
375      HVX_Vector values;
376  
377      /* get the word offsets in a vector pair */
378      offsets = *(HVX_VectorPair *)word_offsets;
379  
380      /* these values need to be shuffled for the scatter */
381      values = *(HVX_Vector *)half_values_acc;
382      values = VSHUFF_H(values);
383  
384      VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values);
385  
386      sync_scatter(vtcm.vscatter16_32);
387  }
388  
389  /* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
390  void vector_scatter_16_32_masked(void)
391  {
392      HVX_VectorPair offsets;
393      HVX_Vector values;
394      HVX_Vector pred_reg;
395  
396      /* get the word offsets in a vector pair */
397      offsets = *(HVX_VectorPair *)word_offsets;
398  
399      /* these values need to be shuffled for the scatter */
400      values = *(HVX_Vector *)half_values_masked;
401      values = VSHUFF_H(values);
402  
403      pred_reg = *(HVX_Vector *)half_predicates;
404      pred_reg = VSHUFF_H(pred_reg);
405      HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
406  
407      VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets,
408                            values);
409  
410      sync_scatter(vtcm.vscatter16_32);
411  }
412  
413  /* gather the elements from the scatter16 buffer */
414  void vector_gather_16(void)
415  {
416      HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
417      HVX_Vector offsets = *(HVX_Vector *)half_offsets;
418  
419      VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets);
420  
421      sync_gather(vgather);
422  }
423  
/*
 * Initial value for the elements of a masked 16 bit gather destination:
 * '?' in both bytes.
 */
static unsigned short gather_16_masked_init(void)
{
    const char fill = '?';
    unsigned short pattern = fill;

    pattern |= fill << 8;
    return pattern;
}
429  
430  void vector_gather_16_masked(void)
431  {
432      HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
433      HVX_Vector offsets = *(HVX_Vector *)half_offsets;
434      HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
435      HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
436  
437      *vgather = VSPLAT_H(gather_16_masked_init());
438      VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets);
439  
440      sync_gather(vgather);
441  }
442  
443  /* gather the elements from the scatter32 buffer */
444  void vector_gather_32(void)
445  {
446      HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
447      HVX_Vector *vgatherhi =
448          (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
449      HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
450      HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
451  
452      VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo);
453      VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi);
454  
455      sync_gather(vgatherhi);
456  }
457  
/*
 * Initial value for the elements of a masked 32 bit gather destination:
 * '?' in all four bytes.
 */
static unsigned int gather_32_masked_init(void)
{
    const char fill = '?';
    unsigned int pattern = 0;

    for (int shift = 0; shift < 32; shift += 8) {
        pattern |= (unsigned int)fill << shift;
    }
    return pattern;
}
463  
464  void vector_gather_32_masked(void)
465  {
466      HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
467      HVX_Vector *vgatherhi =
468          (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
469      HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
470      HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
471      HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
472      HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
473      HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
474      HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
475  
476      *vgatherlo = VSPLAT_H(gather_32_masked_init());
477      *vgatherhi = VSPLAT_H(gather_32_masked_init());
478      VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len,
479                        offsetslo);
480      VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len,
481                        offsetshi);
482  
483      sync_gather(vgatherlo);
484      sync_gather(vgatherhi);
485  }
486  
487  /* gather the elements from the scatter16_32 buffer */
488  void vector_gather_16_32(void)
489  {
490      HVX_Vector *vgather;
491      HVX_VectorPair offsets;
492      HVX_Vector values;
493  
494      /* get the vtcm address to gather from */
495      vgather = (HVX_Vector *)&vtcm.vgather16_32;
496  
497      /* get the word offsets in a vector pair */
498      offsets = *(HVX_VectorPair *)word_offsets;
499  
500      VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets);
501  
502      /* deal the elements to get the order back */
503      values = *(HVX_Vector *)vgather;
504      values = VDEAL_H(values);
505  
506      /* write it back to vtcm address */
507      *(HVX_Vector *)vgather = values;
508  }
509  
510  void vector_gather_16_32_masked(void)
511  {
512      HVX_Vector *vgather;
513      HVX_VectorPair offsets;
514      HVX_Vector pred_reg;
515      HVX_VectorPred preds;
516      HVX_Vector values;
517  
518      /* get the vtcm address to gather from */
519      vgather = (HVX_Vector *)&vtcm.vgather16_32;
520  
521      /* get the word offsets in a vector pair */
522      offsets = *(HVX_VectorPair *)word_offsets;
523      pred_reg = *(HVX_Vector *)half_predicates;
524      pred_reg = VSHUFF_H(pred_reg);
525      preds = VAND_VAL(pred_reg, ~0);
526  
527     *vgather = VSPLAT_H(gather_16_masked_init());
528     VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len,
529                          offsets);
530  
531      /* deal the elements to get the order back */
532      values = *(HVX_Vector *)vgather;
533      values = VDEAL_H(values);
534  
535      /* write it back to vtcm address */
536      *(HVX_Vector *)vgather = values;
537  }
538  
539  static void check_buffer(const char *name, void *c, void *r, size_t size)
540  {
541      char *check = (char *)c;
542      char *ref = (char *)r;
543      for (int i = 0; i < size; i++) {
544          if (check[i] != ref[i]) {
545              printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
546                     check[i], check[i], ref[i], ref[i]);
547              err++;
548          }
549      }
550  }
551  
552  /*
553   * These scalar functions are the C equivalents of the vector functions that
554   * use HVX
555   */
556  
557  /* scatter the 16 bit elements using C */
558  void scalar_scatter_16(unsigned short *vscatter16)
559  {
560      for (int i = 0; i < MATRIX_SIZE; ++i) {
561          vscatter16[half_offsets[i] / 2] = half_values[i];
562      }
563  }
564  
565  void check_scatter_16()
566  {
567      memset(vscatter16_ref, FILL_CHAR,
568             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
569      scalar_scatter_16(vscatter16_ref);
570      check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
571                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
572  }
573  
574  /* scatter the 16 bit elements using C */
575  void scalar_scatter_16_acc(unsigned short *vscatter16)
576  {
577      for (int i = 0; i < MATRIX_SIZE; ++i) {
578          vscatter16[half_offsets[i] / 2] += half_values_acc[i];
579      }
580  }
581  
582  void check_scatter_16_acc()
583  {
584      memset(vscatter16_ref, FILL_CHAR,
585             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
586      scalar_scatter_16(vscatter16_ref);
587      scalar_scatter_16_acc(vscatter16_ref);
588      check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
589                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
590  }
591  
592  /* scatter the 16 bit elements using C */
593  void scalar_scatter_16_masked(unsigned short *vscatter16)
594  {
595      for (int i = 0; i < MATRIX_SIZE; i++) {
596          if (half_predicates[i]) {
597              vscatter16[half_offsets[i] / 2] = half_values_masked[i];
598          }
599      }
600  
601  }
602  
603  void check_scatter_16_masked()
604  {
605      memset(vscatter16_ref, FILL_CHAR,
606             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
607      scalar_scatter_16(vscatter16_ref);
608      scalar_scatter_16_acc(vscatter16_ref);
609      scalar_scatter_16_masked(vscatter16_ref);
610      check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
611                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612  }
613  
614  /* scatter the 32 bit elements using C */
615  void scalar_scatter_32(unsigned int *vscatter32)
616  {
617      for (int i = 0; i < MATRIX_SIZE; ++i) {
618          vscatter32[word_offsets[i] / 4] = word_values[i];
619      }
620  }
621  
622  void check_scatter_32()
623  {
624      memset(vscatter32_ref, FILL_CHAR,
625             SCATTER_BUFFER_SIZE * sizeof(unsigned int));
626      scalar_scatter_32(vscatter32_ref);
627      check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
628                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
629  }
630  
631  /* scatter the 32 bit elements using C */
632  void scalar_scatter_32_acc(unsigned int *vscatter32)
633  {
634      for (int i = 0; i < MATRIX_SIZE; ++i) {
635          vscatter32[word_offsets[i] / 4] += word_values_acc[i];
636      }
637  }
638  
639  void check_scatter_32_acc()
640  {
641      memset(vscatter32_ref, FILL_CHAR,
642             SCATTER_BUFFER_SIZE * sizeof(unsigned int));
643      scalar_scatter_32(vscatter32_ref);
644      scalar_scatter_32_acc(vscatter32_ref);
645      check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
646                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
647  }
648  
649  /* scatter the 32 bit elements using C */
650  void scalar_scatter_32_masked(unsigned int *vscatter32)
651  {
652      for (int i = 0; i < MATRIX_SIZE; i++) {
653          if (word_predicates[i]) {
654              vscatter32[word_offsets[i] / 4] = word_values_masked[i];
655          }
656      }
657  }
658  
659  void check_scatter_32_masked()
660  {
661      memset(vscatter32_ref, FILL_CHAR,
662             SCATTER_BUFFER_SIZE * sizeof(unsigned int));
663      scalar_scatter_32(vscatter32_ref);
664      scalar_scatter_32_acc(vscatter32_ref);
665      scalar_scatter_32_masked(vscatter32_ref);
666      check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
667                    SCATTER_BUFFER_SIZE * sizeof(unsigned int));
668  }
669  
670  /* scatter the 32 bit elements using C */
671  void scalar_scatter_16_32(unsigned short *vscatter16_32)
672  {
673      for (int i = 0; i < MATRIX_SIZE; ++i) {
674          vscatter16_32[word_offsets[i] / 2] = half_values[i];
675      }
676  }
677  
678  void check_scatter_16_32()
679  {
680      memset(vscatter16_32_ref, FILL_CHAR,
681             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
682      scalar_scatter_16_32(vscatter16_32_ref);
683      check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
684                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
685  }
686  
687  /* scatter the 32 bit elements using C */
688  void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
689  {
690      for (int i = 0; i < MATRIX_SIZE; ++i) {
691          vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
692      }
693  }
694  
695  void check_scatter_16_32_acc()
696  {
697      memset(vscatter16_32_ref, FILL_CHAR,
698             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
699      scalar_scatter_16_32(vscatter16_32_ref);
700      scalar_scatter_16_32_acc(vscatter16_32_ref);
701      check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
702                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
703  }
704  
705  void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
706  {
707      for (int i = 0; i < MATRIX_SIZE; i++) {
708          if (half_predicates[i]) {
709              vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
710          }
711      }
712  }
713  
714  void check_scatter_16_32_masked()
715  {
716      memset(vscatter16_32_ref, FILL_CHAR,
717             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
718      scalar_scatter_16_32(vscatter16_32_ref);
719      scalar_scatter_16_32_acc(vscatter16_32_ref);
720      scalar_scatter_16_32_masked(vscatter16_32_ref);
721      check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
722                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
723  }
724  
725  /* gather the elements from the scatter buffer using C */
726  void scalar_gather_16(unsigned short *vgather16)
727  {
728      for (int i = 0; i < MATRIX_SIZE; ++i) {
729          vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
730      }
731  }
732  
733  void check_gather_16()
734  {
735        memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
736        scalar_gather_16(vgather16_ref);
737        check_buffer(__func__, vtcm.vgather16, vgather16_ref,
738                     MATRIX_SIZE * sizeof(unsigned short));
739  }
740  
741  void scalar_gather_16_masked(unsigned short *vgather16)
742  {
743      for (int i = 0; i < MATRIX_SIZE; ++i) {
744          if (half_predicates[i]) {
745              vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
746          }
747      }
748  }
749  
750  void check_gather_16_masked()
751  {
752      memset(vgather16_ref, gather_16_masked_init(),
753             MATRIX_SIZE * sizeof(unsigned short));
754      scalar_gather_16_masked(vgather16_ref);
755      check_buffer(__func__, vtcm.vgather16, vgather16_ref,
756                   MATRIX_SIZE * sizeof(unsigned short));
757  }
758  
759  /* gather the elements from the scatter buffer using C */
760  void scalar_gather_32(unsigned int *vgather32)
761  {
762      for (int i = 0; i < MATRIX_SIZE; ++i) {
763          vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
764      }
765  }
766  
767  void check_gather_32(void)
768  {
769      memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
770      scalar_gather_32(vgather32_ref);
771      check_buffer(__func__, vtcm.vgather32, vgather32_ref,
772                   MATRIX_SIZE * sizeof(unsigned int));
773  }
774  
775  void scalar_gather_32_masked(unsigned int *vgather32)
776  {
777      for (int i = 0; i < MATRIX_SIZE; ++i) {
778          if (word_predicates[i]) {
779              vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
780          }
781      }
782  }
783  
784  
785  void check_gather_32_masked(void)
786  {
787      memset(vgather32_ref, gather_32_masked_init(),
788             MATRIX_SIZE * sizeof(unsigned int));
789      scalar_gather_32_masked(vgather32_ref);
790      check_buffer(__func__, vtcm.vgather32,
791                   vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
792  }
793  
794  /* gather the elements from the scatter buffer using C */
795  void scalar_gather_16_32(unsigned short *vgather16_32)
796  {
797      for (int i = 0; i < MATRIX_SIZE; ++i) {
798          vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
799      }
800  }
801  
802  void check_gather_16_32(void)
803  {
804      memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
805      scalar_gather_16_32(vgather16_32_ref);
806      check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
807                   MATRIX_SIZE * sizeof(unsigned short));
808  }
809  
810  void scalar_gather_16_32_masked(unsigned short *vgather16_32)
811  {
812      for (int i = 0; i < MATRIX_SIZE; ++i) {
813          if (half_predicates[i]) {
814              vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
815          }
816      }
817  
818  }
819  
820  void check_gather_16_32_masked(void)
821  {
822      memset(vgather16_32_ref, gather_16_masked_init(),
823             MATRIX_SIZE * sizeof(unsigned short));
824      scalar_gather_16_32_masked(vgather16_32_ref);
825      check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
826                   MATRIX_SIZE * sizeof(unsigned short));
827  }
828  
829  /* print scatter16 buffer */
830  void print_scatter16_buffer(void)
831  {
832      if (PRINT_DATA) {
833          printf("\n\nPrinting the 16 bit scatter buffer");
834  
835          for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
836              if ((i % MATRIX_SIZE) == 0) {
837                  printf("\n");
838              }
839              for (int j = 0; j < 2; j++) {
840                  printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
841              }
842              printf(" ");
843          }
844          printf("\n");
845      }
846  }
847  
848  /* print the gather 16 buffer */
849  void print_gather_result_16(void)
850  {
851      if (PRINT_DATA) {
852          printf("\n\nPrinting the 16 bit gather result\n");
853  
854          for (int i = 0; i < MATRIX_SIZE; i++) {
855              for (int j = 0; j < 2; j++) {
856                  printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
857              }
858              printf(" ");
859          }
860          printf("\n");
861      }
862  }
863  
864  /* print the scatter32 buffer */
865  void print_scatter32_buffer(void)
866  {
867      if (PRINT_DATA) {
868          printf("\n\nPrinting the 32 bit scatter buffer");
869  
870          for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
871              if ((i % MATRIX_SIZE) == 0) {
872                  printf("\n");
873              }
874              for (int j = 0; j < 4; j++) {
875                  printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
876              }
877              printf(" ");
878          }
879          printf("\n");
880      }
881  }
882  
883  /* print the gather 32 buffer */
884  void print_gather_result_32(void)
885  {
886      if (PRINT_DATA) {
887          printf("\n\nPrinting the 32 bit gather result\n");
888  
889          for (int i = 0; i < MATRIX_SIZE; i++) {
890              for (int j = 0; j < 4; j++) {
891                  printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
892              }
893              printf(" ");
894          }
895          printf("\n");
896      }
897  }
898  
899  /* print the scatter16_32 buffer */
900  void print_scatter16_32_buffer(void)
901  {
902      if (PRINT_DATA) {
903          printf("\n\nPrinting the 16_32 bit scatter buffer");
904  
905          for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
906              if ((i % MATRIX_SIZE) == 0) {
907                  printf("\n");
908              }
909              for (int j = 0; j < 2; j++) {
910                  printf("%c",
911                        (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
912              }
913              printf(" ");
914          }
915          printf("\n");
916      }
917  }
918  
919  /* print the gather 16_32 buffer */
920  void print_gather_result_16_32(void)
921  {
922      if (PRINT_DATA) {
923          printf("\n\nPrinting the 16_32 bit gather result\n");
924  
925          for (int i = 0; i < MATRIX_SIZE; i++) {
926              for (int j = 0; j < 2; j++) {
927                  printf("%c",
928                         (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
929              }
930              printf(" ");
931          }
932          printf("\n");
933      }
934  }
935  
936  int main()
937  {
938      prefill_vtcm_scratch();
939  
940      /* 16 bit elements with 16 bit offsets */
941      create_offsets_values_preds_16();
942  
943      vector_scatter_16();
944      print_scatter16_buffer();
945      check_scatter_16();
946  
947      vector_gather_16();
948      print_gather_result_16();
949      check_gather_16();
950  
951      vector_gather_16_masked();
952      print_gather_result_16();
953      check_gather_16_masked();
954  
955      vector_scatter_16_acc();
956      print_scatter16_buffer();
957      check_scatter_16_acc();
958  
959      vector_scatter_16_masked();
960      print_scatter16_buffer();
961      check_scatter_16_masked();
962  
963      /* 32 bit elements with 32 bit offsets */
964      create_offsets_values_preds_32();
965  
966      vector_scatter_32();
967      print_scatter32_buffer();
968      check_scatter_32();
969  
970      vector_gather_32();
971      print_gather_result_32();
972      check_gather_32();
973  
974      vector_gather_32_masked();
975      print_gather_result_32();
976      check_gather_32_masked();
977  
978      vector_scatter_32_acc();
979      print_scatter32_buffer();
980      check_scatter_32_acc();
981  
982      vector_scatter_32_masked();
983      print_scatter32_buffer();
984      check_scatter_32_masked();
985  
986      /* 16 bit elements with 32 bit offsets */
987      create_offsets_values_preds_16_32();
988  
989      vector_scatter_16_32();
990      print_scatter16_32_buffer();
991      check_scatter_16_32();
992  
993      vector_gather_16_32();
994      print_gather_result_16_32();
995      check_gather_16_32();
996  
997      vector_gather_16_32_masked();
998      print_gather_result_16_32();
999      check_gather_16_32_masked();
1000  
1001      vector_scatter_16_32_acc();
1002      print_scatter16_32_buffer();
1003      check_scatter_16_32_acc();
1004  
1005      vector_scatter_16_32_masked();
1006      print_scatter16_32_buffer();
1007      check_scatter_16_32_masked();
1008  
1009      puts(err ? "FAIL" : "PASS");
1010      return err;
1011  }
1012