程序代写代做代考 chain Squishy Maps for Soft Body Modelling Using Generalised Chain Mail

Squishy Maps for Soft Body Modelling Using Generalised Chain Mail

KIT308/408 (Advanced) Multicore Architecture and Programming

SIMD Challenges 2
Dr. Ian Lewis
Discipline of ICT, School of TED
University of Tasmania, Australia
1

Breaking alignment for efficiency / code simplicity
Horizontal operations
Shuffling values within a SIMD vector

2
SIMD Challenges

A Worked Example
SIMD Challenges 2

3

This time, a piece of code that actual has various uses
It filters out any values below a given value (limit)
A more often used version of this kind of function would take a filtering function as an argument
But that approach won’t easily work here
Difficult features:
Creating a list that will likely be a different length than the source list
Therefore values need to move across SIMD lanes
unsigned int filter(int* result, const int* v,
unsigned int len, int limit) {
unsigned int index = 0;
for (unsigned int i = 0; i < len; ++i) { if (v[i] >= limit) {
result[index] = v[i];
index++;
}
}
return index;
}

float* dest = new float[SIZE];
float* src = new float[SIZE];
// not shown: fill src with a bunch of values
unsigned int count = filter(dest, src, SIZE, 0);

4
Starting Scalar Code

Difficult features:
Creating a list that will likely be a different length than the source list
Therefore values need to move across SIMD lanes
int SIZE = 24;
float* dest = new float[SIZE];
float* src = { 1, -1, 5, 3, -2, 7, -1, 3, 9, -4, 2, -4,
-4, -4, -5, -6, -9, 5, 3, 4, 0, 3, 3, -1 };

unsigned int count = filter(dest, src, SIZE, 0);

// not shown, a bunch of code to output:
// – all the values in src (SIZE of them)
// – all the useful values in dest (count of them)

input:
1 -1 5 3 -2 7 -1 3 9 -4 2 -4 -4 -4 -5 -6 -9 5 3 4 0 3 3 -1
filtered:
1 5 3 7 3 9 2 5 3 4 0 3 3

5
Example

This approach would unfortunately get us stuck pretty soon
unsigned int filter(int* result, const int* v,
unsigned int len, int limit) {
__m128i limits = _mm_set1_epi32(limit);
unsigned int index = 0;
for (unsigned int i = 0; i < len; i += 4) { for (unsigned int j = 0; j < len; j++) { if (v[i + j] >= limit) {
result[index] = v[i + j];
index++;
}
}
}
return index;
}

6
Starting Scalar Code – Normal Way

What’s correct here?
Reading values from the v array
Correctly calculating the comparisons
What’s not correct here?
Adding all the comparisons to the output, not the actual values
The count is not what we want
Although it is correct for how many values we’ve written
What’s cheating?
The _mm_cmpge_epi32 intrinsic doesn’t exist
Just no room on the slide to use a less-than and a not
unsigned int filter(int* result, const int* v,
unsigned int len, int limit) {
__m128i limits = _mm_set1_epi32(limit);
unsigned int index = 0;
for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128((__m128i*) &v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real _mm_store_si128((__m128i*) &result[index], ge); index += 4; } return index; } 7 Pseudo Packed Code – Incomplete What’s not correct here? Adding all the comparisons to the output, not the actual values The count is not what we want Although it is correct for how many values we’ve written input: 1 -1 5 3 -2 7 -1 3 9 -4 2 -4 -4 -4 -5 -6 -9 5 3 4 0 3 3 -1 filtered: -1 0 -1 -1 0 -1 0 -1 -1 0 -1 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 8 Example What’s correct here? Reading values from the v array Correctly calculating the comparisons Outputting filtered values What’s not correct here? Also outputting a bunch of zeros The count is not what we want Although it is correct for how many values we’ve written unsigned int filter(int* result, const int* v, unsigned int len, int limit) { __m128i limits = _mm_set1_epi32(limit); unsigned int index = 0; for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128(&v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real __m128i fs = select(ge, vs, _mm_setzero_si128()); _mm_store_si128(&result[index], fs); index += 4; } return index; } 9 Pseudo Packed Code – Incomplete What’s not correct here? Also outputting a bunch of zeros The count is not what we want Although it is correct for how many values we’ve written input: 1 -1 5 3 -2 7 -1 3 9 -4 2 -4 -4 -4 -5 -6 -9 5 3 4 0 3 3 -1 filtered: 1 0 5 3 0 7 0 3 9 0 2 0 0 0 0 0 0 5 3 4 0 3 3 0 10 Example Let’s introduce a function that packs all the values in a SIMD vector on the left This is effectively a small version of the original problem We’ll come back to how to improve this scalar solution inline __m128i left_pack(__m128i cmp, __m128i vals) { __m128i pack = _mm_setzero_si128(); int index = 0; for (int i = 0; i < 4; i++) { if (cmp.m128i_u32[i] != 0) { pack.m128i_i32[index] = vals.m128i_i32[i]; index++; } } return pack; } 11 Left-Packing a Group of Values What’s correct here? Reading values from the v array Correctly calculating the comparisons Outputting filtered values What’s not correct here? Also outputting a bunch of zeros But they are always on the right-hand-side of each SIMD vector now The count is not what we want Although it is correct for how many values we’ve written What’s cheating? Introduced a pseudo-function left_pack Produces a correctly left-aligned set of values for a single SIMD vector unsigned int filter(int* result, const int* v, unsigned int len, int limit) { __m128i limits = _mm_set1_epi32(limit); unsigned int index = 0; for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128(&v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real __m128i fs = left_pack(ge, vs); _mm_store_si128(&result[index], fs); index += 4; } return index; } 12 Pseudo Packed Code – Incomplete What’s not correct here? Also outputting a bunch of zeros But they are always on the right-hand-side of each SIMD vector now The count is not what we want Although it is correct for how many values we’ve written input: 1 -1 5 3 -2 7 -1 3 9 -4 2 -4 -4 -4 -5 -6 -9 5 3 4 0 3 3 -1 filtered: 1 5 3 0 7 3 0 0 9 2 0 0 0 0 0 0 5 3 4 0 0 3 3 0 13 Example We’ve been really strict on the idea of making sure all SIMD memory access are correctly aligned But when needed, we can explicitly decide to read/write from memory without correct alignment For example for 32-bit integers void _mm_storeu_si32(void* mem_addr, __m128i a) Same as a normal store, but mem_addr doesn’t need to aligned There’s versions for all the other types, and also for loading 1. https://au.news.yahoo.com/amiens-sinkhole-france-pub-threatened-111047095.html 14 Unaligned Store What’s correct here? Reading values from the v array Correctly calculating the comparisons Outputting filtered values The count is correct What’s cheating? Increasing the index the correct amount with a magic comment unsigned int filter(int* result, const int* v, unsigned int len, int limit) { __m128i limits = _mm_set1_epi32(limit); unsigned int index = 0; for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128(&v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real __m128i fs = left_pack(ge, vs); _mm_storeu_si128((__m128i*) &result[index], fs); index += /* number of values to keep in fs */; } return index; } 15 Pseudo Packed Code – Incomplete We’ve already used a function that will help us a bit When doing the “all values have escaped” part of the Mandelbrot SIMD translation Movemask int _mm_movemask_ps (__m128 a) This enables us to reduce our comparison SIMD vector (ge) to a single 4-bit value Basically have a single bit for each SIMD vector value 0xFFFFFFFF becomes 1 0x0000000 becomes 0 (technically it only looks at the most significant bit of each value) 16 Movemask How can we count the number of bits set in a number? int bitsSetLookup[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; int mask = _mm_movemask(something); int bitsSet = bitsSetLookup(mask); 17 Bit Counting I don’t know what this instruction is named after  Population Count? Simply returns the number of bits that are set in a 32-bit integer int _mm_popcnt_u32(unsigned int a) 18 popcnt What’s correct here? Reading values from the v array Correctly calculating the comparisons Outputting filtered values The count is correct The index is correctly moved along What’s cheating? There is no 32-bit integer version of movemask Need to use the float version and a cast unsigned int filter(int* result, const int* v, unsigned int len, int limit) { __m128i limits = _mm_set1_epi32(limit); unsigned int index = 0; for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128(&v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real int mask = _mm_movemask_ps(_mm_castsi128_ps(ge)); __m128i fs = left_pack(ge, vs); _mm_storeu_si128(&result[index], fs); index += _mm_popcnt_u32(mask); } return index; } 19 Pseudo Packed Code – Incomplete What’s correct here? Reading values from the v array Correctly calculating the comparisons Outputting filtered values The count is correct The index is correctly moved along unsigned int filter(int* result, const int* v, unsigned int len, int limit) { __m128i limits = _mm_set1_epi32(limit); unsigned int index = 0; for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128(&v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real __m128i fs = left_pack(ge, vs); _mm_storeu_si128(&result[index], fs); __m128 mask = _mm_movemask_ps(_mm_castsi128_ps(ge)); index += _mm_popcnt_u32(mask); } return index; } 20 Pseudo Packed Code – Incomplete What’s not correct here? Also outputting a bunch of zeros But they are always on the right-hand-side of each SIMD vector now The count is not what we want Although it is correct for how many values we’ve written input: 1 -1 5 3 -2 7 -1 3 9 -4 2 -4 -4 -4 -5 -6 -9 5 3 4 0 3 3 -1 filtered: 1 5 3 7 3 9 2 5 3 4 0 3 3 21 Example Shuffling Values SIMD Challenges 2 22 The last remaining part of the code to translate to SIMD is left packing with a single SIMD vector inline __m128i left_pack(__m128i cmp, __m128i vals) { __m128i pack = _mm_setzero_si128(); int index = 0; for (int i = 0; i < 4; i++) { if (cmp.m128i_u32[i] != 0) { pack.m128i_i32[index] = vals.m128i_i32[i]; index++; } } return pack; } 23 Left-Packing a Group of Values There’s a number of instructions that allow the moving of values around across SIMD lanes While there is one that works on 32-bit integers, it’s not quite what we need We’ll use __m128i _mm_shuffle_epi8(__m128i a, __m128i b) This instruction allows us to move all the bytes of a around according to the indexes in b Bytes in b specify the index of bytes from a Unless 0x80 is specified, then a zero byte is placed in a For example (values for b): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 Change nothing 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 Reverse all the bytes 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 Reverse the order of all the 32-bit values stored in the SIMD vector 0x80, 0x80, 0x80, 0x80, 0, 1, 2, 3, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80 Move the outer two values of the source to the middle and make the outer values in the destination equal to 0 24 Shuffling How does this help? 25 Shuffling and Left-Packing We have the means to create a left-packed version of SIMD vector now But we need a way of working out what “case” we’ve got What would we use to make this decision? 26 Shuffling and Left-Packing Using our comparison value (ge) allows us to know what shuffle to do What’s the best way to use it? A switch statement doesn’t work switch (test) { case { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }: // use a shuffle to keep them all break; ... case { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }: // use a shuffle to keep none of them break; } 27 Shuffling and Left-Packing Using our comparison value (ge) allows us to know what shuffle to do Best to turn it into a single value with movemask What’s the best way to use it? A switch statement could work Best approach is probably a lookup array switch (_mm_movemask_ps(test)) { case 15: // use a shuffle to keep them all break; ... case 0: // use a shuffle to keep none of them break; } 28 Shuffling and Left-Packing Done! unsigned int filter(int* result, const int* v, unsigned int len, int limit) { __m128i limits = _mm_set1_epi32(limit); unsigned int index = 0; for (unsigned int i = 0; i < len; i += 4) { __m128i vs = _mm_load_si128(&v[i]); __m128i ge = _mm_cmpge_epi32(vs, limits); // ge not real __m128 mask = _mm_movemask_ps(_mm_castsi128_ps(ge)); __m128i fs = _mm_shuffle_epi8(vs, shuffles[mask]); _mm_storeu_si128(&result[index], fs); index += _mm_popcnt_u32(mask); } return index; } 29 Shuffle Lookup Array __m128i shuffles[16] = { _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0000 _mm_setr_epi8(0, 1, 2, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0001 _mm_setr_epi8(4, 5, 6, 7, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0010 _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0011 _mm_setr_epi8(8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0100 _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0101 _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //0110 _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80), //0111 _mm_setr_epi8(12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //1000 _mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //1001 _mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //1010 _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80), //1011 _mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80), //1100 _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80), //1101 _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80), //1110 _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), //1111 }; 30 Lookup Table Done! input: 1 -1 5 3 -2 7 -1 3 9 -4 2 -4 -4 -4 -5 -6 -9 5 3 4 0 3 3 -1 filtered: 1 5 3 7 3 9 2 5 3 4 0 3 3 31 Example This example is based on part of a talk given by Andreas Fredriksson from Insomniac Games “SIMD at Insomniac Games: How we do the Shuffle” https://www.gdcvault.com/play/1022249/SIMD-at-Insomniac-Games-How 32 References