
OPTIMISATION EXAMPLES

Optimising C for vectorisation
Software prefetching for unstructured mesh applications: https://www.cl.cam.ac.uk/~tmj32/papers/docs/hadade18-ia3.pdf


Vectorisation

// (several general #include directives here; the header names were lost in extraction)
#if defined __AVX512F__
#include <immintrin.h>   // assumed: AVX-512 intrinsics header
#elif defined __AVX__
#include <immintrin.h>   // assumed: AVX intrinsics header
#elif defined __MIC__
#include <immintrin.h>   // assumed: Xeon Phi (KNC) intrinsics header
#endif
template <typename type>
inline __attribute__((always_inline))
void vgather(type *data, int nvar, int *index, double out[][VECLEN])
{
#ifdef ENABLE_PF
    prefetchd(data, index);
#endif
    if(nvar==7)
        gather1x7(data, index, out);
    else if(nvar==3)
        gather1x3(data, index, out);
    else if(nvar==9)
        gather1x9(data, index, out);
}

template <typename type>
inline __attribute__((always_inline))
void vgather(type *data, int ndim, int nvar, int *index, double out[][3][VECLEN])
{
    if(ndim==3)
    {
        if(nvar==7)
        {
#ifdef ENABLE_PF
            prefetchd(data, index, 0);
            prefetchd(data, index, 1);
            prefetchd(data, index, 2);
#endif
            gather3x7(data, index, out);
        }
    }
}
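For context, a minimal sketch of how these gathers are typically driven (the struct layout, the VECLEN value, and the loop are assumptions for illustration, not the paper's code): the mesh data sits in an array of structures, and vgather transposes VECLEN consecutive indexed records into structure-of-arrays form ready for vector arithmetic.

#define VECLEN 8   // assumed: 8 doubles per AVX-512 vector

struct node { alignas(64) double var[8]; };   // hypothetical AoS record (a,b,c,d,... per mesh entity)

void compute(node *data, int *index, int n)   // assumes n is a multiple of VECLEN
{
    alignas(64) double soa[3][VECLEN];
    for (int e = 0; e < n; e += VECLEN) {
        vgather(data, 3, &index[e], soa);     // transpose VECLEN records into SoA form
        // ... vector arithmetic on soa[0][*], soa[1][*], soa[2][*] ...
    }
}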

template <typename type>
inline __attribute__((always_inline))
void gather1x3(type *data, int *pos, double out[][VECLEN])
{
    // mask0xF, mask0xF0, mask0xCC and mask0x33 are __mmask8 constants defined elsewhere
    // a0,b0,c0,d0,a4,b4,c4,d4
    __m512d a0b0c0d0a4b4c4d4 =
        _mm512_mask_broadcast_f64x4(_mm512_mask_load_pd(_mm512_undefined_pd(), mask0xF, &data[pos[0]].var[0]),
                                    mask0xF0, _mm256_load_pd(&data[pos[4]].var[0]));
    // a1,b1,c1,d1,a5,b5,c5,d5
    __m512d a1b1c1d1a5b5c5d5 =
        _mm512_mask_broadcast_f64x4(_mm512_mask_load_pd(_mm512_undefined_pd(), mask0xF, &data[pos[1]].var[0]),
                                    mask0xF0, _mm256_load_pd(&data[pos[5]].var[0]));
    // a2,b2,c2,d2,a6,b6,c6,d6
    __m512d a2b2c2d2a6b6c6d6 =
        _mm512_mask_broadcast_f64x4(_mm512_mask_load_pd(_mm512_undefined_pd(), mask0xF, &data[pos[2]].var[0]),
                                    mask0xF0, _mm256_load_pd(&data[pos[6]].var[0]));
    // a3,b3,c3,d3,a7,b7,c7,d7
    __m512d a3b3c3d3a7b7c7d7 =
        _mm512_mask_broadcast_f64x4(_mm512_mask_load_pd(_mm512_undefined_pd(), mask0xF, &data[pos[3]].var[0]),
                                    mask0xF0, _mm256_load_pd(&data[pos[7]].var[0]));
    // interleave low and high parts
    // a0,a1,c0,c1,a4,a5,c4,c5
    __m512d a0a1c0c1a4a5c4c5 = _mm512_unpacklo_pd(a0b0c0d0a4b4c4d4, a1b1c1d1a5b5c5d5);
    // b0,b1,d0,d1,b4,b5,d4,d5
    __m512d b0b1d0d1b4b5d4d5 = _mm512_unpackhi_pd(a0b0c0d0a4b4c4d4, a1b1c1d1a5b5c5d5);
    // a2,a3,c2,c3,a6,a7,c6,c7
    __m512d a2a3c2c3a6a7c6c7 = _mm512_unpacklo_pd(a2b2c2d2a6b6c6d6, a3b3c3d3a7b7c7d7);
    // b2,b3,d2,d3,b6,b7,d6,d7
    __m512d b2b3d2d3b6b7d6d7 = _mm512_unpackhi_pd(a2b2c2d2a6b6c6d6, a3b3c3d3a7b7c7d7);
    // a0,a1,a2,a3,a4,a5,a6,a7
    __m512d a0a1a2a3a4a5a6a7 = _mm512_mask_permutex_pd(a0a1c0c1a4a5c4c5, mask0xCC, a2a3c2c3a6a7c6c7, 0x44);
    // b0,b1,b2,b3,b4,b5,b6,b7
    __m512d b0b1b2b3b4b5b6b7 = _mm512_mask_permutex_pd(b0b1d0d1b4b5d4d5, mask0xCC, b2b3d2d3b6b7d6d7, 0x44);
    // c0,c1,c2,c3,c4,c5,c6,c7
    __m512d c0c1c2c3c4c5c6c7 = _mm512_mask_permutex_pd(a2a3c2c3a6a7c6c7, mask0x33, a0a1c0c1a4a5c4c5, 0xEE);
    _mm512_store_pd(&out[0][0], a0a1a2a3a4a5a6a7);
    _mm512_store_pd(&out[1][0], b0b1b2b3b4b5b6b7);
    _mm512_store_pd(&out[2][0], c0c1c2c3c4c5c6c7);
}

// AVX (256-bit) variant
template <typename type>
inline __attribute__((always_inline))
void gather1x3(type *data, int *pos, double out[][VECLEN])
{
    __m256d v[4];
    // a0,b0,c0,d0
    v[0] = _mm256_load_pd(&data[pos[0]].var[0]);
    // a1,b1,c1,d1
    v[1] = _mm256_load_pd(&data[pos[1]].var[0]);
    // a2,b2,c2,d2
    v[2] = _mm256_load_pd(&data[pos[2]].var[0]);
    // a3,b3,c3,d3
    v[3] = _mm256_load_pd(&data[pos[3]].var[0]);
    // 64-bit wide permutations
    // a0,a1,c0,c1
    __m256d shufl0 = _mm256_shuffle_pd(v[0], v[1], 0x0);
    // b0,b1,d0,d1
    __m256d shufl1 = _mm256_shuffle_pd(v[0], v[1], 0xF);
    // a2,a3,c2,c3
    __m256d shufl2 = _mm256_shuffle_pd(v[2], v[3], 0x0);
    // b2,b3,d2,d3
    __m256d shufl3 = _mm256_shuffle_pd(v[2], v[3], 0xF);
    // 128-bit wide permutations and store
    // a0,a1,a2,a3
    _mm256_store_pd(&out[0][0], _mm256_permute2f128_pd(shufl0, shufl2, 0x20));
    // b0,b1,b2,b3
    _mm256_store_pd(&out[1][0], _mm256_permute2f128_pd(shufl1, shufl3, 0x20));
    // c0,c1,c2,c3
    _mm256_store_pd(&out[2][0], _mm256_permute2f128_pd(shufl0, shufl2, 0x31));
    // d0,d1,d2,d3 not required
}

Prefetching
inline __attribute__((always_inline))
void prefetchi(int *pos1, int *pos2)
{
#ifdef ENABLE_PF
#if defined L2_INDEX_PF
    // prefetch the index arrays into L2, L2_INDEX_PF entries ahead
    _mm_prefetch((char *) &(pos1[L2_INDEX_PF]), _MM_HINT_T1);
    _mm_prefetch((char *) &(pos2[L2_INDEX_PF]), _MM_HINT_T1);
#endif
#if defined L1_INDEX_PF
    // prefetch the index arrays into L1, L1_INDEX_PF entries ahead
    _mm_prefetch((char *) &(pos1[L1_INDEX_PF]), _MM_HINT_T0);
    _mm_prefetch((char *) &(pos2[L1_INDEX_PF]), _MM_HINT_T0);
#endif
#endif
}
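The data-side prefetch prefetchd, called from vgather above, is not reproduced on these slides. A minimal sketch of what such a routine could look like (the L1_DATA_PF distance macro and the loop structure are assumptions, not the paper's code):

template <typename type>
inline __attribute__((always_inline))
void prefetchd(type *data, int *index)
{
#ifdef ENABLE_PF
#if defined L1_DATA_PF
    // touch the records that will be gathered L1_DATA_PF vectors ahead
    for (int i = 0; i < VECLEN; ++i)
        _mm_prefetch((char *) &data[index[L1_DATA_PF*VECLEN + i]], _MM_HINT_T0);
#endif
#endif
}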

Performance

Memory optimisations
• Discontinuous Galerkin finite-element assembly routines in Fluidity are considerably slower than the Continuous Galerkin ones
• overheads grow as the core count increases

Performance
Function name (% runtime with / without spin-up; values appeared in the original table):
solve_momentum
correct_pressure
construct_momentum_dg
construct_momentum_element_dg
advance_velocity
construct_momentum_interface_dg
subcycle_momentum_dg
assemble_cmc_dg
local_assembly_cdg_face
calculate_courant_number

Performance

Optimisation
• Fluidity supports:
  • 1 to 3 dimensions
  • four different viscosity schemes (Bassi-Rebay, Interior Penalty, Arbitrary Upwind, and Compact Discontinuous Galerkin)
  • along with many other options that affect element assembly
• These options are parsed at run-time for each element
• Performance penalties:
  • many hundreds of thousands of small allocations/deallocations per iteration, per core, for a reasonably sized problem
• Yet the sizes of all of these small arrays do not change over the course of the simulation
• None of the element discretisation options change over the mesh for a given simulation
• These runtime evaluations also hamper automatic vectorisation (see the sketch below)
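To see why per-element run-time checks hurt, compare the two loops below. This is a schematic C++ illustration (Fluidity itself is Fortran); the function and variable names are made up for the example.

// Run-time variant: the trip count and scheme come from the options tree,
// so the branch stays inside the loop and the trip count is unknown.
void assemble_runtime(double *out, const double *in, int nloc, int scheme)
{
    for (int i = 0; i < nloc; ++i) {
        if (scheme == 0)            // e.g. Bassi-Rebay vs Interior Penalty
            out[i] = 2.0 * in[i];
        else
            out[i] = 3.0 * in[i];
    }
}

// Compile-time variant: with the configuration baked in as a constant,
// the branch disappears and the short loop can be unrolled and vectorised.
#define opNloc 4                    // as on the later "Compile-time optimisation" slide
void assemble_compiletime(double *out, const double *in)
{
    for (int i = 0; i < opNloc; ++i)
        out[i] = 2.0 * in[i];
}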

Optimisation
• Compile-time optimisation:
  • convert all small dynamic array allocations to static allocations
  • add compile-time length definitions to loops to allow compile-time vectorisation of the loops
  • inlining of calls to finite-element utility subroutines
  • inlining of element face assembly subroutines and all dependent array allocation
  • rearrangement of decision logic to minimise expensive recalculation of array values
• Addition of logic to only call the optimised code:
  • if the run-time element configuration for dimension, quadrature, etc. matches the compile-time configuration
  • otherwise, the non-optimised original code runs (see the dispatch sketch below)
• Requires the simulation input file at compile time
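The guard can be sketched as follows. This is schematic C++ (Fluidity's implementation is Fortran), and the routine names are hypothetical:

void construct_momentum_optimised();   // compile-time sized, inlined, vectorised path
void construct_momentum_generic();     // original run-time sized path

// Hypothetical dispatch: take the specialised path only when the run-time
// element configuration matches what the optimised code was compiled for.
void construct_momentum(int dim, int vel_degree)
{
    if (dim == opDim && vel_degree == opVelDeg)   // opDim/opVelDeg from the next slide
        construct_momentum_optimised();
    else
        construct_momentum_generic();
}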

Manual inlining
call construct_momentum_interface_dg_ELEMENT_CONFIG(ele, face, face_2, ni, &
     & big_m_tensor_addto, &
     & rhs_addto, Grad_U_mat_q, Div_U_mat_q, X, &
     & Rho, U, U_nl, U_mesh, P, q_mesh, &
     & surfacetension, &
     & velocity_bc, velocity_bc_type, &
     & pressure_bc, pressure_bc_type, hb_pressure, &
     & subcycle_m_tensor_addto, nvfrac, &
     & ele2grad_mat=ele2grad_mat, kappa_mat=kappa_mat, &
     & inverse_mass_mat=inverse_mass_mat, &
     & viscosity=viscosity, viscosity_mat=viscosity_mat, &
     & tensor_eddy_visc=tensor_eddy_visc)

face_start = opNloc + (ni-1)*opFloc + 1
face_finish = face_start + opFloc - 1
! u_face_l = face_local_nodes(U, face)
u_face_l = U%mesh%faces%face_lno( opFloc*(face-1)+1 : opFloc*face )
if(move_mesh) u_mesh_glno = face_global_nodes(U_mesh, face)
u_face_glno_1 = face_global_nodes(U_nl, face)
u_face_glno_2 = face_global_nodes(U_nl, face_2)
x_face_glno_1 = face_global_nodes(X, face)
rho_face_glno_1 = face_global_nodes(Rho, face)
! face_u_shape_2 => face_shape(U, face_2)
q_face_l = q_mesh%faces%face_lno( opFloc*(face-1)+1 : opFloc*face )

Compile-time optimisation
#define opDim 3
#define opFaces 4
#define opVelDeg 1
#define opPresDeg 2
#define opNloc 4
#define opNgi 11
#define opFloc 3
#define opFngi 6
#define opPFloc 6
#define opEFloc (opNloc + opFaces*opFloc)
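With these constants fixed, assembly loops get compile-time trip counts that the compiler can unroll and vectorise. A schematic example using the defines above (a hypothetical element-mass loop, not Fluidity source):

// Both trip counts are compile-time constants, so the compiler can
// fully unroll and vectorise the accumulation.
void element_mass(double mass[opNloc],
                  const double shape[opNloc][opNgi],
                  const double detwei[opNgi])
{
    for (int i = 0; i < opNloc; ++i) {
        mass[i] = 0.0;
        for (int gi = 0; gi < opNgi; ++gi)
            mass[i] += shape[i][gi] * detwei[gi];   // shape function x quadrature weight
    }
}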

Compile-time optimisation

vel_ele_degree=`grep -A 40 geometry $opt_flml_file | grep -A 20 VelocityMesh | grep -A 10 polynomial_degree | grep integer_value | sed 's/.*<integer_value[^>]*>\(.*\)<\/integer_value>/\1/'`
pres_ele_degree=`grep -A 40 geometry $opt_flml_file | grep -A 20 PressureMesh | grep -A 10 polynomial_degree | grep integer_value | sed 's/.*<integer_value[^>]*>\(.*\)<\/integer_value>/\1/'`
# Viscosity schemes
case "$viscosity_scheme" in
  "compact_discontinuous_galerkin" ) opt_visc_scheme="SCHEME_CDG" ;;
  "bassi_rebay" )                    opt_visc_scheme="SCHEME_BASSI" ;;
  "interior_penalty" )               opt_visc_scheme="SCHEME_IP" ;;
  * ) >&2 echo "Error: scheme $viscosity_scheme not recognised or supported."
      exit 1 ;;
esac

# We're assuming the velocity and pressure elements are geometrically
# based upon tetrahedra. Currently only first and second order elements
# make any sense. This really needs to be revisited, as the numbers are
# wholly dependent on element type and representation.
case "$opt_dimension" in
  1) opt_nfaces=1
     ((opt_nloc=2+vel_ele_degree-1))
     opt_floc=1
     opt_p_floc=2
     ;;
  2) opt_nfaces=3
     ((opt_nloc=3*vel_ele_degree))
     ((opt_floc=vel_ele_degree+1))
     ((opt_p_floc=pres_ele_degree+1))
     ;;
  3) opt_nfaces=4
     ((opt_nloc=4+(vel_ele_degree-1)*6))
     ((opt_floc=3*vel_ele_degree))
     ((opt_p_floc=3*pres_ele_degree))
     ;;
esac

Optimised performance

Data layouts
• https://www.archer.ac.uk/community/eCSE/eCSE03-01/eCSE03-01-TechReport.pdf
• Moving from linked lists to arrays for optimised performance
• A linked list is good for memory structure and for organising particles, but bad for vectorisation and other CPU optimisations
• Need to consider both the calculations and the housekeeping operations, e.g. sorting particles

• Array of Structures
• Structure of Arrays
• Array of Structures of Arrays (all three sketched below)
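A minimal illustration of the three layouts for a particle code (the field names, TILE width, and types are assumptions for the example):

// Array of Structures: one record per particle; fields interleave in memory,
// so loading N consecutive x values needs strided or gathered accesses.
struct ParticleAoS { double x, y, z, m; };

// Structure of Arrays: one contiguous array per field; unit-stride vector loads.
struct ParticlesSoA {
    double *x;   // length N
    double *y;   // length N
    double *z;   // length N
    double *m;   // length N
};

// Array of Structures of Arrays: fixed-width SoA tiles, one per vector block;
// vector-friendly like SoA, but particles stay grouped for sorting/housekeeping.
#define TILE 8   // e.g. one 512-bit vector of doubles
struct ParticleTile { double x[TILE], y[TILE], z[TILE], m[TILE]; };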

Compiler impact

Vectorisation

Load balancing

Conclusions
• Thinking about data layout and hardware configuration is key to improving performance
• Optimisation is iterative and may require revisiting earlier changes to gain further performance
• Optimisations and parallelisation interact
• Profiling and consistently measuring performance are key
• Look at best practice for reporting and understanding performance, e.g. http://htor.inf.ethz.ch/publications/img/hoefler-scientific-benchmarking.pdf
