SuperLU Distributed 8.2.1
Distributed memory sparse direct solver
zlustruct_gpu.h
Go to the documentation of this file.
1
15#pragma once // so that this header file is included only once
16
17#include "superlu_zdefs.h"
18
19#ifdef GPU_ACC // enable GPU
20#include "gpu_api_utils.h"
21// #include "mkl.h"
22// #include "sec_structs.h"
23// #include "supernodal_etree.h"
24
/* ---- Compile-time constants ---- */

/* Disabled settings, retained for reference: */
//#define SLU_TARGET_GPU 0
//#define MAX_BLOCK_SIZE 10000

/* Capacity of every fixed-size, per-stream array declared in this header. */
#define MAX_NGPU_STREAMS 32
29
30static
31void check(gpuError_t result, char const *const func, const char *const file, int const line)
32{
33 if (result)
34 {
35 fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n",
36 file, line, gpuGetErrorString(result), func);
37
38 // Make sure we call GPU Device Reset before exiting
39 exit(EXIT_FAILURE);
40 }
41}
42
43#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ )
44
45typedef struct //SCUbuf_gpu_
46{
47 /*Informations for various buffers*/
48 doublecomplex *bigV;
49 doublecomplex *bigU;
50 doublecomplex *bigU_host; /*pinned location*/
51 int_t *indirect; /*for indirect address calculations*/
52 int_t *indirect2; /*for indirect address calculations*/
53
54 doublecomplex *Remain_L_buff; /* on GPU */
55 doublecomplex *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */
56
57 int_t *lsub;
58 int_t *usub;
59
60 int_t *lsub_buf, *usub_buf;
61
62 Ublock_info_t *Ublock_info; /* on GPU */
63 Remain_info_t *Remain_info;
64 Ublock_info_t *Ublock_info_host;
65 Remain_info_t *Remain_info_host;
66
67 int_t* usub_IndirectJ3; /* on GPU */
68 int_t* usub_IndirectJ3_host;
69
70} zSCUbuf_gpu_t;
71
72/* Holds the L & U data structures on the GPU side */
73typedef struct //LUstruct_gpu_
74{
75 int_t *LrowindVec; /* A single vector */
76 int_t *LrowindPtr; /* A single vector */
77
78 doublecomplex *LnzvalVec; /* A single vector */
79 int_t *LnzvalPtr; /* A single vector */
80 int_t *LnzvalPtr_host; /* A single vector */
81
82 int_t *UrowindVec; /* A single vector */
83 int_t *UrowindPtr; /* A single vector */
84 int_t *UrowindPtr_host; /* A single vector */
85 int_t *UnzvalPtr_host;
86
87 doublecomplex *UnzvalVec; /* A single vector */
88 int_t *UnzvalPtr; /* A single vector */
89
90 /*gpu pointers for easy block accesses */
91 local_l_blk_info_t *local_l_blk_infoVec;
92 int_t *local_l_blk_infoPtr;
93 int_t *jib_lookupVec; /* NOT USED ? */
94 int_t *jib_lookupPtr; /* NOT USED ? */
95 local_u_blk_info_t *local_u_blk_infoVec;
96
97 int_t *local_u_blk_infoPtr;
98
99 // GPU buffers for performing Schur Complement Update on GPU
100 zSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS];
101 doublecomplex *acc_L_buff, *acc_U_buff;
102
103 /*Informations for various buffers*/
104 int_t buffer_size;
105 int_t nsupers; /*should have number of supernodes*/
106 int_t *xsup;
107 gridinfo_t *grid;
108
109#if 0 // Sherry: moved to 'SuperLUStat_t'
110 double ScatterMOPCounter;
111 double ScatterMOPTimer;
112 double GemmFLOPCounter;
113 double GemmFLOPTimer;
114
115 double cPCIeH2D;
116 double cPCIeD2H;
117 double tHost_PCIeH2D;
118 double tHost_PCIeD2H;
119
120 /*GPU events to measure DGEMM and SCATTER timing */
121 int *isOffloaded; /*stores if any iteration is offloaded or not*/
122 gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/
123 gpuEvent_t *ePCIeH2D;
124 gpuEvent_t *ePCIeD2H_Start;
125 gpuEvent_t *ePCIeD2H_End;
126#endif
127
128 int_t *xsup_host;
129 int_t* perm_c_supno;
130 int_t first_l_block_gpu, first_u_block_gpu;
131} zLUstruct_gpu_t;
132
133typedef struct //sluGPU_t_
134{
135 //int gpuId; // if there are multiple GPUs ( NOT USED )
136 zLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU
137 gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream;
138 gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS];
139 int lastOffloadStream[MAX_NGPU_STREAMS];
140 int nGPUStreams;
141 int* isNodeInMyGrid;
142 double acc_async_cost;
143} zsluGPU_t;
144
145
146#ifdef __cplusplus
147extern "C" {
148#endif
149
150extern int zsparseTreeFactor_ASYNC_GPU(
151 sForest_t *sforest,
152 commRequests_t **comReqss, // lists of communication requests,
153 // size = maxEtree level
154 zscuBufs_t *scuBufs, // contains buffers for schur complement update
155 packLUInfo_t *packLUInfo,
156 msgs_t **msgss, // size = num Look ahead
157 zLUValSubBuf_t **LUvsbs, // size = num Look ahead
158 zdiagFactBufs_t **dFBufs, // size = maxEtree level
159 factStat_t *factStat,
160 factNodelists_t *fNlists,
161 gEtreeInfo_t *gEtreeInfo, // global etree info
162 superlu_dist_options_t *options,
163 int_t *gIperm_c_supno,
164 int ldt,
165 zsluGPU_t *sluGPU,
166 d2Hreduce_t *d2Hred,
167 HyP_t *HyP,
168 zLUstruct_t *LUstruct, gridinfo3d_t *grid3d,
169 SuperLUStat_t *stat,
170 double thresh, SCT_t *SCT, int tag_ub,
171 int *info);
172
173int zinitD2Hreduce(
174 int next_k,
175 d2Hreduce_t* d2Hred,
176 int last_flag,
177 // int_t *perm_c_supno,
178 HyP_t* HyP,
179 zsluGPU_t *sluGPU,
180 gridinfo_t *grid,
181 zLUstruct_t *LUstruct, SCT_t* SCT
182);
183
184extern int zreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred,
185 zsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid,
186 zLUstruct_t *LUstruct);
187
188extern int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT);
189extern int zsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred,
190 zsluGPU_t *sluGPU, SuperLUStat_t *);
191extern int zsendSCUdataHost2GPU(
192 int_t streamId, int_t* lsub, int_t* usub, doublecomplex* bigU, int_t bigu_send_size,
193 int_t Remain_lbuf_send_size, zsluGPU_t *sluGPU, HyP_t* HyP
194);
195
196extern int zinitSluGPU3D_t(
197 zsluGPU_t *sluGPU,
198 zLUstruct_t *LUstruct,
199 gridinfo3d_t * grid3d,
200 int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt,
202);
203int zSchurCompUpdate_GPU(
204 int_t streamId,
205 int_t jj_cpu, int_t nub, int_t klst, int_t knsupc,
206 int_t Rnbrow, int_t RemainBlk,
207 int_t Remain_lbuf_send_size,
208 int_t bigu_send_size, int_t ldu,
209 int_t mcb,
210 int_t buffer_size, int_t lsub_len, int_t usub_len,
211 int_t ldt, int_t k0,
212 zsluGPU_t *sluGPU, gridinfo_t *grid,
214);
215
216
217extern void zCopyLUToGPU3D (int* isNodeInMyGrid, zLocalLU_t *A_host,
218 zsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n,
219 gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt,
221 );
222
223extern int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount,
224 int_t** treePerm, zLUValSubBuf_t*LUvsb,
225 zLUstruct_t* LUstruct, gridinfo3d_t* grid3d,
226 zsluGPU_t *sluGPU, d2Hreduce_t* d2Hred,
227 factStat_t *factStat, HyP_t* HyP, SCT_t* SCT,
229 );
230
231extern void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT);
232extern int zfree_LUstruct_gpu (zsluGPU_t *sluGPU, SuperLUStat_t *);
233
234//int freeSluGPU(zsluGPU_t *sluGPU);
235
236extern void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex *dA, int_t lda );
237
238#ifdef __cplusplus
239}
240#endif
241
242#endif // matching: enable GPU
int int_t
Definition: superlu_defs.h:114
double acc_async_cost
Definition: acc_aux.c:56
integer, parameter, public lsub
Definition: superlupara.f90:35
integer, parameter, public usub
Definition: superlupara.f90:35
Definition: superlu_defs.h:435
Definition: superlu_ddefs.h:329
Definition: superlu_defs.h:770
Definition: util_dist.h:172
Definition: util_dist.h:95
Definition: superlu_defs.h:760
Definition: superlu_defs.h:924
Definition: superlu_defs.h:852
Definition: dcomplex.h:30
Definition: superlu_defs.h:937
Definition: superlu_defs.h:839
Definition: superlu_defs.h:890
Definition: superlu_defs.h:398
Definition: superlu_defs.h:388
Definition: superlu_defs.h:815
Definition: superlu_defs.h:822
Definition: superlu_defs.h:947
Definition: superlu_ddefs.h:397
Definition: superlu_defs.h:901
Definition: superlu_defs.h:712
Definition: superlu_zdefs.h:357
Definition: superlu_zdefs.h:254
Definition: superlu_zdefs.h:97
Definition: superlu_zdefs.h:391
Definition: superlu_zdefs.h:385
Distributed SuperLU data types and function prototypes.