Skip to content

Commit 307a942

Browse files
authored
Address OMP issue for large systems (#45)
* Remove OMP reduction for multidimensional arrays. Signed-off-by: thfroitzheim <[email protected]> * Change to private allocation * Update notation --------- Signed-off-by: thfroitzheim <[email protected]>
1 parent 51c98fe commit 307a942

File tree

1 file changed

+88
-33
lines changed

1 file changed

+88
-33
lines changed

src/multicharge/model.F90

Lines changed: 88 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,16 @@ subroutine get_amat_0d(self, mol, amat)
159159
integer :: iat, jat, izp, jzp
160160
real(wp) :: vec(3), r2, gam, tmp
161161

162+
! Thread-private array for reduction
163+
real(wp), allocatable :: amat_local(:, :)
164+
162165
amat(:, :) = 0.0_wp
163166

164-
!$omp parallel do default(none) schedule(runtime) &
165-
!$omp reduction(+:amat) shared(mol, self) &
166-
!$omp private(iat, izp, jat, jzp, gam, vec, r2, tmp)
167+
!$omp parallel default(none) &
168+
!$omp shared(amat, mol, self) &
169+
!$omp private(iat, izp, jat, jzp, gam, vec, r2, tmp, amat_local)
170+
allocate(amat_local, source=amat)
171+
!$omp do schedule(runtime)
167172
do iat = 1, mol%nat
168173
izp = mol%id(iat)
169174
do jat = 1, iat-1
@@ -172,12 +177,18 @@ subroutine get_amat_0d(self, mol, amat)
172177
r2 = vec(1)**2 + vec(2)**2 + vec(3)**2
173178
gam = 1.0_wp / (self%rad(izp)**2 + self%rad(jzp)**2)
174179
tmp = erf(sqrt(r2*gam))/sqrt(r2)
175-
amat(jat, iat) = amat(jat, iat) + tmp
176-
amat(iat, jat) = amat(iat, jat) + tmp
180+
amat_local(jat, iat) = amat_local(jat, iat) + tmp
181+
amat_local(iat, jat) = amat_local(iat, jat) + tmp
177182
end do
178183
tmp = self%eta(izp) + sqrt2pi / self%rad(izp)
179-
amat(iat, iat) = amat(iat, iat) + tmp
184+
amat_local(iat, iat) = amat_local(iat, iat) + tmp
180185
end do
186+
!$omp end do
187+
!$omp critical (get_amat_0d_)
188+
amat(:, :) = amat(:, :) + amat_local(:, :)
189+
!$omp end critical (get_amat_0d_)
190+
deallocate(amat_local)
191+
!$omp end parallel
181192

182193
amat(mol%nat+1, 1:mol%nat+1) = 1.0_wp
183194
amat(1:mol%nat+1, mol%nat+1) = 1.0_wp
@@ -196,15 +207,20 @@ subroutine get_amat_3d(self, mol, wsc, alpha, amat)
196207
real(wp) :: vec(3), gam, wsw, dtmp, rtmp, vol
197208
real(wp), allocatable :: dtrans(:, :), rtrans(:, :)
198209

210+
! Thread-private array for reduction
211+
real(wp), allocatable :: amat_local(:, :)
212+
199213
amat(:, :) = 0.0_wp
200214

201215
vol = abs(matdet_3x3(mol%lattice))
202216
call get_dir_trans(mol%lattice, dtrans)
203217
call get_rec_trans(mol%lattice, rtrans)
204218

205-
!$omp parallel do default(none) schedule(runtime) &
206-
!$omp reduction(+:amat) shared(mol, self, wsc, dtrans, rtrans, alpha, vol) &
207-
!$omp private(iat, izp, jat, jzp, gam, wsw, vec, dtmp, rtmp)
219+
!$omp parallel default(none) &
220+
!$omp shared(amat, mol, self, wsc, dtrans, rtrans, alpha, vol) &
221+
!$omp private(iat, izp, jat, jzp, gam, wsw, vec, dtmp, rtmp, amat_local)
222+
allocate(amat_local, source=amat)
223+
!$omp do schedule(runtime)
208224
do iat = 1, mol%nat
209225
izp = mol%id(iat)
210226
do jat = 1, iat-1
@@ -215,8 +231,8 @@ subroutine get_amat_3d(self, mol, wsc, alpha, amat)
215231
vec = mol%xyz(:, iat) - mol%xyz(:, jat) - wsc%trans(:, wsc%tridx(img, jat, iat))
216232
call get_amat_dir_3d(vec, gam, alpha, dtrans, dtmp)
217233
call get_amat_rec_3d(vec, vol, alpha, rtrans, rtmp)
218-
amat(jat, iat) = amat(jat, iat) + (dtmp + rtmp) * wsw
219-
amat(iat, jat) = amat(iat, jat) + (dtmp + rtmp) * wsw
234+
amat_local(jat, iat) = amat_local(jat, iat) + (dtmp + rtmp) * wsw
235+
amat_local(iat, jat) = amat_local(iat, jat) + (dtmp + rtmp) * wsw
220236
end do
221237
end do
222238

@@ -226,12 +242,18 @@ subroutine get_amat_3d(self, mol, wsc, alpha, amat)
226242
vec = wsc%trans(:, wsc%tridx(img, iat, iat))
227243
call get_amat_dir_3d(vec, gam, alpha, dtrans, dtmp)
228244
call get_amat_rec_3d(vec, vol, alpha, rtrans, rtmp)
229-
amat(iat, iat) = amat(iat, iat) + (dtmp + rtmp) * wsw
245+
amat_local(iat, iat) = amat_local(iat, iat) + (dtmp + rtmp) * wsw
230246
end do
231247

232248
dtmp = self%eta(izp) + sqrt2pi / self%rad(izp) - 2 * alpha / sqrtpi
233-
amat(iat, iat) = amat(iat, iat) + dtmp
249+
amat_local(iat, iat) = amat_local(iat, iat) + dtmp
234250
end do
251+
!$omp end do
252+
!$omp critical (get_amat_3d_)
253+
amat(:, :) = amat(:, :) + amat_local(:, :)
254+
!$omp end critical (get_amat_3d_)
255+
deallocate(amat_local)
256+
!$omp end parallel
235257

236258
amat(mol%nat+1, 1:mol%nat+1) = 1.0_wp
237259
amat(1:mol%nat+1, mol%nat+1) = 1.0_wp
@@ -295,13 +317,22 @@ subroutine get_damat_0d(self, mol, qvec, dadr, dadL, atrace)
295317
integer :: iat, jat, izp, jzp
296318
real(wp) :: vec(3), r2, gam, arg, dtmp, dG(3), dS(3, 3)
297319

320+
! Thread-private arrays for reduction
321+
real(wp), allocatable :: atrace_local(:, :)
322+
real(wp), allocatable :: dadr_local(:, :, :), dadL_local(:, :, :)
323+
298324
atrace(:, :) = 0.0_wp
299325
dadr(:, :, :) = 0.0_wp
300326
dadL(:, :, :) = 0.0_wp
301327

302-
!$omp parallel do default(none) schedule(runtime) &
303-
!$omp reduction(+:atrace, dadr, dadL) shared(mol, self, qvec) &
304-
!$omp private(iat, izp, jat, jzp, gam, r2, vec, dG, dS, dtmp, arg)
328+
!$omp parallel default(none) &
329+
!$omp shared(atrace, dadr, dadL, mol, self, qvec) &
330+
!$omp private(iat, izp, jat, jzp, gam, r2, vec, dG, dS, dtmp, arg) &
331+
!$omp private(atrace_local, dadr_local, dadL_local)
332+
allocate(atrace_local, source=atrace)
333+
allocate(dadr_local, source=dadr)
334+
allocate(dadL_local, source=dadL)
335+
!$omp do schedule(runtime)
305336
do iat = 1, mol%nat
306337
izp = mol%id(iat)
307338
do jat = 1, iat-1
@@ -313,14 +344,22 @@ subroutine get_damat_0d(self, mol, qvec, dadr, dadL, atrace)
313344
dtmp = 2.0_wp*gam*exp(-arg)/(sqrtpi*r2)-erf(sqrt(arg))/(r2*sqrt(r2))
314345
dG = dtmp*vec
315346
dS = spread(dG, 1, 3) * spread(vec, 2, 3)
316-
atrace(:, iat) = +dG*qvec(jat) + atrace(:, iat)
317-
atrace(:, jat) = -dG*qvec(iat) + atrace(:, jat)
318-
dadr(:, iat, jat) = +dG*qvec(iat)
319-
dadr(:, jat, iat) = -dG*qvec(jat)
320-
dadL(:, :, jat) = +dS*qvec(iat) + dadL(:, :, jat)
321-
dadL(:, :, iat) = +dS*qvec(jat) + dadL(:, :, iat)
347+
atrace_local(:, iat) = +dG*qvec(jat) + atrace_local(:, iat)
348+
atrace_local(:, jat) = -dG*qvec(iat) + atrace_local(:, jat)
349+
dadr_local(:, iat, jat) = +dG*qvec(iat)
350+
dadr_local(:, jat, iat) = -dG*qvec(jat)
351+
dadL_local(:, :, jat) = +dS*qvec(iat) + dadL_local(:, :, jat)
352+
dadL_local(:, :, iat) = +dS*qvec(jat) + dadL_local(:, :, iat)
322353
end do
323354
end do
355+
!$omp end do
356+
!$omp critical (get_damat_0d_)
357+
atrace(:, :) = atrace(:, :) + atrace_local(:, :)
358+
dadr(:, :, :) = dadr(:, :, :) + dadr_local(:, :, :)
359+
dadL(:, :, :) = dadL(:, :, :) + dadL_local(:, :, :)
360+
!$omp end critical (get_damat_0d_)
361+
deallocate(dadL_local, dadr_local, atrace_local)
362+
!$omp end parallel
324363

325364
end subroutine get_damat_0d
326365

@@ -339,6 +378,10 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
339378
real(wp) :: dGd(3), dSd(3, 3), dGr(3), dSr(3, 3)
340379
real(wp), allocatable :: dtrans(:, :), rtrans(:, :)
341380

381+
! Thread-private arrays for reduction
382+
real(wp), allocatable :: atrace_local(:, :)
383+
real(wp), allocatable :: dadr_local(:, :, :), dadL_local(:, :, :)
384+
342385
atrace(:, :) = 0.0_wp
343386
dadr(:, :, :) = 0.0_wp
344387
dadL(:, :, :) = 0.0_wp
@@ -347,11 +390,15 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
347390
call get_dir_trans(mol%lattice, dtrans)
348391
call get_rec_trans(mol%lattice, rtrans)
349392

350-
!$omp parallel do default(none) schedule(runtime) &
351-
!$omp reduction(+:atrace, dadr, dadL) &
393+
!$omp parallel default(none) &
352394
!$omp shared(mol, self, wsc, alpha, vol, dtrans, rtrans, qvec) &
353-
!$omp private(iat, izp, jat, jzp, img, gam, wsw, vec, dG, dS, &
354-
!$omp& dGr, dSr, dGd, dSd)
395+
!$omp shared(atrace, dadr, dadL) &
396+
!$omp private(iat, izp, jat, jzp, img, gam, wsw, vec, dG, dS) &
397+
!$omp private(dGr, dSr, dGd, dSd, atrace_local, dadr_local, dadL_local)
398+
allocate(atrace_local, source=atrace)
399+
allocate(dadr_local, source=dadr)
400+
allocate(dadL_local, source=dadL)
401+
!$omp do schedule(runtime)
355402
do iat = 1, mol%nat
356403
izp = mol%id(iat)
357404
do jat = 1, iat-1
@@ -367,12 +414,12 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
367414
dG = dG + (dGd + dGr) * wsw
368415
dS = dS + (dSd + dSr) * wsw
369416
end do
370-
atrace(:, iat) = +dG*qvec(jat) + atrace(:, iat)
371-
atrace(:, jat) = -dG*qvec(iat) + atrace(:, jat)
372-
dadr(:, iat, jat) = +dG*qvec(iat) + dadr(:, iat, jat)
373-
dadr(:, jat, iat) = -dG*qvec(jat) + dadr(:, jat, iat)
374-
dadL(:, :, jat) = +dS*qvec(iat) + dadL(:, :, jat)
375-
dadL(:, :, iat) = +dS*qvec(jat) + dadL(:, :, iat)
417+
atrace_local(:, iat) = +dG*qvec(jat) + atrace_local(:, iat)
418+
atrace_local(:, jat) = -dG*qvec(iat) + atrace_local(:, jat)
419+
dadr_local(:, iat, jat) = +dG*qvec(iat) + dadr_local(:, iat, jat)
420+
dadr_local(:, jat, iat) = -dG*qvec(jat) + dadr_local(:, jat, iat)
421+
dadL_local(:, :, jat) = +dS*qvec(iat) + dadL_local(:, :, jat)
422+
dadL_local(:, :, iat) = +dS*qvec(jat) + dadL_local(:, :, iat)
376423
end do
377424

378425
dS(:, :) = 0.0_wp
@@ -384,8 +431,16 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
384431
call get_damat_rec_3d(vec, vol, alpha, rtrans, dGr, dSr)
385432
dS = dS + (dSd + dSr) * wsw
386433
end do
387-
dadL(:, :, iat) = +dS*qvec(iat) + dadL(:, :, iat)
434+
dadL_local(:, :, iat) = +dS*qvec(iat) + dadL_local(:, :, iat)
388435
end do
436+
!$omp end do
437+
!$omp critical (get_damat_3d_)
438+
atrace(:, :) = atrace(:, :) + atrace_local(:, :)
439+
dadr(:, :, :) = dadr(:, :, :) + dadr_local(:, :, :)
440+
dadL(:, :, :) = dadL(:, :, :) + dadL_local(:, :, :)
441+
!$omp end critical (get_damat_3d_)
442+
deallocate(dadL_local, dadr_local, atrace_local)
443+
!$omp end parallel
389444

390445
end subroutine get_damat_3d
391446

0 commit comments

Comments
 (0)