@@ -159,11 +159,16 @@ subroutine get_amat_0d(self, mol, amat)
159159 integer :: iat, jat, izp, jzp
160160 real (wp) :: vec(3 ), r2, gam, tmp
161161
162+ ! Thread-private array for reduction
163+ real (wp), allocatable :: amat_local(:, :)
164+
162165 amat(:, :) = 0.0_wp
163166
164- ! $omp parallel do default(none) schedule(runtime) &
165- ! $omp reduction(+:amat) shared(mol, self) &
166- ! $omp private(iat, izp, jat, jzp, gam, vec, r2, tmp)
167+ ! $omp parallel default(none) &
168+ ! $omp shared(amat, mol, self) &
169+ ! $omp private(iat, izp, jat, jzp, gam, vec, r2, tmp, amat_local)
170+ allocate (amat_local, source= amat)
171+ ! $omp do schedule(runtime)
167172 do iat = 1 , mol% nat
168173 izp = mol% id(iat)
169174 do jat = 1 , iat-1
@@ -172,12 +177,18 @@ subroutine get_amat_0d(self, mol, amat)
172177 r2 = vec(1 )** 2 + vec(2 )** 2 + vec(3 )** 2
173178 gam = 1.0_wp / (self% rad(izp)** 2 + self% rad(jzp)** 2 )
174179 tmp = erf(sqrt (r2* gam))/ sqrt (r2)
175- amat (jat, iat) = amat (jat, iat) + tmp
176- amat (iat, jat) = amat (iat, jat) + tmp
180+ amat_local (jat, iat) = amat_local (jat, iat) + tmp
181+ amat_local (iat, jat) = amat_local (iat, jat) + tmp
177182 end do
178183 tmp = self% eta(izp) + sqrt2pi / self% rad(izp)
179- amat (iat, iat) = amat (iat, iat) + tmp
184+ amat_local (iat, iat) = amat_local (iat, iat) + tmp
180185 end do
186+ ! $omp end do
187+ ! $omp critical (get_amat_0d_)
188+ amat(:, :) = amat(:, :) + amat_local(:, :)
189+ ! $omp end critical (get_amat_0d_)
190+ deallocate (amat_local)
191+ ! $omp end parallel
181192
182193 amat(mol% nat+1 , 1 :mol% nat+1 ) = 1.0_wp
183194 amat(1 :mol% nat+1 , mol% nat+1 ) = 1.0_wp
@@ -196,15 +207,20 @@ subroutine get_amat_3d(self, mol, wsc, alpha, amat)
196207 real (wp) :: vec(3 ), gam, wsw, dtmp, rtmp, vol
197208 real (wp), allocatable :: dtrans(:, :), rtrans(:, :)
198209
210+ ! Thread-private array for reduction
211+ real (wp), allocatable :: amat_local(:, :)
212+
199213 amat(:, :) = 0.0_wp
200214
201215 vol = abs (matdet_3x3(mol% lattice))
202216 call get_dir_trans(mol% lattice, dtrans)
203217 call get_rec_trans(mol% lattice, rtrans)
204218
205- ! $omp parallel do default(none) schedule(runtime) &
206- ! $omp reduction(+:amat) shared(mol, self, wsc, dtrans, rtrans, alpha, vol) &
207- ! $omp private(iat, izp, jat, jzp, gam, wsw, vec, dtmp, rtmp)
219+ ! $omp parallel default(none) &
220+ ! $omp shared(amat, mol, self, wsc, dtrans, rtrans, alpha, vol) &
221+ ! $omp private(iat, izp, jat, jzp, gam, wsw, vec, dtmp, rtmp, amat_local)
222+ allocate (amat_local, source= amat)
223+ ! $omp do schedule(runtime)
208224 do iat = 1 , mol% nat
209225 izp = mol% id(iat)
210226 do jat = 1 , iat-1
@@ -215,8 +231,8 @@ subroutine get_amat_3d(self, mol, wsc, alpha, amat)
215231 vec = mol% xyz(:, iat) - mol% xyz(:, jat) - wsc% trans(:, wsc% tridx(img, jat, iat))
216232 call get_amat_dir_3d(vec, gam, alpha, dtrans, dtmp)
217233 call get_amat_rec_3d(vec, vol, alpha, rtrans, rtmp)
218- amat (jat, iat) = amat (jat, iat) + (dtmp + rtmp) * wsw
219- amat (iat, jat) = amat (iat, jat) + (dtmp + rtmp) * wsw
234+ amat_local (jat, iat) = amat_local (jat, iat) + (dtmp + rtmp) * wsw
235+ amat_local (iat, jat) = amat_local (iat, jat) + (dtmp + rtmp) * wsw
220236 end do
221237 end do
222238
@@ -226,12 +242,18 @@ subroutine get_amat_3d(self, mol, wsc, alpha, amat)
226242 vec = wsc% trans(:, wsc% tridx(img, iat, iat))
227243 call get_amat_dir_3d(vec, gam, alpha, dtrans, dtmp)
228244 call get_amat_rec_3d(vec, vol, alpha, rtrans, rtmp)
229- amat (iat, iat) = amat (iat, iat) + (dtmp + rtmp) * wsw
245+ amat_local (iat, iat) = amat_local (iat, iat) + (dtmp + rtmp) * wsw
230246 end do
231247
232248 dtmp = self% eta(izp) + sqrt2pi / self% rad(izp) - 2 * alpha / sqrtpi
233- amat (iat, iat) = amat (iat, iat) + dtmp
249+ amat_local (iat, iat) = amat_local (iat, iat) + dtmp
234250 end do
251+ ! $omp end do
252+ ! $omp critical (get_amat_3d_)
253+ amat(:, :) = amat(:, :) + amat_local(:, :)
254+ ! $omp end critical (get_amat_3d_)
255+ deallocate (amat_local)
256+ ! $omp end parallel
235257
236258 amat(mol% nat+1 , 1 :mol% nat+1 ) = 1.0_wp
237259 amat(1 :mol% nat+1 , mol% nat+1 ) = 1.0_wp
@@ -295,13 +317,22 @@ subroutine get_damat_0d(self, mol, qvec, dadr, dadL, atrace)
295317 integer :: iat, jat, izp, jzp
296318 real (wp) :: vec(3 ), r2, gam, arg, dtmp, dG(3 ), dS(3 , 3 )
297319
320+ ! Thread-private arrays for reduction
321+ real (wp), allocatable :: atrace_local(:, :)
322+ real (wp), allocatable :: dadr_local(:, :, :), dadL_local(:, :, :)
323+
298324 atrace(:, :) = 0.0_wp
299325 dadr(:, :, :) = 0.0_wp
300326 dadL(:, :, :) = 0.0_wp
301327
302- ! $omp parallel do default(none) schedule(runtime) &
303- ! $omp reduction(+:atrace, dadr, dadL) shared(mol, self, qvec) &
304- ! $omp private(iat, izp, jat, jzp, gam, r2, vec, dG, dS, dtmp, arg)
328+ ! $omp parallel default(none) &
329+ ! $omp shared(atrace, dadr, dadL, mol, self, qvec) &
330+ ! $omp private(iat, izp, jat, jzp, gam, r2, vec, dG, dS, dtmp, arg) &
331+ ! $omp private(atrace_local, dadr_local, dadL_local)
332+ allocate (atrace_local, source= atrace)
333+ allocate (dadr_local, source= dadr)
334+ allocate (dadL_local, source= dadL)
335+ ! $omp do schedule(runtime)
305336 do iat = 1 , mol% nat
306337 izp = mol% id(iat)
307338 do jat = 1 , iat-1
@@ -313,14 +344,22 @@ subroutine get_damat_0d(self, mol, qvec, dadr, dadL, atrace)
313344 dtmp = 2.0_wp * gam* exp (- arg)/ (sqrtpi* r2)- erf(sqrt (arg))/ (r2* sqrt (r2))
314345 dG = dtmp* vec
315346 dS = spread (dG, 1 , 3 ) * spread (vec, 2 , 3 )
316- atrace (:, iat) = + dG* qvec(jat) + atrace (:, iat)
317- atrace (:, jat) = - dG* qvec(iat) + atrace (:, jat)
318- dadr (:, iat, jat) = + dG* qvec(iat)
319- dadr (:, jat, iat) = - dG* qvec(jat)
320- dadL (:, :, jat) = + dS* qvec(iat) + dadL (:, :, jat)
321- dadL (:, :, iat) = + dS* qvec(jat) + dadL (:, :, iat)
347+ atrace_local (:, iat) = + dG* qvec(jat) + atrace_local (:, iat)
348+ atrace_local (:, jat) = - dG* qvec(iat) + atrace_local (:, jat)
349+ dadr_local (:, iat, jat) = + dG* qvec(iat)
350+ dadr_local (:, jat, iat) = - dG* qvec(jat)
351+ dadL_local (:, :, jat) = + dS* qvec(iat) + dadL_local (:, :, jat)
352+ dadL_local (:, :, iat) = + dS* qvec(jat) + dadL_local (:, :, iat)
322353 end do
323354 end do
355+ ! $omp end do
356+ ! $omp critical (get_damat_0d_)
357+ atrace(:, :) = atrace(:, :) + atrace_local(:, :)
358+ dadr(:, :, :) = dadr(:, :, :) + dadr_local(:, :, :)
359+ dadL(:, :, :) = dadL(:, :, :) + dadL_local(:, :, :)
360+ ! $omp end critical (get_damat_0d_)
361+ deallocate (dadL_local, dadr_local, atrace_local)
362+ ! $omp end parallel
324363
325364end subroutine get_damat_0d
326365
@@ -339,6 +378,10 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
339378 real (wp) :: dGd(3 ), dSd(3 , 3 ), dGr(3 ), dSr(3 , 3 )
340379 real (wp), allocatable :: dtrans(:, :), rtrans(:, :)
341380
381+ ! Thread-private arrays for reduction
382+ real (wp), allocatable :: atrace_local(:, :)
383+ real (wp), allocatable :: dadr_local(:, :, :), dadL_local(:, :, :)
384+
342385 atrace(:, :) = 0.0_wp
343386 dadr(:, :, :) = 0.0_wp
344387 dadL(:, :, :) = 0.0_wp
@@ -347,11 +390,15 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
347390 call get_dir_trans(mol% lattice, dtrans)
348391 call get_rec_trans(mol% lattice, rtrans)
349392
350- ! $omp parallel do default(none) schedule(runtime) &
351- ! $omp reduction(+:atrace, dadr, dadL) &
393+ ! $omp parallel default(none) &
352394 ! $omp shared(mol, self, wsc, alpha, vol, dtrans, rtrans, qvec) &
353- ! $omp private(iat, izp, jat, jzp, img, gam, wsw, vec, dG, dS, &
354- ! $omp& dGr, dSr, dGd, dSd)
395+ ! $omp shared(atrace, dadr, dadL) &
396+ ! $omp private(iat, izp, jat, jzp, img, gam, wsw, vec, dG, dS) &
397+ ! $omp private(dGr, dSr, dGd, dSd, atrace_local, dadr_local, dadL_local)
398+ allocate (atrace_local, source= atrace)
399+ allocate (dadr_local, source= dadr)
400+ allocate (dadL_local, source= dadL)
401+ ! $omp do schedule(runtime)
355402 do iat = 1 , mol% nat
356403 izp = mol% id(iat)
357404 do jat = 1 , iat-1
@@ -367,12 +414,12 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
367414 dG = dG + (dGd + dGr) * wsw
368415 dS = dS + (dSd + dSr) * wsw
369416 end do
370- atrace (:, iat) = + dG* qvec(jat) + atrace (:, iat)
371- atrace (:, jat) = - dG* qvec(iat) + atrace (:, jat)
372- dadr (:, iat, jat) = + dG* qvec(iat) + dadr (:, iat, jat)
373- dadr (:, jat, iat) = - dG* qvec(jat) + dadr (:, jat, iat)
374- dadL (:, :, jat) = + dS* qvec(iat) + dadL (:, :, jat)
375- dadL (:, :, iat) = + dS* qvec(jat) + dadL (:, :, iat)
417+ atrace_local (:, iat) = + dG* qvec(jat) + atrace_local (:, iat)
418+ atrace_local (:, jat) = - dG* qvec(iat) + atrace_local (:, jat)
419+ dadr_local (:, iat, jat) = + dG* qvec(iat) + dadr_local (:, iat, jat)
420+ dadr_local (:, jat, iat) = - dG* qvec(jat) + dadr_local (:, jat, iat)
421+ dadL_local (:, :, jat) = + dS* qvec(iat) + dadL_local (:, :, jat)
422+ dadL_local (:, :, iat) = + dS* qvec(jat) + dadL_local (:, :, iat)
376423 end do
377424
378425 dS(:, :) = 0.0_wp
@@ -384,8 +431,16 @@ subroutine get_damat_3d(self, mol, wsc, alpha, qvec, dadr, dadL, atrace)
384431 call get_damat_rec_3d(vec, vol, alpha, rtrans, dGr, dSr)
385432 dS = dS + (dSd + dSr) * wsw
386433 end do
387- dadL (:, :, iat) = + dS* qvec(iat) + dadL (:, :, iat)
434+ dadL_local (:, :, iat) = + dS* qvec(iat) + dadL_local (:, :, iat)
388435 end do
436+ ! $omp end do
437+ ! $omp critical (get_damat_3d_)
438+ atrace(:, :) = atrace(:, :) + atrace_local(:, :)
439+ dadr(:, :, :) = dadr(:, :, :) + dadr_local(:, :, :)
440+ dadL(:, :, :) = dadL(:, :, :) + dadL_local(:, :, :)
441+ ! $omp end critical (get_damat_3d_)
442+ deallocate (dadL_local, dadr_local, atrace_local)
443+ ! $omp end parallel
389444
390445end subroutine get_damat_3d
391446
0 commit comments