LLVM OpenMP* Runtime Library
Loading...
Searching...
No Matches
kmp_runtime.cpp
1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include "kmp_utils.h"
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#ifndef KMP_USE_SHM
52// Windows and WASI do not need these include files as they don't use shared
53// memory.
54#else
55#include <sys/mman.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#define SHM_SIZE 1024
59#endif
60
// Version strings: each is KMP_VERSION_PREFIX followed by a human-readable
// description. (Presumably meant to be discoverable in the built binary --
// confirm against where KMP_VERSION_PREFIX is defined.)

// Only defined when KMP_GOMP_COMPAT is defined.
61#if defined(KMP_GOMP_COMPAT)
62char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64#endif /* defined(KMP_GOMP_COMPAT) */
65
// Always defined.
66char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
// Only defined in KMP_DEBUG builds.
69#ifdef KMP_DEBUG
70char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72#endif /* KMP_DEBUG */
73
// Smaller of x and y. This is a macro: the selected argument is evaluated
// twice, so do not pass expressions with side effects.
74#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76/* ------------------------------------------------------------------------ */
77
// Thread descriptor object that exists only when KMP_USE_MONITOR is nonzero.
78#if KMP_USE_MONITOR
79kmp_info_t __kmp_monitor;
80#endif
81
82/* Forward declarations */
// Functions declared `static` below have internal linkage (this translation
// unit only); the non-static ones are visible to other translation units.
83
84void __kmp_cleanup(void);
85
86static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
// Declared only when affinity is supported; update_master_only defaults to 0.
91#if KMP_AFFINITY_SUPPORTED
92static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94#endif
95static void __kmp_do_serial_initialize(void);
96void __kmp_fork_barrier(int gtid, int tid);
97void __kmp_join_barrier(int gtid);
98void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
// Declared only when USE_LOAD_BALANCE is defined.
101#ifdef USE_LOAD_BALANCE
102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103#endif
104
105static int __kmp_expand_threads(int nNeed);
// Declared only on Windows.
106#if KMP_OS_WINDOWS
107static int __kmp_unregister_root_other_thread(int gtid);
108#endif
109static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
// Starts out NULL. NOTE(review): presumably the insertion point into the
// thread pool list -- confirm against the code that maintains the pool.
110kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
116/* Calculate the identifier of the current thread */
117/* fast (and somewhat portable) way to get unique identifier of executing
118 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
119int __kmp_get_global_thread_id() {
120 int i;
121 kmp_info_t **other_threads;
122 size_t stack_data;
123 char *stack_addr;
124 size_t stack_size;
125 char *stack_base;
126
127 KA_TRACE(
128 1000,
129 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
130 __kmp_nth, __kmp_all_nth));
131
132 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
133 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
134 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135 __kmp_init_gtid for this to work. */
136
137 if (!TCR_4(__kmp_init_gtid))
138 return KMP_GTID_DNE;
139
140#ifdef KMP_TDATA_GTID
141 if (TCR_4(__kmp_gtid_mode) >= 3) {
142 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
143 return __kmp_gtid;
144 }
145#endif
146 if (TCR_4(__kmp_gtid_mode) >= 2) {
147 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
148 return __kmp_gtid_get_specific();
149 }
150 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
151
152 stack_addr = (char *)&stack_data;
153 other_threads = __kmp_threads;
154
155 /* ATT: The code below is a source of potential bugs due to unsynchronized
156 access to __kmp_threads array. For example:
157 1. Current thread loads other_threads[i] to thr and checks it, it is
158 non-NULL.
159 2. Current thread is suspended by OS.
160 3. Another thread unregisters and finishes (debug versions of free()
161 may fill memory with something like 0xEF).
162 4. Current thread is resumed.
163 5. Current thread reads junk from *thr.
164 TODO: Fix it. --ln */
165
166 for (i = 0; i < __kmp_threads_capacity; i++) {
167
168 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
169 if (!thr)
170 continue;
171
172 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174
175 /* stack grows down -- search through all of the active threads */
176
177 if (stack_addr <= stack_base) {
178 size_t stack_diff = stack_base - stack_addr;
179
180 if (stack_diff <= stack_size) {
181 /* The only way we can be closer than the allocated */
182 /* stack size is if we are running on this thread. */
183 // __kmp_gtid_get_specific can return negative value because this
184 // function can be called by thread destructor. However, before the
185 // thread destructor is called, the value of the corresponding
186 // thread-specific data will be reset to NULL.
187 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
188 __kmp_gtid_get_specific() == i);
189 return i;
190 }
191 }
192 }
193
194 /* get specific to try and determine our gtid */
195 KA_TRACE(1000,
196 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
197 "thread, using TLS\n"));
198 i = __kmp_gtid_get_specific();
199
200 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
201
202 /* if we havn't been assigned a gtid, then return code */
203 if (i < 0)
204 return i;
205
206 // other_threads[i] can be nullptr at this point because the corresponding
207 // thread could have already been destructed. It can happen when this function
208 // is called in end library routine.
209 if (!TCR_SYNC_PTR(other_threads[i]))
210 return i;
211
212 /* dynamically updated stack window for uber threads to avoid get_specific
213 call */
214 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
215 KMP_FATAL(StackOverflow, i);
216 }
217
218 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 if (stack_addr > stack_base) {
220 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
221 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
222 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
223 stack_base);
224 } else {
225 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
226 stack_base - stack_addr);
227 }
228
229 /* Reprint stack bounds for ubermaster since they have been refined */
230 if (__kmp_storage_map) {
231 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
232 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
233 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
234 other_threads[i]->th.th_info.ds.ds_stacksize,
235 "th_%d stack (refinement)", i);
236 }
237 return i;
238}
239
240int __kmp_get_global_thread_id_reg() {
241 int gtid;
242
243 if (!__kmp_init_serial) {
244 gtid = KMP_GTID_DNE;
245 } else
246#ifdef KMP_TDATA_GTID
247 if (TCR_4(__kmp_gtid_mode) >= 3) {
248 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
249 gtid = __kmp_gtid;
250 } else
251#endif
252 if (TCR_4(__kmp_gtid_mode) >= 2) {
253 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
254 gtid = __kmp_gtid_get_specific();
255 } else {
256 KA_TRACE(1000,
257 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
258 gtid = __kmp_get_global_thread_id();
259 }
260
261 /* we must be a new uber master sibling thread */
262 if (gtid == KMP_GTID_DNE) {
263 KA_TRACE(10,
264 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
265 "Registering a new gtid.\n"));
266 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
267 if (!__kmp_init_serial) {
268 __kmp_do_serial_initialize();
269 gtid = __kmp_gtid_get_specific();
270 } else {
271 gtid = __kmp_register_root(FALSE);
272 }
273 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
274 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
275 }
276
277 KMP_DEBUG_ASSERT(gtid >= 0);
278
279 return gtid;
280}
281
282/* caller must hold forkjoin_lock */
283void __kmp_check_stack_overlap(kmp_info_t *th) {
284 int f;
285 char *stack_beg = NULL;
286 char *stack_end = NULL;
287 int gtid;
288
289 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
290 if (__kmp_storage_map) {
291 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
292 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
293
294 gtid = __kmp_gtid_from_thread(th);
295
296 if (gtid == KMP_GTID_MONITOR) {
297 __kmp_print_storage_map_gtid(
298 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
299 "th_%s stack (%s)", "mon",
300 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
301 } else {
302 __kmp_print_storage_map_gtid(
303 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
304 "th_%d stack (%s)", gtid,
305 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
306 }
307 }
308
309 /* No point in checking ubermaster threads since they use refinement and
310 * cannot overlap */
311 gtid = __kmp_gtid_from_thread(th);
312 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
313 KA_TRACE(10,
314 ("__kmp_check_stack_overlap: performing extensive checking\n"));
315 if (stack_beg == NULL) {
316 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
317 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
318 }
319
320 for (f = 0; f < __kmp_threads_capacity; f++) {
321 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
322
323 if (f_th && f_th != th) {
324 char *other_stack_end =
325 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
326 char *other_stack_beg =
327 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
328 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
329 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
330
331 /* Print the other stack values before the abort */
332 if (__kmp_storage_map)
333 __kmp_print_storage_map_gtid(
334 -1, other_stack_beg, other_stack_end,
335 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
336 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
337
338 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
339 __kmp_msg_null);
340 }
341 }
342 }
343 }
344 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
345}
346
347/* ------------------------------------------------------------------------ */
348
349void __kmp_infinite_loop(void) {
350 static int done = FALSE;
351
352 while (!done) {
353 KMP_YIELD(TRUE);
354 }
355}
356
357#define MAX_MESSAGE 512
358
359void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
360 char const *format, ...) {
361 char buffer[MAX_MESSAGE];
362 va_list ap;
363
364 va_start(ap, format);
365 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
366 p2, (unsigned long)size, format);
367 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
368 __kmp_vprintf(kmp_err, buffer, ap);
369#if KMP_PRINT_DATA_PLACEMENT
370 int node;
371 if (gtid >= 0) {
372 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
373 if (__kmp_storage_map_verbose) {
374 node = __kmp_get_host_node(p1);
375 if (node < 0) /* doesn't work, so don't try this next time */
376 __kmp_storage_map_verbose = FALSE;
377 else {
378 char *last;
379 int lastNode;
380 int localProc = __kmp_get_cpu_from_gtid(gtid);
381
382 const int page_size = KMP_GET_PAGE_SIZE();
383
384 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
385 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
386 if (localProc >= 0)
387 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
388 localProc >> 1);
389 else
390 __kmp_printf_no_lock(" GTID %d\n", gtid);
391#if KMP_USE_PRCTL
392 /* The more elaborate format is disabled for now because of the prctl
393 * hanging bug. */
394 do {
395 last = p1;
396 lastNode = node;
397 /* This loop collates adjacent pages with the same host node. */
398 do {
399 (char *)p1 += page_size;
400 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
401 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
402 lastNode);
403 } while (p1 <= p2);
404#else
405 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
406 (char *)p1 + (page_size - 1),
407 __kmp_get_host_node(p1));
408 if (p1 < p2) {
409 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
410 (char *)p2 + (page_size - 1),
411 __kmp_get_host_node(p2));
412 }
413#endif
414 }
415 }
416 } else
417 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
418 }
419#endif /* KMP_PRINT_DATA_PLACEMENT */
420 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
421
422 va_end(ap);
423}
424
425void __kmp_warn(char const *format, ...) {
426 char buffer[MAX_MESSAGE];
427 va_list ap;
428
429 if (__kmp_generate_warnings == kmp_warnings_off) {
430 return;
431 }
432
433 va_start(ap, format);
434
435 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
436 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
437 __kmp_vprintf(kmp_err, buffer, ap);
438 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
439
440 va_end(ap);
441}
442
443void __kmp_abort_process() {
444 // Later threads may stall here, but that's ok because abort() will kill them.
445 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
446
447 if (__kmp_debug_buf) {
448 __kmp_dump_debug_buffer();
449 }
450
451#if KMP_OS_WINDOWS
452 // Let other threads know of abnormal termination and prevent deadlock
453 // if abort happened during library initialization or shutdown
454 __kmp_global.g.g_abort = SIGABRT;
455
456 /* On Windows* OS by default abort() causes pop-up error box, which stalls
457 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
458 boxes. _set_abort_behavior() works well, but this function is not
459 available in VS7 (this is not problem for DLL, but it is a problem for
460 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
461 help, at least in some versions of MS C RTL.
462
463 It seems following sequence is the only way to simulate abort() and
464 avoid pop-up error box. */
465 raise(SIGABRT);
466 _exit(3); // Just in case, if signal ignored, exit anyway.
467#else
468 __kmp_unregister_library();
469 abort();
470#endif
471
472 __kmp_infinite_loop();
473 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
474
475} // __kmp_abort_process
476
477void __kmp_abort_thread(void) {
478 // TODO: Eliminate g_abort global variable and this function.
479 // In case of abort just call abort(), it will kill all the threads.
480 __kmp_infinite_loop();
481} // __kmp_abort_thread
482
483/* Print out the storage map for the major kmp_info_t thread data structures
484 that are allocated together. */
485
486static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
487 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
488 gtid);
489
490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
491 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
492
493 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
494 sizeof(kmp_local_t), "th_%d.th_local", gtid);
495
496 __kmp_print_storage_map_gtid(
497 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
498 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
499
500 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
501 &thr->th.th_bar[bs_plain_barrier + 1],
502 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
503 gtid);
504
505 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
506 &thr->th.th_bar[bs_forkjoin_barrier + 1],
507 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
508 gtid);
509
510#if KMP_FAST_REDUCTION_BARRIER
511 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
512 &thr->th.th_bar[bs_reduction_barrier + 1],
513 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
514 gtid);
515#endif // KMP_FAST_REDUCTION_BARRIER
516}
517
518/* Print out the storage map for the major kmp_team_t team data structures
519 that are allocated together. */
520
521static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
522 int team_id, int num_thr) {
523 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
524 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
525 header, team_id);
526
527 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
528 &team->t.t_bar[bs_last_barrier],
529 sizeof(kmp_balign_team_t) * bs_last_barrier,
530 "%s_%d.t_bar", header, team_id);
531
532 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
533 &team->t.t_bar[bs_plain_barrier + 1],
534 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
535 header, team_id);
536
537 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
538 &team->t.t_bar[bs_forkjoin_barrier + 1],
539 sizeof(kmp_balign_team_t),
540 "%s_%d.t_bar[forkjoin]", header, team_id);
541
542#if KMP_FAST_REDUCTION_BARRIER
543 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
544 &team->t.t_bar[bs_reduction_barrier + 1],
545 sizeof(kmp_balign_team_t),
546 "%s_%d.t_bar[reduction]", header, team_id);
547#endif // KMP_FAST_REDUCTION_BARRIER
548
549 __kmp_print_storage_map_gtid(
550 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
551 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
552
553 __kmp_print_storage_map_gtid(
554 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
555 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
556
557 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
558 &team->t.t_disp_buffer[num_disp_buff],
559 sizeof(dispatch_shared_info_t) * num_disp_buff,
560 "%s_%d.t_disp_buffer", header, team_id);
561}
562
563static void __kmp_init_allocator() {
564 __kmp_init_memkind();
565 __kmp_init_target_mem();
566}
567static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
568
569/* ------------------------------------------------------------------------ */
570
571#if ENABLE_LIBOMPTARGET
572static void __kmp_init_omptarget() {
573 __kmp_init_target_task();
574}
575#endif
576
577/* ------------------------------------------------------------------------ */
578
579#if KMP_DYNAMIC_LIB
580#if KMP_OS_WINDOWS
581
582BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
583 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
584
585 switch (fdwReason) {
586
587 case DLL_PROCESS_ATTACH:
588 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
589
590 return TRUE;
591
592 case DLL_PROCESS_DETACH:
593 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
594
595 // According to Windows* documentation for DllMain entry point:
596 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
597 // lpReserved == NULL when FreeLibrary() is called,
598 // lpReserved != NULL when the process is terminated.
599 // When FreeLibrary() is called, worker threads remain alive. So the
600 // runtime's state is consistent and executing proper shutdown is OK.
601 // When the process is terminated, worker threads have exited or been
602 // forcefully terminated by the OS and only the shutdown thread remains.
603 // This can leave the runtime in an inconsistent state.
604 // Hence, only attempt proper cleanup when FreeLibrary() is called.
605 // Otherwise, rely on OS to reclaim resources.
606 if (lpReserved == NULL)
607 __kmp_internal_end_library(__kmp_gtid_get_specific());
608
609 return TRUE;
610
611 case DLL_THREAD_ATTACH:
612 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
613
614 /* if we want to register new siblings all the time here call
615 * __kmp_get_gtid(); */
616 return TRUE;
617
618 case DLL_THREAD_DETACH:
619 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
620
621 __kmp_internal_end_thread(__kmp_gtid_get_specific());
622 return TRUE;
623 }
624
625 return TRUE;
626}
627
628#endif /* KMP_OS_WINDOWS */
629#endif /* KMP_DYNAMIC_LIB */
630
631/* __kmp_parallel_deo -- Wait until it's our turn. */
632void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633 int gtid = *gtid_ref;
634#ifdef BUILD_PARALLEL_ORDERED
635 kmp_team_t *team = __kmp_team_from_gtid(gtid);
636#endif /* BUILD_PARALLEL_ORDERED */
637
638 if (__kmp_env_consistency_check) {
639 if (__kmp_threads[gtid]->th.th_root->r.r_active)
640#if KMP_USE_DYNAMIC_LOCK
641 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
642#else
643 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
644#endif
645 }
646#ifdef BUILD_PARALLEL_ORDERED
647 if (!team->t.t_serialized) {
648 KMP_MB();
649 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
650 NULL);
651 KMP_MB();
652 }
653#endif /* BUILD_PARALLEL_ORDERED */
654}
655
656/* __kmp_parallel_dxo -- Signal the next task. */
657void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
658 int gtid = *gtid_ref;
659#ifdef BUILD_PARALLEL_ORDERED
660 int tid = __kmp_tid_from_gtid(gtid);
661 kmp_team_t *team = __kmp_team_from_gtid(gtid);
662#endif /* BUILD_PARALLEL_ORDERED */
663
664 if (__kmp_env_consistency_check) {
665 if (__kmp_threads[gtid]->th.th_root->r.r_active)
666 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
667 }
668#ifdef BUILD_PARALLEL_ORDERED
669 if (!team->t.t_serialized) {
670 KMP_MB(); /* Flush all pending memory write invalidates. */
671
672 /* use the tid of the next thread in this team */
673 /* TODO replace with general release procedure */
674 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
675
676 KMP_MB(); /* Flush all pending memory write invalidates. */
677 }
678#endif /* BUILD_PARALLEL_ORDERED */
679}
680
681/* ------------------------------------------------------------------------ */
682/* The BARRIER for a SINGLE process section is always explicit */
683
684int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
685 int status;
686 kmp_info_t *th;
687 kmp_team_t *team;
688
689 if (!TCR_4(__kmp_init_parallel))
690 __kmp_parallel_initialize();
691 __kmp_resume_if_soft_paused();
692
693 th = __kmp_threads[gtid];
694 team = th->th.th_team;
695 status = 0;
696
697 th->th.th_ident = id_ref;
698
699 if (team->t.t_serialized) {
700 status = 1;
701 } else {
702 kmp_int32 old_this = th->th.th_local.this_construct;
703
704 ++th->th.th_local.this_construct;
705 /* try to set team count to thread count--success means thread got the
706 single block */
707 /* TODO: Should this be acquire or release? */
708 if (team->t.t_construct == old_this) {
709 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
710 th->th.th_local.this_construct);
711 }
712#if USE_ITT_BUILD
713 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
714 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
715 team->t.t_active_level == 1) {
716 // Only report metadata by primary thread of active team at level 1
717 __kmp_itt_metadata_single(id_ref);
718 }
719#endif /* USE_ITT_BUILD */
720 }
721
722 if (__kmp_env_consistency_check) {
723 if (status && push_ws) {
724 __kmp_push_workshare(gtid, ct_psingle, id_ref);
725 } else {
726 __kmp_check_workshare(gtid, ct_psingle, id_ref);
727 }
728 }
729#if USE_ITT_BUILD
730 if (status) {
731 __kmp_itt_single_start(gtid);
732 }
733#endif /* USE_ITT_BUILD */
734 return status;
735}
736
737void __kmp_exit_single(int gtid) {
738#if USE_ITT_BUILD
739 __kmp_itt_single_end(gtid);
740#endif /* USE_ITT_BUILD */
741 if (__kmp_env_consistency_check)
742 __kmp_pop_workshare(gtid, ct_psingle, NULL);
743}
744
745/* determine if we can go parallel or must use a serialized parallel region and
746 * how many threads we can use
747 * set_nproc is the number of threads requested for the team
748 * returns 0 if we should serialize or only use one thread,
749 * otherwise the number of threads to use
750 * The forkjoin lock is held by the caller. */
751static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
752 int master_tid, int set_nthreads,
753 int enter_teams) {
754 int capacity;
755 int new_nthreads;
756 KMP_DEBUG_ASSERT(__kmp_init_serial);
757 KMP_DEBUG_ASSERT(root && parent_team);
758 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
759
760 // If dyn-var is set, dynamically adjust the number of desired threads,
761 // according to the method specified by dynamic_mode.
762 new_nthreads = set_nthreads;
763 if (!get__dynamic_2(parent_team, master_tid)) {
764 ;
765 }
766#ifdef USE_LOAD_BALANCE
767 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
768 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
769 if (new_nthreads == 1) {
770 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
771 "reservation to 1 thread\n",
772 master_tid));
773 return 1;
774 }
775 if (new_nthreads < set_nthreads) {
776 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
777 "reservation to %d threads\n",
778 master_tid, new_nthreads));
779 }
780 }
781#endif /* USE_LOAD_BALANCE */
782 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
783 new_nthreads = __kmp_avail_proc - __kmp_nth +
784 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
785 if (new_nthreads <= 1) {
786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
787 "reservation to 1 thread\n",
788 master_tid));
789 return 1;
790 }
791 if (new_nthreads < set_nthreads) {
792 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
793 "reservation to %d threads\n",
794 master_tid, new_nthreads));
795 } else {
796 new_nthreads = set_nthreads;
797 }
798 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
799 if (set_nthreads > 2) {
800 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
801 new_nthreads = (new_nthreads % set_nthreads) + 1;
802 if (new_nthreads == 1) {
803 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
804 "reservation to 1 thread\n",
805 master_tid));
806 return 1;
807 }
808 if (new_nthreads < set_nthreads) {
809 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
810 "reservation to %d threads\n",
811 master_tid, new_nthreads));
812 }
813 }
814 } else {
815 KMP_ASSERT(0);
816 }
817
818 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
819 if (__kmp_nth + new_nthreads -
820 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
821 __kmp_max_nth) {
822 int tl_nthreads = __kmp_max_nth - __kmp_nth +
823 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
824 if (tl_nthreads <= 0) {
825 tl_nthreads = 1;
826 }
827
828 // If dyn-var is false, emit a 1-time warning.
829 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
830 __kmp_reserve_warn = 1;
831 __kmp_msg(kmp_ms_warning,
832 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
833 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
834 }
835 if (tl_nthreads == 1) {
836 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
837 "reduced reservation to 1 thread\n",
838 master_tid));
839 return 1;
840 }
841 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
842 "reservation to %d threads\n",
843 master_tid, tl_nthreads));
844 new_nthreads = tl_nthreads;
845 }
846
847 // Respect OMP_THREAD_LIMIT
848 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
849 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
850 if (cg_nthreads + new_nthreads -
851 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
852 max_cg_threads) {
853 int tl_nthreads = max_cg_threads - cg_nthreads +
854 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
855 if (tl_nthreads <= 0) {
856 tl_nthreads = 1;
857 }
858
859 // If dyn-var is false, emit a 1-time warning.
860 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
861 __kmp_reserve_warn = 1;
862 __kmp_msg(kmp_ms_warning,
863 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
864 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
865 }
866 if (tl_nthreads == 1) {
867 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
868 "reduced reservation to 1 thread\n",
869 master_tid));
870 return 1;
871 }
872 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
873 "reservation to %d threads\n",
874 master_tid, tl_nthreads));
875 new_nthreads = tl_nthreads;
876 }
877
878 // Check if the threads array is large enough, or needs expanding.
879 // See comment in __kmp_register_root() about the adjustment if
880 // __kmp_threads[0] == NULL.
881 capacity = __kmp_threads_capacity;
882 if (TCR_PTR(__kmp_threads[0]) == NULL) {
883 --capacity;
884 }
885 // If it is not for initializing the hidden helper team, we need to take
886 // __kmp_hidden_helper_threads_num out of the capacity because it is included
887 // in __kmp_threads_capacity.
888 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
889 capacity -= __kmp_hidden_helper_threads_num;
890 }
891 if (__kmp_nth + new_nthreads -
892 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893 capacity) {
894 // Expand the threads array.
895 int slotsRequired = __kmp_nth + new_nthreads -
896 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
897 capacity;
898 int slotsAdded = __kmp_expand_threads(slotsRequired);
899 if (slotsAdded < slotsRequired) {
900 // The threads array was not expanded enough.
901 new_nthreads -= (slotsRequired - slotsAdded);
902 KMP_ASSERT(new_nthreads >= 1);
903
904 // If dyn-var is false, emit a 1-time warning.
905 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
906 __kmp_reserve_warn = 1;
907 if (__kmp_tp_cached) {
908 __kmp_msg(kmp_ms_warning,
909 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
910 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
911 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
912 } else {
913 __kmp_msg(kmp_ms_warning,
914 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
915 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
916 }
917 }
918 }
919 }
920
921#ifdef KMP_DEBUG
922 if (new_nthreads == 1) {
923 KC_TRACE(10,
924 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
925 "dead roots and rechecking; requested %d threads\n",
926 __kmp_get_gtid(), set_nthreads));
927 } else {
928 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
929 " %d threads\n",
930 __kmp_get_gtid(), new_nthreads, set_nthreads));
931 }
932#endif // KMP_DEBUG
933 return new_nthreads;
934}
935
// __kmp_fork_team_threads: bind the primary thread to `team` and, unless the
// team is a hot team that already has its threads, populate team->t.t_threads[].
//   root               - root of the calling thread; forwarded to
//                        __kmp_allocate_thread() and, when nested hot teams are
//                        compiled out, compared against via r_hot_team.
//   team               - team being set up; t_nproc gives the number of slots.
//   master_th          - primary thread; its cached team fields are rewritten
//                        here and it becomes t_threads[0] on the non-hot path.
//   master_gtid        - global id of master_th (debug-asserted to be the id of
//                        the calling thread).
//   fork_teams_workers - when non-zero, __kmp_partition_places() is skipped here.
936/* Allocate threads from the thread pool and assign them to the new team. We are
937 assured that there are enough threads available, because we checked on that
938 earlier within critical section forkjoin */
939static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
940 kmp_info_t *master_th, int master_gtid,
941 int fork_teams_workers) {
942 int i;
943 int use_hot_team;
944
945 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
946 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
947 KMP_MB();
948
949 /* first, let's setup the primary thread */
// The primary thread always gets tid 0 and dispatch slot 0 of the new team.
950 master_th->th.th_info.ds.ds_tid = 0;
951 master_th->th.th_team = team;
952 master_th->th.th_team_nproc = team->t.t_nproc;
953 master_th->th.th_team_master = master_th;
954 master_th->th.th_team_serialized = FALSE;
955 master_th->th.th_dispatch = &team->t.t_dispatch[0];
956
957/* make sure we are not the optimized hot team */
// use_hot_team ends up non-zero only when `team` is already recorded as the hot
// team for the computed level (nested hot teams) or is the root's hot team
// (otherwise). In that case the thread-install section below is skipped.
958#if KMP_NESTED_HOT_TEAMS
959 use_hot_team = 0;
960 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
961 if (hot_teams) { // hot teams array is not allocated if
962 // KMP_HOT_TEAMS_MAX_LEVEL=0
963 int level = team->t.t_active_level - 1; // index in array of hot teams
964 if (master_th->th.th_teams_microtask) { // are we inside the teams?
965 if (master_th->th.th_teams_size.nteams > 1) {
966 ++level; // level was not increased in teams construct for
967 // team_of_masters
968 }
969 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
970 master_th->th.th_teams_level == team->t.t_level) {
971 ++level; // level was not increased in teams construct for
972 // team_of_workers before the parallel
973 } // team->t.t_level will be increased inside parallel
974 }
975 if (level < __kmp_hot_teams_max_level) {
976 if (hot_teams[level].hot_team) {
977 // hot team has already been allocated for given level
978 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
979 use_hot_team = 1; // the team is ready to use
980 } else {
981 use_hot_team = 0; // AC: threads are not allocated yet
982 hot_teams[level].hot_team = team; // remember new hot team
983 hot_teams[level].hot_team_nth = team->t.t_nproc;
984 }
985 } else {
986 use_hot_team = 0;
987 }
988 }
989#else
990 use_hot_team = team == root->r.r_hot_team;
991#endif
// Not a ready-to-use hot team: install the primary thread and every worker.
992 if (!use_hot_team) {
993
994 /* install the primary thread */
995 team->t.t_threads[0] = master_th;
996 __kmp_initialize_info(master_th, team, 0, master_gtid);
997
998 /* now, install the worker threads */
999 for (i = 1; i < team->t.t_nproc; i++) {
1000
1001 /* fork or reallocate a new thread and install it in team */
1002 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1003 team->t.t_threads[i] = thr;
1004 KMP_DEBUG_ASSERT(thr);
1005 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1006 /* align team and thread arrived states */
1007 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1008 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1009 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1010 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1011 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1012 team->t.t_bar[bs_plain_barrier].b_arrived));
// Each worker inherits the primary thread's teams-construct bookkeeping.
1013 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1014 thr->th.th_teams_level = master_th->th.th_teams_level;
1015 thr->th.th_teams_size = master_th->th.th_teams_size;
1016 { // Initialize threads' barrier data.
1017 int b;
1018 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
// Copy the team's b_arrived value into the worker for every barrier type.
1019 for (b = 0; b < bs_last_barrier; ++b) {
1020 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1021 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1022#if USE_DEBUGGER
1023 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1024#endif
1025 }
1026 }
1027 }
1028
1029#if KMP_AFFINITY_SUPPORTED
1030 // Do not partition the places list for teams construct workers who
1031 // haven't actually been forked to do real work yet. This partitioning
1032 // will take place in the parallel region nested within the teams construct.
1033 if (!fork_teams_workers) {
1034 __kmp_partition_places(team);
1035 }
1036#endif
1037
// With the distributed fork/join gather pattern and more than one thread, the
// team's barrier object is resized and the threads are added to the team.
1038 if (team->t.t_nproc > 1 &&
1039 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1040 team->t.b->update_num_threads(team->t.t_nproc);
1041 __kmp_add_threads_to_team(team, team->t.t_nproc);
1042 }
1043 }
1044
// Mark the team for affinity display as soon as one member's previous team
// size or nesting level differs from this team's; runs for hot teams as well.
1045 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1046 for (i = 0; i < team->t.t_nproc; i++) {
1047 kmp_info_t *thr = team->t.t_threads[i];
1048 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1049 thr->th.th_prev_level != team->t.t_level) {
1050 team->t.t_display_affinity = 1;
1051 break;
1052 }
1053 }
1054 }
1055
1056 KMP_MB();
1057}
1058
1059#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1060// Propagate any changes to the floating point control registers out to the team
1061// We try to avoid unnecessary writes to the relevant cache line in the team
1062// structure, so we don't make changes unless they are needed.
1063inline static void propagateFPControl(kmp_team_t *team) {
1064 if (__kmp_inherit_fp_control) {
1065 kmp_int16 x87_fpu_control_word;
1066 kmp_uint32 mxcsr;
1067
1068 // Get primary thread's values of FPU control flags (both X87 and vector)
1069 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070 __kmp_store_mxcsr(&mxcsr);
1071 mxcsr &= KMP_X86_MXCSR_MASK;
1072
1073 // There is no point looking at t_fp_control_saved here.
1074 // If it is TRUE, we still have to update the values if they are different
1075 // from those we now have. If it is FALSE we didn't save anything yet, but
1076 // our objective is the same. We have to ensure that the values in the team
1077 // are the same as those we have.
1078 // So, this code achieves what we need whether or not t_fp_control_saved is
1079 // true. By checking whether the value needs updating we avoid unnecessary
1080 // writes that would put the cache-line into a written state, causing all
1081 // threads in the team to have to read it again.
1082 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1083 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1084 // Although we don't use this value, other code in the runtime wants to know
1085 // whether it should restore them. So we must ensure it is correct.
1086 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1087 } else {
1088 // Similarly here. Don't write to this cache-line in the team structure
1089 // unless we have to.
1090 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1091 }
1092}
1093
1094// Do the opposite, setting the hardware registers to the updated values from
1095// the team.
1096inline static void updateHWFPControl(kmp_team_t *team) {
1097 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1098 // Only reset the fp control regs if they have been changed in the team.
1099 // the parallel region that we are exiting.
1100 kmp_int16 x87_fpu_control_word;
1101 kmp_uint32 mxcsr;
1102 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1103 __kmp_store_mxcsr(&mxcsr);
1104 mxcsr &= KMP_X86_MXCSR_MASK;
1105
1106 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1107 __kmp_clear_x87_fpu_status_word();
1108 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1109 }
1110
1111 if (team->t.t_mxcsr != mxcsr) {
1112 __kmp_load_mxcsr(&team->t.t_mxcsr);
1113 }
1114 }
1115}
1116#else
1117#define propagateFPControl(x) ((void)0)
1118#define updateHWFPControl(x) ((void)0)
1119#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1120
1121static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1122 int realloc); // forward declaration
1123
// __kmp_serialized_parallel: enter a parallel region that executes on the
// calling thread only.
//   loc        - source location descriptor; may be NULL. If its flags carry
//                KMP_IDENT_AUTOPAR the function returns immediately.
//   global_tid - global thread id of the caller; used to index __kmp_threads[].
// Outline of the body:
//   1. Initialize the parallel runtime if needed and resume from a soft pause.
//   2. Resolve proc_bind for this region and clear the thread's one-shot
//      th_set_proc_bind / th_set_nproc requests.
//   3. If the thread is not yet running on its serial team, install that team
//      (allocating a new one under __kmp_forkjoin_lock when the cached one is
//      already marked serialized); otherwise bump t_serialized / t_level and
//      push another dispatch buffer.
//   4. Clear the team's cancel request, optionally display affinity, push a
//      consistency-check record, and raise the OMPT begin callbacks.
// Note: the trace strings below use the name "__kmpc_serialized_parallel".
1124/* Run a parallel region that has been serialized, so runs only in a team of the
1125 single primary thread. */
1126void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1127 kmp_info_t *this_thr;
1128 kmp_team_t *serial_team;
1129
1130 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1131
1132 /* Skip all this code for autopar serialized loops since it results in
1133 unacceptable overhead */
1134 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1135 return;
1136
1137 if (!TCR_4(__kmp_init_parallel))
1138 __kmp_parallel_initialize();
1139 __kmp_resume_if_soft_paused();
1140
1141 this_thr = __kmp_threads[global_tid];
1142 serial_team = this_thr->th.th_serial_team;
1143
1144 /* utilize the serialized team held by this thread */
1145 KMP_DEBUG_ASSERT(serial_team);
1146 KMP_MB();
1147
// Unless tasks execute immediately, the thread's task-team pointer is cleared
// for the serialized region (the trace below calls this a "push").
1148 if (__kmp_tasking_mode != tskm_immediate_exec) {
1149 KMP_DEBUG_ASSERT(
1150 this_thr->th.th_task_team ==
1151 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1152 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1153 NULL);
1154 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1155 "team %p, new task_team = NULL\n",
1156 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1157 this_thr->th.th_task_team = NULL;
1158 }
1159
// Resolve the binding policy: a proc-bind-var of false wins over everything;
// otherwise the requested policy is used, falling back to proc-bind-var.
1160 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1161 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1162 proc_bind = proc_bind_false;
1163 } else if (proc_bind == proc_bind_default) {
1164 // No proc_bind clause was specified, so use the current value
1165 // of proc-bind-var for this parallel region.
1166 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1167 }
1168 // Reset for next parallel region
1169 this_thr->th.th_set_proc_bind = proc_bind_default;
1170
1171 // Reset num_threads for next parallel region
1172 this_thr->th.th_set_nproc = 0;
1173
1174#if OMPT_SUPPORT
1175 ompt_data_t ompt_parallel_data = ompt_data_none;
1176 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1177 if (ompt_enabled.enabled &&
1178 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1179
1180 ompt_task_info_t *parent_task_info;
1181 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1182
1183 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1184 if (ompt_enabled.ompt_callback_parallel_begin) {
1185 int team_size = 1;
1186
1187 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1188 &(parent_task_info->task_data), &(parent_task_info->frame),
1189 &ompt_parallel_data, team_size,
1190 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1191 }
1192 }
1193#endif // OMPT_SUPPORT
1194
// Case 1: the thread is currently on some other team, so the serial team has
// to be installed as its current team.
1195 if (this_thr->th.th_team != serial_team) {
1196 // Nested level will be an index in the nested nthreads array
1197 int level = this_thr->th.th_team->t.t_level;
1198
1199 if (serial_team->t.t_serialized) {
1200 /* this serial team was already used
1201 TODO increase performance by making these locks more specific */
1202 kmp_team_t *new_team;
1203
1204 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1205
1206 new_team =
1207 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1208#if OMPT_SUPPORT
1209 ompt_parallel_data,
1210#endif
1211 proc_bind, &this_thr->th.th_current_task->td_icvs,
1212 0 USE_NESTED_HOT_ARG(NULL));
1213 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1214 KMP_ASSERT(new_team);
1215
1216 /* setup new serialized team and install it */
1217 new_team->t.t_threads[0] = this_thr;
1218 new_team->t.t_parent = this_thr->th.th_team;
1219 serial_team = new_team;
1220 this_thr->th.th_serial_team = serial_team;
1221
1222 KF_TRACE(
1223 10,
1224 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1225 global_tid, serial_team));
1226
1227 /* TODO the above breaks the requirement that if we run out of resources,
1228 then we can still guarantee that serialized teams are ok, since we may
1229 need to allocate a new one */
1230 } else {
1231 KF_TRACE(
1232 10,
1233 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1234 global_tid, serial_team));
1235 }
1236
1237 /* we have to initialize this serial team */
1238 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1239 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1240 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1241 serial_team->t.t_ident = loc;
1242 serial_team->t.t_serialized = 1;
1243 serial_team->t.t_nproc = 1;
1244 serial_team->t.t_parent = this_thr->th.th_team;
1245 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1246 this_thr->th.th_team = serial_team;
// ds_tid still holds the tid in the previous team here; it is reset to 0 below.
1247 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1248
1249 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1250 this_thr->th.th_current_task));
// The enclosing task is marked as not executing before the serial team's
// task is pushed as the thread's current task.
1251 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1252 this_thr->th.th_current_task->td_flags.executing = 0;
1253
1254 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1255
1256 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1257 implicit task for each serialized task represented by
1258 team->t.t_serialized? */
1259 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1260 &this_thr->th.th_current_task->td_parent->td_icvs);
1261
1262 // Thread value exists in the nested nthreads array for the next nested
1263 // level
1264 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265 this_thr->th.th_current_task->td_icvs.nproc =
1266 __kmp_nested_nth.nth[level + 1];
1267 }
1268
// Same idea for the nested proc_bind list: take the entry for the next level.
1269 if (__kmp_nested_proc_bind.used &&
1270 (level + 1 < __kmp_nested_proc_bind.used)) {
1271 this_thr->th.th_current_task->td_icvs.proc_bind =
1272 __kmp_nested_proc_bind.bind_types[level + 1];
1273 }
1274
1275#if USE_DEBUGGER
1276 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1277#endif
1278 this_thr->th.th_info.ds.ds_tid = 0;
1279
1280 /* set thread cache values */
1281 this_thr->th.th_team_nproc = 1;
1282 this_thr->th.th_team_master = this_thr;
1283 this_thr->th.th_team_serialized = 1;
1284
// The serial team is one level deeper than its parent but keeps the parent's
// active level.
1285 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1286 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1287 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1288
1289 propagateFPControl(serial_team);
1290
1291 /* check if we need to allocate dispatch buffers stack */
1292 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1293 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1294 serial_team->t.t_dispatch->th_disp_buffer =
1295 (dispatch_private_info_t *)__kmp_allocate(
1296 sizeof(dispatch_private_info_t));
1297 }
1298 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1299
1300 KMP_MB();
1301
1302 } else {
1303 /* this serialized team is already being used,
1304 * that's fine, just add another nested level */
1305 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1306 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1307 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1308 ++serial_team->t.t_serialized;
1309 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1310
1311 // Nested level will be an index in the nested nthreads array
1312 int level = this_thr->th.th_team->t.t_level;
1313 // Thread value exists in the nested nthreads array for the next nested
1314 // level
1315 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1316 this_thr->th.th_current_task->td_icvs.nproc =
1317 __kmp_nested_nth.nth[level + 1];
1318 }
1319 serial_team->t.t_level++;
1320 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1321 "of serial team %p to %d\n",
1322 global_tid, serial_team, serial_team->t.t_level));
1323
1324 /* allocate/push dispatch buffers stack */
1325 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1326 {
// A new buffer is linked in front of the existing th_disp_buffer list.
1327 dispatch_private_info_t *disp_buffer =
1328 (dispatch_private_info_t *)__kmp_allocate(
1329 sizeof(dispatch_private_info_t));
1330 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1331 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1332 }
1333 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334
1335 KMP_MB();
1336 }
// Common tail for both cases.
1337 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1338
1339 // Perform the display affinity functionality for
1340 // serialized parallel regions
1341 if (__kmp_display_affinity) {
1342 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1343 this_thr->th.th_prev_num_threads != 1) {
1344 // NULL means use the affinity-format-var ICV
1345 __kmp_aux_display_affinity(global_tid, NULL);
1346 this_thr->th.th_prev_level = serial_team->t.t_level;
1347 this_thr->th.th_prev_num_threads = 1;
1348 }
1349 }
1350
1351 if (__kmp_env_consistency_check)
1352 __kmp_push_parallel(global_tid, NULL);
1353#if OMPT_SUPPORT
1354 serial_team->t.ompt_team_info.master_return_address = codeptr;
1355 if (ompt_enabled.enabled &&
1356 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1357 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1358 OMPT_GET_FRAME_ADDRESS(0);
1359
1360 ompt_lw_taskteam_t lw_taskteam;
1361 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1362 &ompt_parallel_data, codeptr);
1363
1364 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1365 // don't use lw_taskteam after linking. content was swapped
1366
1367 /* OMPT implicit task begin */
1368 if (ompt_enabled.ompt_callback_implicit_task) {
1369 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1370 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1371 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1372 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1373 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1374 __kmp_tid_from_gtid(global_tid);
1375 }
1376
1377 /* OMPT state */
1378 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
// The exit frame is written again here, after linking, because
// OMPT_CUR_TASK_INFO is re-evaluated at this point.
1379 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1380 OMPT_GET_FRAME_ADDRESS(0);
1381 }
1382#endif
1383}
1384
1385// Test if this fork is for a team closely nested in a teams construct
1386static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1387 microtask_t microtask, int level,
1388 int teams_level, kmp_va_list ap) {
1389 return (master_th->th.th_teams_microtask && ap &&
1390 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391}
1392
1393// Test if this fork is for the teams construct, i.e. to form the outer league
1394// of teams
1395static inline bool __kmp_is_entering_teams(int active_level, int level,
1396 int teams_level, kmp_va_list ap) {
1397 return ((ap == NULL && active_level == 0) ||
1398 (ap && teams_level > 0 && teams_level == level));
1399}
1400
// __kmp_fork_in_teams: start a parallel region that runs on parent_team itself
// (no new team is allocated in this function).
// Parameters, as used in the body:
//   loc                   - source location; stored in parent_team->t.t_ident.
//   gtid                  - global thread id of the primary thread.
//   parent_team           - team that executes the region.
//   argc, ap              - number of microtask arguments and the variadic list
//                           they are copied from into parent_team->t.t_argv.
//   master_th, root       - primary thread descriptor and its root.
//   call_context          - with fork_context_gnu the function returns after
//                           setup without invoking the microtask itself.
//   microtask, invoker    - outlined region body and the routine that launches
//                           it for the primary thread.
//   master_set_numthreads - requested team size; 0 means no request. Only a
//                           value <= th_teams_size.nth changes the team size.
//   level                 - nesting level; indexes __kmp_nested_proc_bind.
//   ompt_parallel_data, return_address (OMPT builds only) - passed on to the
//                           OMPT lightweight task team and parallel_end callback.
// Returns TRUE on every path.
1401// AC: This is start of parallel that is nested inside teams construct.
1402// The team is actual (hot), all workers are ready at the fork barrier.
1403// No lock needed to initialize the team a bit, then free workers.
1404static inline int
1405__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1406 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1407 enum fork_context_e call_context, microtask_t microtask,
1408 launch_t invoker, int master_set_numthreads, int level,
1409#if OMPT_SUPPORT
1410 ompt_data_t ompt_parallel_data, void *return_address,
1411#endif
1412 kmp_va_list ap) {
1413 void **argv;
1414 int i;
1415
1416 parent_team->t.t_ident = loc;
1417 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1418 parent_team->t.t_argc = argc;
1419 argv = (void **)parent_team->t.t_argv;
// Copies argc pointers from the va_list; the index counts down, but t_argv is
// filled front to back through argv++.
1420 for (i = argc - 1; i >= 0; --i) {
1421 *argv++ = va_arg(kmp_va_deref(ap), void *);
1422 }
1423 // Increment our nested depth levels, but not increase the serialization
// Serialized path: parent_team is the thread's serial team, so the region is
// run inline on this thread and the function returns from inside this branch.
1424 if (parent_team == master_th->th.th_serial_team) {
1425 // AC: we are in serialized parallel
1426 __kmpc_serialized_parallel(loc, gtid);
1427 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1428
1429 if (call_context == fork_context_gnu) {
1430 // AC: need to decrement t_serialized for enquiry functions to work
1431 // correctly, will restore at join time
1432 parent_team->t.t_serialized--;
1433 return TRUE;
1434 }
1435
1436#if OMPD_SUPPORT
1437 parent_team->t.t_pkfn = microtask;
1438#endif
1439
1440#if OMPT_SUPPORT
1441 void *dummy;
1442 void **exit_frame_p;
1443 ompt_data_t *implicit_task_data;
1444 ompt_lw_taskteam_t lw_taskteam;
1445
1446 if (ompt_enabled.enabled) {
1447 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1448 &ompt_parallel_data, return_address);
1449 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1450
1451 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1452 // Don't use lw_taskteam after linking. Content was swapped.
1453
1454 /* OMPT implicit task begin */
1455 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1456 if (ompt_enabled.ompt_callback_implicit_task) {
1457 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1458 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1459 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1460 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1461 }
1462
1463 /* OMPT state */
1464 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1465 } else {
// OMPT disabled: give the microtask invocation a harmless place to write to.
1466 exit_frame_p = &dummy;
1467 }
1468#endif
1469
1470 // AC: need to decrement t_serialized for enquiry functions to work
1471 // correctly, will restore at join time
1472 parent_team->t.t_serialized--;
1473
1474 {
1475 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1476 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1477 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1478#if OMPT_SUPPORT
1479 ,
1480 exit_frame_p
1481#endif
1482 );
1483 }
1484
1485#if OMPT_SUPPORT
1486 if (ompt_enabled.enabled) {
1487 *exit_frame_p = NULL;
1488 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1489 if (ompt_enabled.ompt_callback_implicit_task) {
1490 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1491 ompt_scope_end, NULL, implicit_task_data, 1,
1492 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1493 }
// The team data is copied out before the lightweight task team is unlinked.
1494 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1495 __ompt_lw_taskteam_unlink(master_th);
1496 if (ompt_enabled.ompt_callback_parallel_end) {
1497 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1498 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1499 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1500 }
1501 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1502 }
1503#endif
1504 return TRUE;
1505 }
1506
// Non-serialized path: parent_team is set up for the region and then released
// through __kmp_internal_fork() further down.
1507 parent_team->t.t_pkfn = microtask;
1508 parent_team->t.t_invoke = invoker;
1509 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1510 parent_team->t.t_active_level++;
1511 parent_team->t.t_level++;
1512 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1513
1514 // If the threads allocated to the team are less than the thread limit, update
1515 // the thread limit here. th_teams_size.nth is specific to this team nested
1516 // in a teams construct, the team is fully created, and we're about to do
1517 // the actual fork. Best to do this here so that the subsequent uses below
1518 // and in the join have the correct value.
1519 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1520
1521#if OMPT_SUPPORT
1522 if (ompt_enabled.enabled) {
1523 ompt_lw_taskteam_t lw_taskteam;
1524 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1525 return_address);
1526 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1527 }
1528#endif
1529
1530 /* Change number of threads in the team if requested */
1531 if (master_set_numthreads) { // The parallel has num_threads clause
1532 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1533 // AC: only can reduce number of threads dynamically, can't increase
1534 kmp_info_t **other_threads = parent_team->t.t_threads;
1535 // NOTE: if using distributed barrier, we need to run this code block
1536 // even when the team size appears not to have changed from the max.
1537 int old_proc = master_th->th.th_teams_size.nth;
1538 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1539 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1540 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1541 }
1542 parent_team->t.t_nproc = master_set_numthreads;
// Only the threads that stay in the team get their cached team size updated.
1543 for (i = 0; i < master_set_numthreads; ++i) {
1544 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1545 }
1546 }
1547 // Keep extra threads hot in the team for possible next parallels
1548 master_th->th.th_set_nproc = 0;
1549 }
1550
// NOTE(review): master_set_numthreads is not read again after this block in
// this function, so the debugger override below appears to have no effect
// here -- confirm whether that is intended.
1551#if USE_DEBUGGER
1552 if (__kmp_debugging) { // Let debugger override number of threads.
1553 int nth = __kmp_omp_num_threads(loc);
1554 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1555 master_set_numthreads = nth;
1556 }
1557 }
1558#endif
1559
1560 // Figure out the proc_bind policy for the nested parallel within teams
1561 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1562 // proc_bind_default means don't update
1563 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1564 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1565 proc_bind = proc_bind_false;
1566 } else {
1567 // No proc_bind clause specified; use current proc-bind-var
1568 if (proc_bind == proc_bind_default) {
1569 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1570 }
1571 /* else: The proc_bind policy was specified explicitly on parallel clause.
1572 This overrides proc-bind-var for this parallel region, but does not
1573 change proc-bind-var. */
1574 // Figure the value of proc-bind-var for the child threads.
1575 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1576 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1577 master_th->th.th_current_task->td_icvs.proc_bind)) {
1578 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1579 }
1580 }
1581 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1582 // Need to change the bind-var ICV to correct value for each implicit task
1583 if (proc_bind_icv != proc_bind_default &&
1584 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1585 kmp_info_t **other_threads = parent_team->t.t_threads;
// The loop bound is the primary thread's cached team size (th_team_nproc).
1586 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1587 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1588 }
1589 }
1590 // Reset for next parallel region
1591 master_th->th.th_set_proc_bind = proc_bind_default;
1592
1593#if USE_ITT_BUILD && USE_ITT_NOTIFY
1594 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1595 KMP_ITT_DEBUG) &&
1596 __kmp_forkjoin_frames_mode == 3 &&
1597 parent_team->t.t_active_level == 1 // only report frames at level 1
1598 && master_th->th.th_teams_size.nteams == 1) {
1599 kmp_uint64 tmp_time = __itt_get_timestamp();
1600 master_th->th.th_frame_time = tmp_time;
1601 parent_team->t.t_region_time = tmp_time;
1602 }
1603 if (__itt_stack_caller_create_ptr) {
1604 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1605 // create new stack stitching id before entering fork barrier
1606 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1607 }
1608#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1609#if KMP_AFFINITY_SUPPORTED
1610 __kmp_partition_places(parent_team);
1611#endif
1612
1613 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1614 "master_th=%p, gtid=%d\n",
1615 root, parent_team, master_th, gtid));
1616 __kmp_internal_fork(loc, gtid, parent_team);
1617 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1618 "master_th=%p, gtid=%d\n",
1619 root, parent_team, master_th, gtid));
1620
// In the GNU call context the primary thread's microtask is not invoked here.
1621 if (call_context == fork_context_gnu)
1622 return TRUE;
1623
1624 /* Invoke microtask for PRIMARY thread */
1625 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1626 parent_team->t.t_id, parent_team->t.t_pkfn));
1627
1628 if (!parent_team->t.t_invoke(gtid)) {
1629 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1630 }
1631 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1632 parent_team->t.t_id, parent_team->t.t_pkfn));
1633 KMP_MB(); /* Flush all pending memory write invalidates. */
1634
1635 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1636
1637 return TRUE;
1638}
1639
1640// Create a serialized parallel region
1641static inline int
1642__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1643 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1644 kmp_info_t *master_th, kmp_team_t *parent_team,
1645#if OMPT_SUPPORT
1646 ompt_data_t *ompt_parallel_data, void **return_address,
1647 ompt_data_t **parent_task_data,
1648#endif
1649 kmp_va_list ap) {
1650 kmp_team_t *team;
1651 int i;
1652 void **argv;
1653
1654/* josh todo: hypothetical question: what do we do for OS X*? */
1655#if KMP_OS_LINUX && \
1656 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1657 SimpleVLA<void *> args(argc);
1658#else
1659 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1660#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1661 KMP_ARCH_AARCH64) */
1662
1663 KA_TRACE(
1664 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1665
1666 __kmpc_serialized_parallel(loc, gtid);
1667
1668#if OMPD_SUPPORT
1669 master_th->th.th_serial_team->t.t_pkfn = microtask;
1670#endif
1671
1672 if (call_context == fork_context_intel) {
1673 /* TODO this sucks, use the compiler itself to pass args! :) */
1674 master_th->th.th_serial_team->t.t_ident = loc;
1675 if (!ap) {
1676 // revert change made in __kmpc_serialized_parallel()
1677 master_th->th.th_serial_team->t.t_level--;
1678// Get args from parent team for teams construct
1679
1680#if OMPT_SUPPORT
1681 void *dummy;
1682 void **exit_frame_p;
1683 ompt_task_info_t *task_info;
1684 ompt_lw_taskteam_t lw_taskteam;
1685
1686 if (ompt_enabled.enabled) {
1687 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1688 ompt_parallel_data, *return_address);
1689
1690 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1691 // don't use lw_taskteam after linking. content was swaped
1692 task_info = OMPT_CUR_TASK_INFO(master_th);
1693 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1694 if (ompt_enabled.ompt_callback_implicit_task) {
1695 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1696 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1697 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1698 &(task_info->task_data), 1,
1699 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700 }
1701
1702 /* OMPT state */
1703 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1704 } else {
1705 exit_frame_p = &dummy;
1706 }
1707#endif
1708
1709 {
1710 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1711 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1712 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1713#if OMPT_SUPPORT
1714 ,
1715 exit_frame_p
1716#endif
1717 );
1718 }
1719
1720#if OMPT_SUPPORT
1721 if (ompt_enabled.enabled) {
1722 *exit_frame_p = NULL;
1723 if (ompt_enabled.ompt_callback_implicit_task) {
1724 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725 ompt_scope_end, NULL, &(task_info->task_data), 1,
1726 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1727 }
1728 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1729 __ompt_lw_taskteam_unlink(master_th);
1730 if (ompt_enabled.ompt_callback_parallel_end) {
1731 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1732 ompt_parallel_data, *parent_task_data,
1733 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1734 }
1735 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1736 }
1737#endif
1738 } else if (microtask == (microtask_t)__kmp_teams_master) {
1739 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1740 team = master_th->th.th_team;
1741 // team->t.t_pkfn = microtask;
1742 team->t.t_invoke = invoker;
1743 __kmp_alloc_argv_entries(argc, team, TRUE);
1744 team->t.t_argc = argc;
1745 argv = (void **)team->t.t_argv;
1746 if (ap) {
1747 for (i = argc - 1; i >= 0; --i)
1748 *argv++ = va_arg(kmp_va_deref(ap), void *);
1749 } else {
1750 for (i = 0; i < argc; ++i)
1751 // Get args from parent team for teams construct
1752 argv[i] = parent_team->t.t_argv[i];
1753 }
1754 // AC: revert change made in __kmpc_serialized_parallel()
1755 // because initial code in teams should have level=0
1756 team->t.t_level--;
1757 // AC: call special invoker for outer "parallel" of teams construct
1758 invoker(gtid);
1759#if OMPT_SUPPORT
1760 if (ompt_enabled.enabled) {
1761 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1762 if (ompt_enabled.ompt_callback_implicit_task) {
1763 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1764 ompt_scope_end, NULL, &(task_info->task_data), 0,
1765 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1766 }
1767 if (ompt_enabled.ompt_callback_parallel_end) {
1768 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1769 ompt_parallel_data, *parent_task_data,
1770 OMPT_INVOKER(call_context) | ompt_parallel_league,
1771 *return_address);
1772 }
1773 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774 }
1775#endif
1776 } else {
1777 argv = args;
1778 for (i = argc - 1; i >= 0; --i)
1779 *argv++ = va_arg(kmp_va_deref(ap), void *);
1780 KMP_MB();
1781
1782#if OMPT_SUPPORT
1783 void *dummy;
1784 void **exit_frame_p;
1785 ompt_task_info_t *task_info;
1786 ompt_lw_taskteam_t lw_taskteam;
1787 ompt_data_t *implicit_task_data;
1788
1789 if (ompt_enabled.enabled) {
1790 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1791 ompt_parallel_data, *return_address);
1792 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1793 // don't use lw_taskteam after linking. content was swapped
1794 task_info = OMPT_CUR_TASK_INFO(master_th);
1795 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1796
1797 /* OMPT implicit task begin */
1798 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1799 if (ompt_enabled.ompt_callback_implicit_task) {
1800 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1803 ompt_task_implicit);
1804 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1805 }
1806
1807 /* OMPT state */
1808 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809 } else {
1810 exit_frame_p = &dummy;
1811 }
1812#endif
1813
1814 {
1815 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1818#if OMPT_SUPPORT
1819 ,
1820 exit_frame_p
1821#endif
1822 );
1823 }
1824
1825#if OMPT_SUPPORT
1826 if (ompt_enabled.enabled) {
1827 *exit_frame_p = NULL;
1828 if (ompt_enabled.ompt_callback_implicit_task) {
1829 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1830 ompt_scope_end, NULL, &(task_info->task_data), 1,
1831 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1832 }
1833
1834 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1835 __ompt_lw_taskteam_unlink(master_th);
1836 if (ompt_enabled.ompt_callback_parallel_end) {
1837 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838 ompt_parallel_data, *parent_task_data,
1839 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1840 }
1841 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842 }
1843#endif
1844 }
1845 } else if (call_context == fork_context_gnu) {
1846#if OMPT_SUPPORT
1847 if (ompt_enabled.enabled) {
1848 ompt_lw_taskteam_t lwt;
1849 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1850 *return_address);
1851
1852 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1853 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1854 }
1855// don't use lw_taskteam after linking. content was swapped
1856#endif
1857
1858 // we were called from GNU native code
1859 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1860 return FALSE;
1861 } else {
1862 KMP_ASSERT2(call_context < fork_context_last,
1863 "__kmp_serial_fork_call: unknown fork_context parameter");
1864 }
1865
1866 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1867 KMP_MB();
1868 return FALSE;
1869}
1870
1871/* most of the work for a fork */
1872/* return true if we really went parallel, false if serialized */
1873int __kmp_fork_call(ident_t *loc, int gtid,
1874 enum fork_context_e call_context, // Intel, GNU, ...
1875 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1876 kmp_va_list ap) {
1877 void **argv;
1878 int i;
1879 int master_tid;
1880 int master_this_cons;
1881 kmp_team_t *team;
1882 kmp_team_t *parent_team;
1883 kmp_info_t *master_th;
1884 kmp_root_t *root;
1885 int nthreads;
1886 int master_active;
1887 int master_set_numthreads;
1888 int task_thread_limit = 0;
1889 int level;
1890 int active_level;
1891 int teams_level;
1892#if KMP_NESTED_HOT_TEAMS
1893 kmp_hot_team_ptr_t **p_hot_teams;
1894#endif
1895 { // KMP_TIME_BLOCK
1896 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1897 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1898
1899 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1900 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1901 /* Some systems prefer the stack for the root thread(s) to start with */
1902 /* some gap from the parent stack to prevent false sharing. */
1903 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1904 /* These 2 lines below are so this does not get optimized out */
1905 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1906 __kmp_stkpadding += (short)((kmp_int64)dummy);
1907 }
1908
1909 /* initialize if needed */
1910 KMP_DEBUG_ASSERT(
1911 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1912 if (!TCR_4(__kmp_init_parallel))
1913 __kmp_parallel_initialize();
1914 __kmp_resume_if_soft_paused();
1915
1916 /* setup current data */
1917 // AC: potentially unsafe, not in sync with library shutdown,
1918 // __kmp_threads can be freed
1919 master_th = __kmp_threads[gtid];
1920
1921 parent_team = master_th->th.th_team;
1922 master_tid = master_th->th.th_info.ds.ds_tid;
1923 master_this_cons = master_th->th.th_local.this_construct;
1924 root = master_th->th.th_root;
1925 master_active = root->r.r_active;
1926 master_set_numthreads = master_th->th.th_set_nproc;
1927 task_thread_limit =
1928 master_th->th.th_current_task->td_icvs.task_thread_limit;
1929
1930#if OMPT_SUPPORT
1931 ompt_data_t ompt_parallel_data = ompt_data_none;
1932 ompt_data_t *parent_task_data;
1933 ompt_frame_t *ompt_frame;
1934 void *return_address = NULL;
1935
1936 if (ompt_enabled.enabled) {
1937 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1938 NULL, NULL);
1939 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1940 }
1941#endif
1942
1943