/*
 * No-op profiling stubs: every macro in this group is defined with an empty
 * replacement list, so any call site that uses one of them expands to nothing.
 * NOTE(review): the preprocessor condition that selects this branch is not
 * visible in this excerpt -- presumably the "profiling disabled" case; confirm.
 */
56#define PROF_SET_THREAD(thread) 
   57#define PROF_TEVENT(thread,event) 
   58#define PROF_EVENT(event) 
   59#define PROF_TBEGIN(thread,event) 
   60#define PROF_BEGIN(event) 
   61#define PROF_TEND(thread,event) 
   62#define PROF_END(event) 
   64#define PROF_FORK_WRITE 
   67#define PROF_CUDA_TSTART(thread,stream) 
   68#define PROF_CUDA_START(stream) 
   69#define PROF_CUDA_TEVENT(thread,event,stream) 
   70#define PROF_CUDA_EVENT(event,stream) 
   71#define PROF_CUDA_TBEGIN(thread,event,stream) 
   72#define PROF_CUDA_BEGIN(event,stream) 
   73#define PROF_CUDA_TEND(thread,event,stream) 
   74#define PROF_CUDA_END(event,stream) 
   75#define PROF_CUDA_TFINISH(thread,stream) 
   76#define PROF_CUDA_FINISH(stream) 
   /*
    * NVTX variant, selected when PROF_USE_NVTX is defined.
    * Only two macros do any work in this branch:
    *   PROF_BEGIN(event) calls nvtxRangePushA() with profile_description[event].
    *   PROF_END(event)   calls nvtxRangePop(); its event argument is unused.
    * Both expand to a brace-enclosed block. Every other macro in the group,
    * including all PROF_CUDA_* macros, has an empty replacement list.
    * NOTE(review): profile_description is not declared in this excerpt; it is
    * presumably a table of C strings supplied by the including code -- confirm.
    */
   79#elif defined PROF_USE_NVTX 
   80    #include "nvToolsExt.h" 
   85    #define PROF_FORK_INIT 
   86    #define PROF_SET_THREAD(thread) 
   87    #define PROF_TEVENT(thread,event) 
   88    #define PROF_EVENT(event) 
   89    #define PROF_TBEGIN(thread,event) 
   90    #define PROF_BEGIN(event) {nvtxRangePushA(profile_description[event]);} 
   91    #define PROF_TEND(thread,event) 
   92    #define PROF_END(event) {nvtxRangePop();} 
   94    #define PROF_FORK_WRITE 
   96    #define PROF_CUDA_TSTART(thread,stream) 
   97    #define PROF_CUDA_START(stream) 
   98    #define PROF_CUDA_TEVENT(thread,event,stream) 
   99    #define PROF_CUDA_EVENT(event,stream) 
  100    #define PROF_CUDA_TBEGIN(thread,event,stream) 
  101    #define PROF_CUDA_BEGIN(event,stream) 
  102    #define PROF_CUDA_TEND(thread,event,stream) 
  103    #define PROF_CUDA_END(event,stream) 
  104    #define PROF_CUDA_TFINISH(thread,stream) 
  105    #define PROF_CUDA_FINISH(stream) 
  111#ifndef PROF_MAX_THREADS 
  112#error Must specify PROF_MAX_THREADS. 
  115#ifndef PROF_MAX_EVENTS 
  116#error Must specify PROF_MAX_EVENTS. 
  120#error Must specify PROF_OUT_FILE. 
  /*
   * Two-level stringification helpers.
   * PROF_STRINGIFY(a) applies the # operator directly to its argument.
   * PROF_MAKE_STR(a) goes through one extra macro level so that the argument
   * is macro-expanded first; PROF_MAKE_STR(PROF_OUT_FILE) therefore produces
   * the configured value of PROF_OUT_FILE as a string literal rather than the
   * text "PROF_OUT_FILE".
   */
  123#define PROF_STRINGIFY(a) #a 
  124#define PROF_MAKE_STR(a) PROF_STRINGIFY(a) 
  127#if !defined(MACOSX) && !defined(LINUX) 
  128#error Unknown profile architecture. 
  135#include <mach/mach_time.h> 
  136#include <machine/endian.h> 
  140#include <arpa/inet.h> 
  147#define _prof_time_to_net(x)        __DARWIN_OSSwapInt64(x) 
  148typedef uint64_t                    _prof_time_t;
 
  149extern double                       _prof_time_mult;
 
  150extern pthread_key_t                _prof_thread_key;
 
  152#if __BYTE_ORDER == __LITTLE_ENDIAN 
  153#define _prof_time_to_net(x)        bswap_64(x) 
  154#elif __BYTE_ORDER == __BIG_ENDIAN 
  155#define _prof_time_to_net(x)        x 
  157typedef unsigned long long          _prof_time_t;
 
  158extern __thread 
unsigned int        _prof_thread_id;
 
  159extern __thread 
struct timespec    _prof_timespec;
 
  161#define _prof_event_to_net(x)       htonl(x) 
  162typedef unsigned int                _prof_event_t;
 
  163extern _prof_time_t                 _prof_start_time;
 
  164extern unsigned int                 _prof_next_event[PROF_MAX_THREADS];
 
  165extern _prof_event_t                _prof_event_types[PROF_MAX_THREADS][PROF_MAX_EVENTS];
 
  166extern _prof_time_t                 _prof_event_times[PROF_MAX_THREADS][PROF_MAX_EVENTS];
 
  167extern unsigned int                 _prof_cuda_next_event[PROF_MAX_THREADS];
 
  168extern _prof_time_t                 _prof_cuda_start_time[PROF_MAX_THREADS];\
 
  175    struct _eventList *next;
 
  177typedef struct _eventList _eventList_s;
 
  178extern _eventList* _prof_cuda_event_list[PROF_MAX_THREADS];
 
  179extern void* _prof_cuda_ref_event[PROF_MAX_THREADS];
 
  185    double                      _prof_time_mult;\ 
  186    _prof_time_t                _prof_start_time;\ 
  187    unsigned int                _prof_next_event[PROF_MAX_THREADS];\ 
  188    _prof_event_t               _prof_event_types[PROF_MAX_THREADS][PROF_MAX_EVENTS];\ 
  189    _prof_time_t                _prof_event_times[PROF_MAX_THREADS][PROF_MAX_EVENTS];\ 
  190    unsigned int                _prof_cuda_next_event[PROF_MAX_THREADS];\ 
  191    _prof_time_t                _prof_cuda_start_time[PROF_MAX_THREADS];\ 
  192    _eventList_s*               _prof_cuda_event_list[PROF_MAX_THREADS];\ 
  193    void *                      _prof_cuda_ref_event[PROF_MAX_THREADS]; \ 
  /*
   * PROF_ALLOC_ARCH, pthread-key variant: defines the pthread key under which
   * PROF_SET_THREAD / PROF_GET_THREAD store and look up the per-thread id.
   * NOTE(review): the #if selecting this variant is not visible in this
   * excerpt -- presumably the MACOSX build; confirm.
   */
  196#define PROF_ALLOC_ARCH \ 
  197    pthread_key_t               _prof_thread_key; 
  /*
   * PROF_ALLOC_ARCH, thread-local variant: defines two __thread variables --
   * the profiling id of the current thread (written by PROF_SET_THREAD, read by
   * PROF_GET_THREAD) and the timespec that PROF_GET_TIME passes to
   * clock_gettime().
   * NOTE(review): the #if selecting this variant is not visible in this
   * excerpt -- presumably the LINUX build; confirm.
   */
  199#define PROF_ALLOC_ARCH \ 
  200    __thread unsigned int       _prof_thread_id;\ 
  201    __thread struct timespec    _prof_timespec; 
  209    memset(_prof_next_event,0,sizeof(unsigned int)*PROF_MAX_THREADS);\ 
  210    memset(_prof_event_types,0,sizeof(_prof_event_t)*PROF_MAX_THREADS*PROF_MAX_EVENTS);\ 
  211    memset(_prof_event_times,0,sizeof(_prof_time_t)*PROF_MAX_THREADS*PROF_MAX_EVENTS);\ 
  212    memset(_prof_cuda_next_event,0,sizeof(unsigned int)*PROF_MAX_THREADS);\ 
  213    memset(_prof_cuda_start_time,0,sizeof(_prof_time_t)*PROF_MAX_THREADS);\ 
  214    memset(_prof_cuda_event_list,0,sizeof(_eventList_s*)*PROF_MAX_THREADS);\ 
  215    memset(_prof_cuda_ref_event,0,sizeof(void *)*PROF_MAX_THREADS); \ 
  /*
   * PROF_INIT_ARCH, Mach variant: platform-specific part of profiler start-up.
   *   - Queries the Mach timebase and stores numer/denom in _prof_time_mult,
   *     the factor PROF_GET_TIME applies to mach_absolute_time() deltas.
   *   - Records the current mach_absolute_time() as the profiling epoch.
   *   - Creates the pthread key that holds each thread's profiling id. If key
   *     creation fails the key is set to 0, which PROF_SET_THREAD and
   *     PROF_GET_THREAD treat as "no key available".
   * The key is created with free() as its destructor: PROF_SET_THREAD stores a
   * malloc()'d unsigned int under the key, and without a destructor that
   * allocation is leaked every time a profiled thread exits.
   * NOTE(review): assumes nothing else releases the per-thread value -- no
   * such code is visible in this excerpt; confirm.
   * Expands to a declaration plus statements, so it must be used where a
   * declaration is allowed.
   */
  #define PROF_INIT_ARCH \
      mach_timebase_info_data_t _prof_info;\
      mach_timebase_info(&_prof_info);\
      _prof_time_mult=((double)_prof_info.numer)/((double)_prof_info.denom);\
      _prof_start_time=mach_absolute_time();\
      if (pthread_key_create(&_prof_thread_key, free)) _prof_thread_key=0;
  /*
   * PROF_INIT_ARCH, clock_gettime variant: records the profiling epoch.
   * PROF_GET_TIME evaluates to "current clock reading minus _prof_start_time",
   * so the epoch is reset to 0 before it is sampled. That makes the stored
   * value the raw clock reading regardless of what _prof_start_time held
   * before (for example when initialisation runs a second time), matching the
   * Mach variant, which stores an absolute mach_absolute_time() reading.
   * If clock_gettime() fails, PROF_GET_TIME yields 0 and the epoch stays 0.
   */
  #define PROF_INIT_ARCH \
      _prof_start_time=0;\
      _prof_start_time=PROF_GET_TIME;
  231#define PROF_FORK_INIT \ 
  233    memset(_prof_next_event,0,sizeof(unsigned int)*PROF_MAX_THREADS);\ 
  234    memset(_prof_event_types,0,sizeof(_prof_event_t)*PROF_MAX_THREADS*PROF_MAX_EVENTS);\ 
  235    memset(_prof_event_times,0,sizeof(_prof_time_t)*PROF_MAX_THREADS*PROF_MAX_EVENTS);\ 
   /*
    * PROF_GET_TIME: time elapsed since _prof_start_time, as a _prof_time_t.
    * Two alternative definitions follow; the #if/#else that selects between
    * them is not visible in this excerpt.
    *   - Mach variant: multiplies the mach_absolute_time() delta by
    *     _prof_time_mult (set to timebase numer/denom by PROF_INIT_ARCH).
    *     NOTE(review): presumably this yields nanoseconds -- confirm.
    *   - clock_gettime variant: reads CLOCK_MONOTONIC into _prof_timespec and
    *     evaluates to tv_sec*1000000000 + tv_nsec - _prof_start_time, i.e.
    *     nanoseconds; evaluates to 0 when clock_gettime() fails.
    */
   241#define PROF_GET_TIME ((_prof_time_t)(((double)(mach_absolute_time()-_prof_start_time))*_prof_time_mult)) 
   243#define PROF_GET_TIME ((clock_gettime(CLOCK_MONOTONIC, &_prof_timespec)==0)?((_prof_time_t)((((_prof_time_t)_prof_timespec.tv_sec)*1000000000ULL)+_prof_timespec.tv_nsec-_prof_start_time)):((_prof_time_t)0ULL)) 
  249#define PROF_SET_THREAD(thread) \ 
  251    if (_prof_thread_key!=0){\ 
  252        void* _prof_thread_id=NULL;\ 
  253        if ((_prof_thread_id=pthread_getspecific(_prof_thread_key)) == NULL){\ 
  254            if((_prof_thread_id=malloc(sizeof(unsigned int)))!=NULL){\ 
  255                pthread_setspecific(_prof_thread_key, _prof_thread_id);\ 
  258        if(_prof_thread_id!=NULL){\ 
  259            *((unsigned int *)_prof_thread_id)=thread;\ 
   /*
    * PROF_SET_THREAD, thread-local variant: assigns `thread` to the __thread
    * variable _prof_thread_id, which PROF_GET_THREAD later reads back.
    * The expansion already ends in a semicolon.
    */
   263#define PROF_SET_THREAD(thread) _prof_thread_id=thread; 
   /*
    * PROF_GET_THREAD: profiling id of the calling thread, as an expression.
    * Two alternative definitions follow; the #if/#else that selects between
    * them is not visible in this excerpt.
    *   - pthread-key variant: dereferences the unsigned int stored under
    *     _prof_thread_key. Evaluates to 0 when the key is 0 or when no value
    *     has been set for the calling thread. pthread_getspecific() is called
    *     twice per evaluation (once to test, once to read).
    *   - thread-local variant: evaluates to _prof_thread_id.
    */
   269#define PROF_GET_THREAD ((_prof_thread_key != 0 && pthread_getspecific(_prof_thread_key) != NULL)?(*((unsigned int *)pthread_getspecific(_prof_thread_key))):(0)) 
   271#define PROF_GET_THREAD (_prof_thread_id) 
  276#define PROF_TEVENT_AT_TIME(thread,event,event_time) \ 
  277    {if (thread < PROF_MAX_THREADS && _prof_next_event[thread] < PROF_MAX_EVENTS){\ 
  278        _prof_event_types[thread][_prof_next_event[thread]]=_prof_event_to_net(event);\ 
  279        _prof_event_times[thread][_prof_next_event[thread]]=_prof_time_to_net(event_time);\ 
  280        _prof_next_event[thread]++;\ 
   /*
    * Point-event recorders, both implemented on top of PROF_TEVENT_AT_TIME with
    * PROF_GET_TIME as the timestamp:
    *   PROF_TEVENT(thread, event) records `event` for an explicit thread index.
    *   PROF_EVENT(event) records `event` for the calling thread, using
    *                     PROF_GET_THREAD as the thread index.
    */
   282#define PROF_TEVENT(thread, event) PROF_TEVENT_AT_TIME(thread,event,PROF_GET_TIME) 
   283#define PROF_EVENT(event) PROF_TEVENT_AT_TIME(PROF_GET_THREAD,event,PROF_GET_TIME) 
  287#define PROF_TBEGIN(thread,event) PROF_TEVENT(thread,event+10000) 
  288#define PROF_BEGIN(event) PROF_EVENT(event+10000) 
  292#define PROF_TEND(thread,event) PROF_TEVENT(thread,event+20000) 
  293#define PROF_END(event) PROF_EVENT(event+20000) 
   /*
    * CUDA event profiling disabled: when PROF_CUDA_ENABLE is undefined or
    * defined as 0, every PROF_CUDA_* macro below has an empty replacement
    * list, so CUDA profiling call sites expand to nothing.
    */
   297#if !defined(PROF_CUDA_ENABLE) || PROF_CUDA_ENABLE == 0 
   299#define PROF_CUDA_TSTART(thread,stream) 
   300#define PROF_CUDA_START(stream) 
   301#define PROF_CUDA_TEVENT(thread,event,stream) 
   302#define PROF_CUDA_EVENT(event,stream) 
   303#define PROF_CUDA_TBEGIN(thread,event,stream) 
   304#define PROF_CUDA_BEGIN(event,stream) 
   305#define PROF_CUDA_TEND(thread,event,stream) 
   306#define PROF_CUDA_END(event,stream) 
   307#define PROF_CUDA_TFINISH(thread,stream) 
   308#define PROF_CUDA_FINISH(stream) 
  312#define PROF_CUDA_TINIT(thread, event) \ 
  313    {if(thread < PROF_MAX_THREADS) { \ 
  314        _eventList_s *e=new _eventList_s; e->evt=event; cudaEventCreate((cudaEvent_t*)&e->begin); cudaEventCreate((cudaEvent_t*)&e->end); \ 
  315        e->next=_prof_cuda_event_list[thread]; _prof_cuda_event_list[thread]=e; \ 
   /* Calling-thread form of PROF_CUDA_TINIT: passes PROF_GET_THREAD as the thread index. */
   317#define PROF_CUDA_INIT(event) PROF_CUDA_TINIT(PROF_GET_THREAD,event) 
  319#define PROF_CUDA_TSTART(thread,stream) \ 
  320    {if (thread < PROF_MAX_THREADS) \ 
  322            if(!_prof_cuda_ref_event[thread]) \ 
  323                cudaEventCreate((cudaEvent_t*)&_prof_cuda_ref_event[thread]); \ 
  324            if (cudaEventRecord((cudaEvent_t)_prof_cuda_ref_event[thread], stream)==cudaSuccess){\ 
  325                _prof_cuda_start_time[thread]=PROF_GET_TIME;\ 
   /* Calling-thread form of PROF_CUDA_TSTART: passes PROF_GET_THREAD as the thread index. */
   329#define PROF_CUDA_START(stream) PROF_CUDA_TSTART(PROF_GET_THREAD,stream) 
  331#define PROF_CUDA_TBEGIN(thread, event, stream) \ 
  332    { if(thread < PROF_MAX_THREADS) { \ 
  333            _eventList_s *e=_prof_cuda_event_list[thread]; \ 
  336                if(e->evt==event){cudaEventRecord((cudaEvent_t)e->begin, stream); break;} \ 
  341                PROF_CUDA_TINIT(thread, event); \ 
  342                e=_prof_cuda_event_list[thread]; \ 
  343                cudaEventRecord((cudaEvent_t)e->begin, stream); \ 
   /* Calling-thread form of PROF_CUDA_TBEGIN: passes PROF_GET_THREAD as the thread index. */
   348#define PROF_CUDA_BEGIN(event,stream) PROF_CUDA_TBEGIN(PROF_GET_THREAD,event,stream) 
  350#define PROF_CUDA_TEND(thread, event, stream) \ 
  352        if(thread < PROF_MAX_THREADS) \ 
  354            _eventList_s *e=_prof_cuda_event_list[thread]; \ 
  357                if(e->evt==event){cudaEventRecord((cudaEvent_t)e->end, stream); break;} \ 
   /* Calling-thread form of PROF_CUDA_TEND: passes PROF_GET_THREAD as the thread index. */
   362#define PROF_CUDA_END(event,stream) PROF_CUDA_TEND(PROF_GET_THREAD,event,stream) 
  364#define PROF_CUDA_TFINISH(thread,stream) \ 
  365    {if (thread < PROF_MAX_THREADS){\ 
  366        cudaEvent_t _prof_cuda_start_handle=(cudaEvent_t)_prof_cuda_ref_event[thread]; \ 
  367        _eventList_s *e=_prof_cuda_event_list[thread]; \ 
  370            float _prof_cuda_elaped_ms=0.0f;\ 
  371            if (cudaEventElapsedTime(&_prof_cuda_elaped_ms,(cudaEvent_t)_prof_cuda_start_handle,(cudaEvent_t)e->begin)==cudaSuccess){\ 
  372                PROF_TEVENT_AT_TIME(thread, (e->evt+10000), (_prof_cuda_start_time[thread]+(_prof_time_t)(1000000.0*((double)_prof_cuda_elaped_ms)))); \ 
  374            if (cudaEventElapsedTime(&_prof_cuda_elaped_ms,(cudaEvent_t)_prof_cuda_start_handle,(cudaEvent_t)e->end)==cudaSuccess){\ 
  375                PROF_TEVENT_AT_TIME(thread, (e->evt+20000), (_prof_cuda_start_time[thread]+(_prof_time_t)(1000000.0*((double)_prof_cuda_elaped_ms)))); \ 
   /* Calling-thread form of PROF_CUDA_TFINISH: passes PROF_GET_THREAD as the thread index. */
   381#define PROF_CUDA_FINISH(stream) PROF_CUDA_TFINISH(PROF_GET_THREAD,stream) 
  389    int _prof_buf_max=strlen(PROF_MAKE_STR(PROF_OUT_FILE))+256;\ 
  390    char* _prof_buf = (char*)malloc(_prof_buf_max+1);\ 
  391    if (_prof_buf!=NULL){\ 
  392        snprintf(_prof_buf,_prof_buf_max,"%s.%d",PROF_MAKE_STR(PROF_OUT_FILE),getpid());\ 
  393        FILE * _prof_fp=fopen(_prof_buf,"w");\ 
  394        if (_prof_fp!=NULL){\ 
  395            char _prof_magic[]="SBPT";\ 
  396            fwrite(_prof_magic, 1, strlen(_prof_magic), _prof_fp);\ 
  397            unsigned int _prof_tmp=htonl(1);\ 
  398            fwrite(&_prof_tmp, sizeof(_prof_tmp), 1, _prof_fp);\ 
  399            _prof_tmp=htonl(PROF_MAX_THREADS);\ 
  400            fwrite(&_prof_tmp, sizeof(_prof_tmp), 1, _prof_fp);\ 
  401            _prof_tmp=htonl(PROF_MAX_EVENTS);\ 
  402            fwrite(&_prof_tmp, sizeof(_prof_tmp), 1, _prof_fp);\ 
  403            fwrite(_prof_event_types, sizeof(_prof_event_t), PROF_MAX_THREADS*PROF_MAX_EVENTS, _prof_fp);\ 
  404            fwrite(_prof_event_times, sizeof(_prof_time_t), PROF_MAX_THREADS*PROF_MAX_EVENTS, _prof_fp);\ 
  /* In this configuration PROF_FORK_WRITE is simply an alias for PROF_WRITE. */
  410#define PROF_FORK_WRITE PROF_WRITE