C++11 thread_local Performance And Use For Legacy Code Upgrade

Nerds Central Recording - Fun To Make - Hopefully Fun To Listen To And Watch

Copyright Dr Alexander J Turner all rights reserved
C++11 introduces a new storage duration, thread_local. It is not fully implemented in VC++11 yet, but there is enough of a workaround to make it usable, and it shows promise as a very powerful tool for upgrading legacy single-threaded code.
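The workaround is nothing more than a macro that maps the new keyword onto Microsoft's existing __declspec(thread) extension, exactly as the listing below does. As a rough stand-alone sketch (the _MSC_VER guard and the per_thread_counter variable are my illustration, not part of the original listing):

// Sketch of the workaround: fall back to the VC++ extension where the
// C++11 keyword is not yet implemented (the _MSC_VER guard is an assumption).
#ifdef _MSC_VER
  #define thread_local __declspec( thread )
#endif

// Each thread now sees its own independent copy of this variable.
static thread_local int per_thread_counter = 0;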

This post is in conjunction with a video:


Best viewed in 1080p.

Here is the video description:
"I work through in Visual Studio 11 beta how to use thread_local to make single threaded C or C++ legacy code into multi-threaded code without having to rework the code in any significant way. I then go on to show the performance benefits and bottlenecks of this approach. This video also shows some of the new and powerful features of C++11 like chrono and lambdas. Please check out "

Here is the code:


#include "stdafx.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <thread>
#include <chrono>
#include <functional>
#include <iostream>
#include <future>
#include <vector>
#include <algorithm>
using namespace std;

namespace nerds_central_thread_local{
  #define SEED 35791246
  #define toIterate 10000000
  /* Mimic C++11 thread_local storage */
  #define thread_local __declspec( thread )
  static thread_local size_t inc;


  typedef std::chrono::high_resolution_clock Clock;
  typedef std::chrono::duration<double, std::milli> Milliseconds;

  /* Run toRun count times, printing each run's wall-clock time in milliseconds. */
  void timeRun(long count,std::function<void()> toRun){
    for(;count>0;--count){
      auto t1 = Clock::now();
      toRun();
      auto t2 = Clock::now();
      Milliseconds elapsed = t2 - t1;
      std::cout << elapsed.count() << '\n';
    }
  }

  /* Legacy-style Monte Carlo estimate of pi: all of the working state is
     held in plain static variables shared by every caller. */
  double compute()
  {
     static int niter=toIterate;
     static double x,y;
     static int i,count=0; /* # of points in the 1st quadrant of unit circle */
     static double z;
     static double pi;

     /* initialize random numbers */
     srand(SEED);
     count=0;
     for ( i=0; i<niter; i++) {
      x = (double)rand()/RAND_MAX;
      y = (double)rand()/RAND_MAX;
      z = x*x+y*y;
      if (z<=1) count++;
     }
     pi=(double)count/niter*4;
     //("# of trials= %d , estimate of pi is %g \n",niter,pi);
     return pi;
  }

  /* The same Monte Carlo estimate, but the statics are also thread_local so
     each thread gets its own private copy of the working state. */
  double compute_thread_local()
  {
     static thread_local int niter=toIterate;
     static thread_local double x,y;
     static thread_local int i,count=0; /* # of points in the 1st quadrant of unit circle */
     static thread_local double z;
     static thread_local double pi;

     /* initialize random numbers */
     srand(SEED);
     count=0;
     for ( i=0; i<niter; i++) {
      x = (double)rand()/RAND_MAX;
      y = (double)rand()/RAND_MAX;
      z = x*x+y*y;
      if (z<=1) count++;
     }
     pi=(double)count/niter*4;
     //printf("# of trials= %d , estimate of pi is %g \n",niter,pi);
     return pi;
  }
}

#define nThreads 32
int _tmain(int argc, _TCHAR* argv[])
{
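  // Time five single-threaded runs of each version; the inner timeRun prints
  // each run and the outer call prints the total for all five.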
  nerds_central_thread_local::timeRun(1,[]{
    puts("traditional run");
    nerds_central_thread_local::timeRun(5,[]{
      nerds_central_thread_local::compute();
    });
  });
  nerds_central_thread_local::timeRun(1,[]{
    puts("thread_local run");
    nerds_central_thread_local::timeRun(5,[]{
      nerds_central_thread_local::compute_thread_local();
    });
  });
  
  // Is it really thread local?
  nerds_central_thread_local::timeRun(1,[]{
    vector<future<double>> futures;
    for(long tCount=0;tCount<nThreads;++tCount){
      futures.insert(futures.end(),std::async(std::launch::async,nerds_central_thread_local::compute_thread_local));
    }
    for_each(futures.begin(),futures.end(),
      [](future<double> &f){
        printf("Got async %g \r\n",f.get()); 
    });
  });
  // And the plain static version: the threads now race on the shared statics.
  nerds_central_thread_local::timeRun(1,[]{
    vector<future<double>> futures;
    for(long tCount=0;tCount<nThreads;++tCount){
      futures.insert(futures.end(),std::async(std::launch::async,nerds_central_thread_local::compute));
    }
    for_each(futures.begin(),futures.end(),
      [](future<double> &f){
        printf("Got sync %g \r\n",f.get()); 
    });
  });

  return 0;
} 


Here is the output:
traditional run
435.025
448.026
449.026
443.025
459.026
2238.13
thread_local run
446.026
453.026
444.025
457.026
445.026
2251.13
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
Got async 3.1417
6661.38
Got sync 3.36214
Got sync 3.36214
Got sync 3.45238
Got sync 3.36214
Got sync 3.45238
Got sync 3.45238
Got sync 3.45238
Got sync 3.33424
Got sync 3.33424
Got sync 3.33424
Got sync 3.33424
Got sync 3.42925
Got sync 3.42925
Got sync 3.42925
Got sync 3.42925
Got sync 3.59499
Got sync 0.0558296
Got sync 3.46389
Got sync 3.46389
Got sync 3.59499
Got sync 3.59499
Got sync 0.0558296
Got sync 0.0558292
Got sync 0.0558292
Got sync 3.65116
Got sync 3.44473
Got sync 3.44473
Got sync 3.44473
Got sync 3.14012
Got sync 3.65116
Got sync 3.65116
Got sync 3.14012
3959.23