Review of Processes
To understand threads, it's important to review what a process is.Processes
#include <stdio.h> /* printf */
#include <unistd.h> /* fork, getpid */
#include <stdlib.h> /* exit */
#include <sys/wait.h> /* wait */
int main(void)
{
int pid = fork();
if (pid == 0) /* child */
{
printf(" child pid: %i\n", getpid());
/* do child stuff */
exit(0);
}
else if (pid > 0) /* parent */
{
printf("parent pid: %i\n", getpid());
/* do parent stuff */
wait(NULL);
}
else
{
/* fork failed */
}
return 0;
}
_______________________________________________
Output:
parent pid: 32491
child pid: 32492
Threads Overview
Types of threads
With a single CPU/core:
With multiple CPUs/cores
No parallelism pseudocode Pseudo-parallelism ![]()
// ← Pseudocode for left diagram int main() { Function1(); /* Blocks until done */ Function2(); /* Blocks until done */ Function3(); /* Blocks until done */ Function4(); /* Blocks until done */ } // Pseudocode for right diagram → int main() { while (not done) { Function1(); /* Run for 5 ms */ Function2(); /* Run for 5 ms */ Function3(); /* Run for 5 ms */ Function4(); /* Run for 5 ms */ } }![]()
Pseudo-parallelism (detail) ![]()
Parallelism ![]()
Thread States// Pseudocode for above diagram ↑ int main() { Function1(); /* Returns immediately and runs to completion */ Function2(); /* Returns immediately and runs to completion */ Function3(); /* Returns immediately and runs to completion */ Function4(); /* Returns immediately and runs to completion */ /* wait for the threads to complete here... */ }
There are several states in which a thread can be, and they are very similar to process states. They are mutually-exclusive, so a thread can only be in exactly one of these states at any given time:
Thread Libraries
POSIX Pthreads
This program simply creates a thread which prints the word Hello! to the screen and then exits.
/* To compile under Linux, use -pthread or -lpthread option */
#include <stdio.h> /* printf */
#include <pthread.h> /* thread create/join/exit */
void *hello(void *p)
{
printf("Hello!\n");
return NULL;
}
int main(void)
{
pthread_t tid;
pthread_create(&tid, NULL, hello, NULL);
pthread_join(tid, NULL);
return 0;
}
int pthread_create(pthread_t *thread, pthread_attr_t *attr, void * (*start_routine)(void *), void *arg);
int pthread_join(pthread_t thread, void **retval);
void pthread_exit(void *retval);
This program uses a global array of NUL-terminated strings (character pointers). The main function creates several threads. Each thread runs the same function palindrome, which simply chooses a number of random palindromes (from the global array) to print to the screen. There is only one process, but many threads.
#include <stdio.h> /* printf */
#include <unistd.h> /* sleep, getpid */
#include <pthread.h> /* thread create/join/exit */
char const *table[]
= { "A man, a plan, a canal: Panama!",
"No lemons, no melon.",
"Step on no pets.",
"Was it a rat I saw?",
"Dog as a devil deified lived as a god.",
"Able was I ere I saw Elba.",
"Yawn! Madonna fan? No damn way!",
"Go hang a salami. I'm a lasagna hog!"
};
const int table_size = sizeof(table) / sizeof(*table);
void *palindrome(void *p) /* p is unused */
{
pthread_t tid = pthread_self();
int max = 5;
int i, pid;
pid = getpid(); /* all threads have the same pid */
for (i = 0; i < max; ++i)
{
int index = rand() % table_size;
printf("[p:%i][t:%u]: %s\n", pid, (unsigned)tid / 1000000, table[index]);
}
return NULL;
}
int main(void)
{
#define COUNT 8
int i;
pthread_t thread_id[COUNT];
srand(0);
printf("Creating threads...\n");
for (i = 0; i < COUNT; ++i)
pthread_create(&thread_id[i], NULL, palindrome, NULL);
printf("Done creating threads.\n");
printf("Waiting on threads...\n");
for (i = 0; i < COUNT; ++i)
{
printf("Joining thread %i\n", i);
pthread_join(thread_id[i], NULL);
}
printf("Done waiting on threads.\n");
return 0;
}
Output from above
This program is similar to the one above but instead of using global data, the data is passed to each thread. The data is a user-defined struct that contains a pointer to the palindrome strings, the size of the table (array), and a count of how many strings to print. Although each thread receives its own copy of the pointer to the strings, they are all reading from the same array. This is completely safe. As before, there is only one process, but many threads.
#include <stdio.h> /* printf */
#include <unistd.h> /* sleep, getpid */
#include <stdlib.h> /* rand, srand */
#include <pthread.h> /* thread create/join/exit */
/* User-defined data for threads */
typedef struct
{
const char **table;
int table_size;
int to_do_count;
}data_struct; /* an unimaginative name for our structure ... */
void *palindrome(void *data)
{
data_struct *ds = (data_struct *)data; /* typical use when providing arguments */
pthread_t tid = pthread_self(); /* get this thread's tid */
int pid = getpid(); /* all threads have the same pid */
int i;
for (i = 0; i < ds->to_do_count; ++i)
{
int index = rand() % ds->table_size;
printf("[p:%i][t:%u]: %s\n", pid, (unsigned)tid / 1000000, ds->table[index]);
}
return NULL;
}
int main(void)
{
#define COUNT 8
int i;
const char *table[]
= { "A man, a plan, a canal: Panama!",
"No lemons, no melon.",
"Step on no pets.",
"Was it a rat I saw?",
"Dog as a devil deified lived as a god.",
"Able was I ere I saw Elba.",
"Yawn! Madonna fan? No damn way!",
"Go hang a salami. I'm a lasagna hog!"
};
const int table_size = sizeof(table) / sizeof(*table);
srand(0);
data_struct ds[COUNT];
pthread_t thread_id[COUNT];
printf("Creating threads...\n");
for (i = 0; i < COUNT; ++i)
{
ds[i].table = table;
ds[i].table_size = table_size;
ds[i].to_do_count = 5; /*1 + rand() % 10;*/
pthread_create(&thread_id[i], NULL, palindrome, &ds[i]);
}
printf("Done creating threads.\n");
printf("Waiting on threads...\n");
for (i = 0; i < COUNT; ++i)
{
printf("Joining thread %i\n", i);
pthread_join(thread_id[i], NULL);
}
printf("Done waiting on threads.\n");
return 0;
}
Output: (Like before, each run will produce a different ordering.)
Creating threads... [p:20030][t:791]: Go hang a salami. I'm a lasagna hog! [p:20030][t:791]: Was it a rat I saw? [p:20030][t:791]: No lemons, no melon. [p:20030][t:791]: Step on no pets. [p:20030][t:766]: Go hang a salami. I'm a lasagna hog! [p:20030][t:783]: Yawn! Madonna fan? No damn way! [p:20030][t:783]: Able was I ere I saw Elba. [p:20030][t:783]: Step on no pets. [p:20030][t:783]: Step on no pets. [p:20030][t:783]: Was it a rat I saw? [p:20030][t:741]: Was it a rat I saw? [p:20030][t:758]: No lemons, no melon. [p:20030][t:758]: Yawn! Madonna fan? No damn way! [p:20030][t:758]: Step on no pets. [p:20030][t:758]: Dog as a devil deified lived as a god. [p:20030][t:758]: A man, a plan, a canal: Panama! Done creating threads. [p:20030][t:733]: A man, a plan, a canal: Panama! [p:20030][t:733]: Go hang a salami. I'm a lasagna hog! [p:20030][t:766]: Dog as a devil deified lived as a god. [p:20030][t:766]: Yawn! Madonna fan? No damn way! [p:20030][t:766]: Step on no pets. [p:20030][t:766]: Yawn! Madonna fan? No damn way! [p:20030][t:733]: Able was I ere I saw Elba. [p:20030][t:733]: Was it a rat I saw? [p:20030][t:733]: Was it a rat I saw? [p:20030][t:741]: Was it a rat I saw? [p:20030][t:741]: Go hang a salami. I'm a lasagna hog! [p:20030][t:791]: Dog as a devil deified lived as a god. [p:20030][t:749]: Was it a rat I saw? [p:20030][t:749]: No lemons, no melon. [p:20030][t:749]: Step on no pets. [p:20030][t:749]: Yawn! Madonna fan? No damn way! [p:20030][t:749]: Step on no pets. Waiting on threads... Joining thread 0 [p:20030][t:775]: No lemons, no melon. [p:20030][t:775]: Able was I ere I saw Elba. [p:20030][t:775]: No lemons, no melon. [p:20030][t:775]: A man, a plan, a canal: Panama! [p:20030][t:775]: Was it a rat I saw? Joining thread 1 Joining thread 2 Joining thread 3 [p:20030][t:741]: Go hang a salami. I'm a lasagna hog! [p:20030][t:741]: Step on no pets. Joining thread 4 Joining thread 5 Joining thread 6 Joining thread 7 Done waiting on threads.Notes:
A More Real-World™ Example
The previous examples were a good introduction to how threads work, but they were contrived. There was little to be gained by having a seperate thread print out those strings at the same time to the screen. In fact, it may have been worse using threads because the output appeared chaotic. Let's do something that actually makes sense.
In this example, we have 3 different ways to calculate the value of pi: (pi.c) Briefly, the way these methods work is to iteratively calculate more and more precision for pi. The more iterations that are done, the closer approximation to pi we get. For reference, we are going to do 200,000,000 iterations with each function, giving us several digits of precision. Also for reference, the time it takes for each function to calculate pi on a few of reference machines (olga) is: (No optimizations were enabled in the compiler.)Remember, the goal of multi-threading is to keep the CPU busy. We never want it to be idle if there is work to be done.
olga (4 cores) rebecca (4 cores) VM (4 cores)
Method Time Circle method 2.2 seconds Leibniz method 2.4 seconds atan method 1.5 seconds
Method Time Circle method 3.4 seconds Leibniz method 1.9 seconds atan method 1.2 seconds
Method Time Circle method 2.7 seconds Leibniz method 1.4 seconds atan method 0.5 seconds
To give you an idea of how these function work, this is what the atan method looks like:
The details are not important. What is important is that this function is going to take a while to perform 200,000,000 iterations. The other two functions are similar in that they are going to loop 200,000,000 times.double atan_pi(int rectangles) { double width = 1.0 / rectangles; double length; double area = 0.0; int i; double midpoint = width / 2; for (i = 0; i < rectangles; i++) { double area_of_rectangle; length = 4.0 / (1 + midpoint * midpoint); area_of_rectangle = length * width; area += area_of_rectangle; midpoint += width; } return area; }
Before we learned about processes, threads, and multi-core CPUs, we would have programmed it like this: (pi-nothread.c)
/* To compile under Linux, use -lm linker option */
#include <stdio.h> /* printf */
#include <stdlib.h> /* atoi */
double circle_pi(int rectangles); /* Calculates PI using a quarter circle */
double leibniz_pi(int iterations); /* Calculates PI using a series */
double atan_pi(int rectangles); /* Calculates PI using a curve */
int main(int argc, char **argv)
{
int iterations = 1000 * 1000 * 100;
int method = 0;
double pi_circle, pi_leibniz, pi_atan;
if (argc > 1)
method = atoi(argv[1]);
if (argc > 2)
iterations = 1000 * 1000 * atoi(argv[2]);
switch (method)
{
case 1:
pi_circle = circle_pi(iterations);
printf("Iterations: %10i\n", iterations);
printf(" circle:%20.16f\n", pi_circle);
break;
case 2:
pi_leibniz = leibniz_pi(iterations);
printf("Iterations: %10i\n", iterations);
printf("leibniz:%20.16f\n", pi_leibniz);
break;
case 3:
pi_atan = atan_pi(iterations);
printf("Iterations: %10i\n", iterations);
printf(" atan:%20.16f\n", pi_atan);
break;
default:
pi_circle = circle_pi(iterations);
pi_leibniz = leibniz_pi(iterations);
pi_atan = atan_pi(iterations);
printf("Iterations: %10i\n", iterations);
printf(" circle:%20.16f\n", pi_circle);
printf("leibniz:%20.16f\n", pi_leibniz);
printf(" atan:%20.16f\n", pi_atan);
break;
}
return 0;
}
Then, run from the command line and timing it:
the output:time ./pi-nothread 0 200
Reminder of the times for each function (olga):Iterations: 200000000 circle: 3.1415926568498080 leibniz: 3.1415926485894077 atan: 3.1415926536631549 real 0m6.168s user 0m6.150s sys 0m0.010s
As expected, since we ran each function serially (one after the other), the total time is the sum of the times of each function. (The time to run the program without calling any pi function was 0.001 s.)
Method Time Circle method 2.2 seconds Leibniz method 2.4 seconds atan method 1.5 seconds
On a single-core CPU, that's the best we can do. Even putting each function in its own process or own thread would likely make the timings worse. Why?
BUT, on a multi-core system, we can certainly improve things. Let's run each function in its own process and hope that the operating system will give each process its own core to run on simultaneously. Now we program it like this: (pi-mpsh.c)
/* To compile under Linux, use -lm linker option */
#include <stdio.h> /* printf */
#include <stdlib.h> /* exit, atoi */
#include <string.h> /* strcpy */
#include <unistd.h> /* sleep, fork */
#include <sys/shm.h> /* shmget, shmat, shmdt, shmctl */
#include <sys/wait.h> /* waitpid */
double circle_pi(int rectangles); /* Calculates PI using a quarter circle */
double leibniz_pi(int iterations); /* Calculates PI using a series */
double atan_pi(int rectangles); /* Calculates PI using a curve */
int main(int argc, char **argv)
{
int iterations = 1000 * 1000 * 100;
int child1, child2, child3;
int shmid;
double *buffer; /* shared buffer */
key_t key = 123; /* arbitrary key */
if (argc > 1)
iterations = 1000 * 1000 * atoi(argv[1]);
/* Buffer will hold 3 doubles; one from each child process */
shmid = shmget(key, 3 * sizeof(double), 0600 | IPC_CREAT);
buffer = (double *) shmat(shmid, NULL, 0);
/* Initialize the buffer */
*buffer = 0.0;
*(buffer + 1) = 0.0;
*(buffer + 2) = 0.0;
if ( (child1 = fork()) == 0 ) /* first child */
{
*buffer = circle_pi(iterations);
shmdt(buffer); /* detach memory from child process */
exit(0);
}
/* parent */
if ( (child2 = fork()) == 0 ) /* second child */
{
*(buffer + 1) = leibniz_pi(iterations);
shmdt(buffer); /* detach memory from second process */
exit(0);
}
/* parent */
if ( (child3 = fork()) == 0 ) /* third child */
{
*(buffer + 2) = atan_pi(iterations);
shmdt(buffer); /* detach memory from third child process */
exit(0);
}
/* parent */
waitpid(child1, NULL, 0);
waitpid(child2, NULL, 0);
waitpid(child3, NULL, 0);
printf("Iterations: %10i\n", iterations);
printf(" circle:%20.16f\n", *buffer);
printf("leibniz:%20.16f\n", *(buffer + 1));
printf(" atan:%20.16f\n", *(buffer + 2));
shmdt(buffer); /* detach memory from parent process */
shmctl(shmid, IPC_RMID, 0); /* delete memory block */
return 0;
}
Then, run from the command line and timing it:
the output:time ./pi-mpsh 200
Reminder:Iterations: 200000000 circle: 3.1415926568498080 leibniz: 3.1415926485894077 atan: 3.1415926536631549 real 0m2.405s user 0m6.140s sys 0m0.000s
The single-process version took 6.15 seconds. This is a clearly significant improvement. Why do you think the time is 2.4 seconds? (It was run on a machine with 4 cores.)
Method Time Circle method 2.2 seconds Leibniz method 2.4 seconds atan method 1.5 seconds
Now, let's run each function in its own thread and see how that goes. We program it like this: (pi-mt.c or pi-mt-int.c)
/* To compile under Linux, use -lm and -lpthread linker options */
#include <stdio.h> /* printf */
#include <stdlib.h> /* exit, atoi */
#include <pthread.h> /* thread create/join/exit */
double circle_pi(int rectangles); /* Find PI using a quarter circle */
double leibniz_pi(int iterations); /* Find PI using a series */
double atan_pi(int rectangles); /* Find PI using a curve */
double pi_circle;
double pi_leibniz;
double pi_atan;
void *th1(void *p)
{
int it = *(int *)p;
pi_circle = circle_pi(it);
return NULL;
}
void *th2(void *p)
{
int it = *(int *)p;
pi_leibniz = leibniz_pi(it);
return NULL;
}
void *th3(void *p)
{
int it = *(int *)p;
pi_atan = atan_pi(it);
return NULL;
}
int main(int argc, char **argv)
{
int iterations = 1000 * 1000 * 100;
pthread_t thread1, thread2, thread3;
if (argc > 1)
iterations = 1000 * 1000 * atoi(argv[1]);
/* Create threads with data passing */
pthread_create(&thread1, NULL, th1, &iterations);
pthread_create(&thread2, NULL, th2, &iterations);
pthread_create(&thread3, NULL, th3, &iterations);
/* Wait on the threads */
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
pthread_join(thread3, NULL);
printf("Iterations: %10i\n", iterations);
printf(" circle:%20.16f\n", pi_circle);
printf("leibniz:%20.16f\n", pi_leibniz);
printf(" atan:%20.16f\n", pi_atan);
return 0;
}
Then, run from the command line and timing it:
the output:time ./pi-mt 200
Another version using arrays for scaling. This example also shows how to return data from one thread to another thread via the thread-function argument.Iterations: 200000000 circle: 3.1415926568498080 leibniz: 3.1415926485894077 atan: 3.1415926536631549 real 0m2.398s user 0m6.130s sys 0m0.000s
Some interesting benchmarks: Various computer specs for testing in this class.
Below are timings for calculating pi using the methods and systems described above. There are two primary factors that determine the overall performance of the calculations. One is the version of the GNU compiler used, and the other is the architecture of the processor, which includes the speed of the CPU itself, as well as the size, speed, and architecture of the caches. We will learn more about the details of cache memory later. Don't forget Mead's informal rule of thumb: Cache is King™
Finally, realize that, although this is not quite a synthetic benchmark, (WhetstoneWhetstone for floating point and DrhystoneDrhystone for integers ) it really doesn't represent all of the tasks and programs a system is going to run during an average day, so you can't just look at this one benchmark and conclude which system is best.
Some real stress testing: (Do try this at home! pi-mtv.c)
Computer
(200,000,000 iterations)Individual
timings (secs)
pi-nothread.cOne process
One thread
pi-nothread.cMultiple processes
One thread each
pi-mpsh.cOne process
Multiple Threads
pi-mt.cMaya (3.4 GHz) 4 cores
Linux 64-bit (3.2.0-23)
gcc 4.6.3 circle: 0.8 Leibniz: 1.7 atan: 0.8 3.3 real 0m3.299s user 0m3.288s sys 0m0.000s real 0m1.717s user 0m3.420s sys 0m0.008s real 0m1.713s user 0m3.680s sys 0m0.000sClara (3.3 GHz) 4 cores
Linux 64-bit (3.2.0-23)
gcc 4.6.3 circle: 1.0 Leibniz: 1.7 atan: 1.1 3.8 real 0m3.730s user 0m3.724s sys 0m0.000s real 0m1.673s user 0m3.724s sys 0m0.000s real 0m1.699s user 0m3.764s sys 0m0.000sServer (3.1 GHz) 4 cores
Linux 64-bit
gcc 4.6.3 circle: 1.7 Leibniz: 1.0 atan: 1.7 4.4 real 0m4.324s user 0m4.304s sys 0m0.000s real 0m1.743s user 0m4.292s sys 0m0.000s real 0m1.704s user 0m4.252s sys 0m0.000sAthena (2.83 GHz) 4 cores
Linux 32-bit (2.6.18-2)
gcc 4.1.2 circle: 2.1 Leibniz: 1.6 atan: 1.6 5.3 real 0m5.200s user 0m5.196s sys 0m0.004s real 0m2.105s user 0m5.252s sys 0m0.000s real 0m2.100s user 0m5.256s sys 0m0.000sLulu (2.7 GHz) 6 cores
Linux 64-bit
gcc 4.6.3 circle: 2.0 Leibniz: 2.1 atan: 1.9 6.0 real 0m6.087s user 0m6.084s sys 0m0.000s real 0m2.224s user 0m6.288s sys 0m0.000s real 0m2.209s user 0m6.244s sys 0m0.000sOlga (2.8 GHz) 4 cores
Linux 64-bit (2.6.32-24)
gcc 4.4.3 circle: 2.2 Leibniz: 2.4 atan: 1.5 6.1 real 0m6.135s user 0m6.130s sys 0m0.000s real 0m2.405s user 0m6.150s sys 0m0.000s real 0m2.408s user 0m6.120s sys 0m0.000sLisa (2.67 GHz) 2 core
Linux 64-bit (2.6.38)
gcc 4.5.2 circle: 2.2 Leibniz: 2.4 atan: 1.5 6.1 real 0m6.131s user 0m6.120s sys 0m0.010s real 0m3.007s user 0m8.360s sys 0m0.000s real 0m2.984s user 0m8.400s sys 0m0.000sGina (3.0 GHz) 2 cores
Linux 64-bit
gcc 4.8.2 circle: 4.8 Leibniz: 2.2 atan: 1.0 8.0 real 0m7.879s user 0m7.813s sys 0m0.008s real 0m3.011s user 0m4.026s sys 0m0.000s real 0m4.105s user 0m5.550s sys 0m0.021sDigipen (2.4 GHz) 2 cores
Windows 32-bit (XP)
gcc 4.3.2 circle: 6.3 Leibniz: 3.2 atan: 3.2 12.7 real 0m12.516s user 0m12.499s sys 0m0.046s real 0m7.406s user 0m12.341s sys 0m0.015s real 0m5.281s user 0m10.374s sys 0m0.031sJessica (2.8 GHz) 1 core
Windows 32-bit (Server 2003)
gcc 4.3.2 circle: 6.2 Leibniz: 3.4 atan: 3.7 13.3 real 0m6.141s user 0m5.953s sys 0m0.015s real 0m13.547s user 0m23.046s sys 0m0.015s real 0m14.718s user 0m26.093s sys 0m0.015sSabrina (2.4 GHz) 1 core
Linux 32-bit (2.4.20)
gcc 3.3 circle:10.7 Leibniz: 3.9 atan: 4.0 18.6 real 0m18.645s user 0m18.200s sys 0m0.000s real 0m18.610s user 0m18.180s sys 0m0.000s real 0m19.046s user 0m18.590s sys 0m0.000sMaria (1.66 GHz) 2 cores
Linux 32-bit (2.6.31-16)
gcc 4.4.1 circle:17.7 Leibniz:12.9 atan:16.3 46.9 real 0m46.806s user 0m46.759s sys 0m0.048s real 0m27.072s user 0m52.855s sys 0m0.052s real 0m27.306s user 0m52.091s sys 0m0.024s
This program runs many threads (up to 1024, but you can set it higher) and executes the function atan_pi for calculating pi. Under Windows, you can watch the program with Task Manager or Process Explorer and see all of the threads. The command line takes two parameters: The number of threads (default 1) and the number of iterations (times one million) for the function (default 100).
/* To compile under Linux, use -lm and -lpthread linker options */
#include <stdio.h> /* printf */
#include <stdlib.h> /* atoi */
#include <pthread.h> /* thread create/join/exit */
#define MAX_THREADS 1024
double atan_pi(int rectangles); /* Find PI using a curve */
void *th1(void *p)
{
int it = *(int *)p;
atan_pi(it);
return NULL;
}
int main(int argc, char **argv)
{
int iterations = 1000 * 1000 * 100;
int thread_count = 1;
int i;
pthread_t threads[MAX_THREADS];
if (argc > 1)
{
thread_count = atoi(argv[1]);
thread_count = thread_count > MAX_THREADS ? MAX_THREADS : thread_count;
}
if (argc > 2)
iterations = 1000 * 1000 * atoi(argv[2]);
printf(" Threads: %i\n", thread_count);
printf("Iterations: %i\n", iterations);
for (i = 0; i < thread_count; i++)
pthread_create(&threads[i], NULL, th1, &iterations);
for (i = 0; i < thread_count; i++)
pthread_join(threads[i], NULL);
return 0;
}
Another perspective
The previous examples were all fairly trivial to run in multiple threads. The main reason is that each function implements a complete, stand-alone calculation. They can easily be run independently of each other without requiring any communication between the threads. This is ideal for multi-threading: One function, one thread, independent data. It doesn't get any easier than this.
However, what if we only had one function that calculated the value of PI? How would we take advantage of multiple threads? In this case, it's still pretty easy because it is trivial to parallelize the computation. For example, if we want to calculate the value of PI using 1,000,000 iterations, it will take a certain amount of time. But, what if we run two threads in parallel with one thread running half of the iterations and the other thread running the other half? Examples: Using this data:
We could set all elements in array using one loop in one thread:#define SIZE 16 int array[SIZE]; int i;
We could essentially cut the time required in half (less thread overhead) if we used two threads:/* Thread #1 Sets elements 0-15 */ for (i = 0; i < 16; i++) array[i] = i;
Or, better yet, use 4 threads:
Thread #1 Thread #2 /* Thread #1 Sets elements 0-7 */ for (i = 0; i < 8; i++) array[i] = i; /* Thread #2 Sets elements 8-15*/ for (i = 8; i < 16; i++) array[i] = i;
Thread #1 Thread #2 /* Thread #1 Sets elements 0-3 */ for (i = 0; i < 4; i++) array[i] = i; /* Thread #2 Sets elements 4-7 */ for (i = 4; i < 8; i++) array[i] = i;
Or, using 8 or more threads (you get the picture). Yes, with 16 elements in the example above, multiple threads is too much overhead. But, suppose you need to process millions or billions of elements, not uncommon with today's computers. Using threads will have an enormous impact on the processing time.
Thread #3 Thread #4 /* Thread #3 Sets elements 8-11 */ for (i = 8; i < 12; i++) array[i] = i; /* Thread #4 Sets elements 12-15*/ for (i = 12; i < 16; i++) array[i] = i;
Getting back to the example of calculating digits of PI using 1,000,000 iterations.Aside: There are languages that will automagically divide the work into concurrent threads. One example is FORTRAN setting an array of 2 billion integers (gfortran init.f90):
By specifying the concurrent keyword, the compiler will automagically create multiple loops to be executed simultaneously.
Single thread Concurrent threads ! This program tests the concurrency program TestConcurrentLoop ! Type declarations integer :: i, x(2000000000) ! Executable statements do i = 1, 2000000000 x(i) = i * 3; enddo end program TestConcurrentLoop Time to complete: 7.4 seconds ! This program tests the concurrency program TestConcurrentLoop ! Type declarations integer :: i, x(2000000000) ! Executable statements do concurrent (i = 1: 2000000000) x(i) = i * 3; enddo end program TestConcurrentLoop Time to complete: 5.3 seconds
One thread could calculate the value using the range 0 - 500,000 and the second thread can caclulate the value using the range 500,001 - 1,000,000. Then, we just add the two together to get the total value. We could take it further and break the range into 4 intervals:
A note about processor affinityprocessor affinity:
/*
Compile: gcc pi-mtv.c pi-int.c -lm -lpthread
*/
#define _GNU_SOURCE /* required for CPU_ macros */
#include <stdio.h> /* printf */
#include <stdlib.h> /* atoi */
#include <pthread.h> /* thread create/join/exit */
#define MAX_THREADS 1024
double atan_pi(int rectangles); /* Find PI using a curve */
void *th1(void *p)
{
int result = 0;
int it, i;
pthread_t thread = pthread_self(); /* get this thread's ID */
cpu_set_t cpuset; /* affinity mask */
CPU_ZERO(&cpuset); /* clear mask */
CPU_SET(0, &cpuset); /* set only CPU 0 */
/* Enable this to set affinity mask for CPUs 0 to 7 */
#if 0
for (i = 0; i < 8; i++)
CPU_SET(i, &cpuset);
#endif
result = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
if (result != 0)
printf("pthread_setaffinity_np failed\n");
it = *(int *)p;
atan_pi(it);
return NULL;
}
/* main is the same as above */
Win32 Threads
Windows has its own API for threads:
This program simply creates a thread which prints the word Hello! to the screen and then exits.
#include <stdio.h> /* printf */
#include <windows.h> /* Windows stuff */
DWORD WINAPI hello(LPVOID)
{
printf("Hello!\n");
return 0;
}
int main(void)
{
HANDLE thread_handle;
thread_handle = CreateThread(0, /* Default attributes */
0, /* Default stack size */
hello, /* Start function */
0, /* No parameters */
0, /* Creation flags */
0 /* Returned thread ID */
);
/* Wait for the thread (no timeout) */
WaitForSingleObject(thread_handle, INFINITE);
/* Release the thread */
CloseHandle(thread_handle);
return 0;
}
This program uses a global array of strings (character pointers). The main function creates several threads. Each thread runs the same function palindrome, which simply chooses a random number of palindromes (from the global array) to print to the screen. There is only one process, but many threads.
#include <stdio.h> /* printf */
#include <stdlib.h> /* rand, srand */
#include <windows.h> /* Windows stuff */
char const *table[]
= { "A man, a plan, a canal: Panama!",
"No lemons, no melon.",
"Step on no pets.",
"Was it a rat I saw?",
"Dog as a devil deified lived as a god.",
"Able was I ere I saw Elba.",
"Yawn! Madonna fan? No damn way!",
"Go hang a salami. I'm a lasagna hog!"
};
const int table_size = sizeof(table) / sizeof(*table);
DWORD WINAPI palindrome(LPVOID)
{
DWORD tid = GetCurrentThreadId(); /* each thread has unique tid */
DWORD pid = GetCurrentProcessId(); /* all threads have the same pid */
int max = 5;
int i;
for (i = 0; i < max; ++i)
{
int index = rand() % table_size;
printf("[p:%i][t:%u]: %s\n", pid, tid, table[index]);
}
return 0;
}
int main(void)
{
#define COUNT 8
int i;
HANDLE th_handles[COUNT];
srand(0);
printf("Creating threads...\n");
for (i = 0; i < COUNT; ++i)
th_handles[i] = CreateThread(0, /* Default attributes */
0, /* Default stack size */
palindrome, /* Start function */
0, /* No parameters */
0, /* Default flags */
0 /* Returned thread ID */
);
printf("Done creating threads.\n");
printf("Waiting on threads...\n");
WaitForMultipleObjects(COUNT, th_handles, TRUE, INFINITE);
printf("Done waiting on threads.\n");
for (i = 0; i < COUNT; ++i)
CloseHandle(th_handles[i]);
return 0;
}
This program is similar to the one above but instead of using global data, the data is passed to each thread. The data is a user-defined struct that contains a pointer to the palindrome strings, the size of the table (array), and a count of how many strings to print. Although each thread receives its own copy of the pointer to the strings, they are all reading from the same array. This is completely safe. As before, there is only one process, but many threads.
#include <stdio.h> /* printf */
#include <stdlib.h> /* rand, srand */
#include <windows.h> /* Windows stuff */
typedef struct
{
const char **table;
int table_size;
int to_do_count;
}data_struct;
DWORD WINAPI palindrome(LPVOID data)
{
data_struct *ds = (data_struct *)data;
DWORD tid = GetCurrentThreadId();
DWORD pid = GetCurrentProcessId();; /* all threads have the same pid */
int i;
for (i = 0; i < ds->to_do_count; ++i)
{
int index = rand() % ds->table_size;
printf("[p:%i][t:%u]: %s\n", pid, tid, ds->table[index]);
}
return 0;
}
int main(void)
{
#define COUNT 8
int i;
char const *table[]
= { "A man, a plan, a canal: Panama!",
"No lemons, no melon.",
"Step on no pets.",
"Was it a rat I saw?",
"Dog as a devil deified lived as a god.",
"Able was I ere I saw Elba.",
"Yawn! Madonna fan? No damn way!",
"Go hang a salami. I'm a lasagna hog!"
};
const int table_size = sizeof(table) / sizeof(*table);
srand(0);
data_struct ds[COUNT];
HANDLE th_handles[COUNT];
printf("Creating threads...\n");
for (i = 0; i < COUNT; ++i)
{
ds[i].table = table;
ds[i].table_size = table_size;
ds[i].to_do_count = 5; /*1 + rand() % 10;*/
th_handles[i] = CreateThread(0, 0, palindrome, &ds[i], 0, 0);
}
printf("Done creating threads.\n");
printf("Waiting on threads...\n");
WaitForMultipleObjects(COUNT, th_handles, TRUE, INFINITE);
printf("Done waiting on threads.\n");
for (i = 0; i < COUNT; ++i)
CloseHandle(th_handles[i]);
return 0;
}
The threaded stress test using the Win32 API. Use Windows Task Manager or Process Explorer to see the processes and threads. (wpi-mtv.c)
#include <stdio.h> /* printf */ #include <stdlib.h> /* exit, atoi */ #include <windows.h> /* Windows stuff */ #define MAX_THREADS 1024 double atan_pi(int rectangles); /* Find PI using a curve */ DWORD WINAPI th1(LPVOID p) { int it = *(int *)p; atan_pi(it); return 0; } int main(int argc, char **argv) { int iterations = 1000 * 1000 * 100; int thread_count = 1; int i; HANDLE threads[MAX_THREADS]; if (argc > 1) { thread_count = atoi(argv[1]); thread_count = thread_count > MAX_THREADS ? MAX_THREADS : thread_count; } if (argc > 2) { iterations = 1000 * 1000 * atoi(argv[2]); } printf(" Threads: %i\n", thread_count); printf("Iterations: %i\n", iterations); for (i = 0; i < thread_count; i++) threads[i] = CreateThread(0, 0, th1, &iterations, 0, 0); WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE); for (i = 0; i < thread_count; i++) CloseHandle(threads[i]); return 0; }
Multi-threading Problems
So far, everything seems to be working out just fine. However, consider the following two programs:Single-threaded version: (thread-safe.c)
#include <stdio.h> /* printf */
int Count = 0;
void Increment(void)
{
int i;
int loops = 1000;
for (i = 0; i < loops; i++)
Count++;
}
int main(void)
{
#define N 10
int i;
for (i = 0; i < N; i++)
Increment();
printf("Count = %i\n", Count);
return 0;
}
Output:
Multi-threaded version: (thread-unsafe.c)Count = 10000
#include <stdio.h> /* printf */
#include <pthread.h> /* thread create/join/exit */
int Count;
void *Increment(void *p)
{
int i;
int loops = 1000;
for (i = 0; i < loops; i++)
Count++;
return NULL;
}
int main(void)
{
#define N 10
int i;
pthread_t thread_id[N];
for (i = 0; i < N; i++)
pthread_create(thread_id + i, NULL, Increment, NULL);
for (i = 0; i < N; i++)
pthread_join(thread_id[i], NULL);
printf("Count = %i\n", Count);
return 0;
}
Output:
Count = 10000
to this:int loops = 1000;
Results:int loops = 1000 * 100;
Single-threaded Multi-threaded Run #1 Count = 1000000 Count = 591274 Run #2 Count = 1000000 Count = 636767 Run #3 Count = 1000000 Count = 591146 Run #4 Count = 1000000 Count = 611036 Run #5 Count = 1000000 Count = 625463
Output:for val in {1..10}; do ./thread-unsafe; done
Count = 10000 Count = 10000 Count = 10000 Count = 10000 Count = 10000 Count = 9170 Count = 10000 Count = 10000 Count = 8656 Count = 9203
This is the fundamental problem with multi-threaded programs accessing a shared resource. Later, we'll see various ways of dealing with this issue and others when we talk about synchronization.
And the Windows API has the exact same problem:
#include <stdio.h> /* printf */
#include <windows.h> /* Windows stuff */
int Count;
DWORD WINAPI Increment(LPVOID p)
{
int i;
int loops = 1000 * 100;
for (i = 0; i < loops; i++)
Count++;
return 0;
}
int main(void)
{
#define N 10
int i;
HANDLE thread[N];
for (i = 0; i < N; i++)
thread[i] = CreateThread(0, 0, Increment, 0, 0, 0);
WaitForMultipleObjects(N, thread, TRUE, INFINITE);
for (i = 0; i < N; i++)
CloseHandle(thread[i]);
printf("Count = %i\n", Count);
return 0;
}
Output:
Other points to make:Count = 629261 Count = 652001 Count = 628380 Count = 743270 Count = 653191