new simpler approach to parallelization

2022-06-15 15:17:54 +02:00
parent 429f0890d6
commit ac80bc9f3f
13 changed files with 76 additions and 824 deletions
--- a/parallelization/allhosts
+++ b/parallelization/allhosts
@@ -1,135 +0,0 @@
 euler.ma.utexas.edu
 fac1.ma.utexas.edu
 fac4.ma.utexas.edu
 fac8.ma.utexas.edu
 fac9.ma.utexas.edu
 frog.ma.utexas.edu
 gummo.ma.utexas.edu
 iguana.ma.utexas.edu
 lab10.ma.utexas.edu
 lab11.ma.utexas.edu
 lab12.ma.utexas.edu
 lab13.ma.utexas.edu
 lab14.ma.utexas.edu
 lab15.ma.utexas.edu
 lab16.ma.utexas.edu
 lab17.ma.utexas.edu
 lab18.ma.utexas.edu
 lab19.ma.utexas.edu
 lab1.ma.utexas.edu
 lab20.ma.utexas.edu
 lab21.ma.utexas.edu
 lab22.ma.utexas.edu
 lab23.ma.utexas.edu
 lab24.ma.utexas.edu
 lab25.ma.utexas.edu
 lab26.ma.utexas.edu
 lab27.ma.utexas.edu
 lab28.ma.utexas.edu
 lab29.ma.utexas.edu
 lab2.ma.utexas.edu
 lab30.ma.utexas.edu
 lab31.ma.utexas.edu
 lab32.ma.utexas.edu
 lab33.ma.utexas.edu
 lab34.ma.utexas.edu
 lab35.ma.utexas.edu
 lab36.ma.utexas.edu
 lab37.ma.utexas.edu
 lab38.ma.utexas.edu
 lab39.ma.utexas.edu
 lab3.ma.utexas.edu
 lab40.ma.utexas.edu
 lab41.ma.utexas.edu
 lab42.ma.utexas.edu
 lab43.ma.utexas.edu
 lab44.ma.utexas.edu
 lab45.ma.utexas.edu
 lab46.ma.utexas.edu
 lab47.ma.utexas.edu
 lab48.ma.utexas.edu
 lab49.ma.utexas.edu
 lab4.ma.utexas.edu
 lab50.ma.utexas.edu
 lab51.ma.utexas.edu
 lab52.ma.utexas.edu
 lab53.ma.utexas.edu
 lab54.ma.utexas.edu
 lab55.ma.utexas.edu
 lab56.ma.utexas.edu
 lab57.ma.utexas.edu
 lab58.ma.utexas.edu
 lab59.ma.utexas.edu
 lab5.ma.utexas.edu
 lab60.ma.utexas.edu
 lab61.ma.utexas.edu
 lab62.ma.utexas.edu
 lab63.ma.utexas.edu
 lab64.ma.utexas.edu
 lab65.ma.utexas.edu
 lab66.ma.utexas.edu
 lab67.ma.utexas.edu
 lab68.ma.utexas.edu
 lab69.ma.utexas.edu
 lab6.ma.utexas.edu
 lab70.ma.utexas.edu
 lab7.ma.utexas.edu
 lab8.ma.utexas.edu
 lab9.ma.utexas.edu
 linux100.ma.utexas.edu
 linux104.ma.utexas.edu
 linux110.ma.utexas.edu
 linux115.ma.utexas.edu
 linux119.ma.utexas.edu
 linux122.ma.utexas.edu
 linux149.ma.utexas.edu
 linux14.ma.utexas.edu
 linux15.ma.utexas.edu
 linux164.ma.utexas.edu
 linux169.ma.utexas.edu
 linux16.ma.utexas.edu
 linux17.ma.utexas.edu
 linux180.ma.utexas.edu
 linux181.ma.utexas.edu
 linux184.ma.utexas.edu
 linux18.ma.utexas.edu
 linux20.ma.utexas.edu
 linux21.ma.utexas.edu
 linux24.ma.utexas.edu
 linux27.ma.utexas.edu
 linux28.ma.utexas.edu
 linux29.ma.utexas.edu
 linux2.ma.utexas.edu
 linux30.ma.utexas.edu
 linux31.ma.utexas.edu
 linux32.ma.utexas.edu
 linux38.ma.utexas.edu
 linux40.ma.utexas.edu
 linux41.ma.utexas.edu
 linux46.ma.utexas.edu
 linux4.ma.utexas.edu
 linux50.ma.utexas.edu
 linux52.ma.utexas.edu
 linux54.ma.utexas.edu
 linux57.ma.utexas.edu
 linux62.ma.utexas.edu
 linux64.ma.utexas.edu
 linux66.ma.utexas.edu
 linux68.ma.utexas.edu
 linux69.ma.utexas.edu
 linux70.ma.utexas.edu
 linux71.ma.utexas.edu
 linux72.ma.utexas.edu
 linux74.ma.utexas.edu
 linux76.ma.utexas.edu
 linux79.ma.utexas.edu
 linux80.ma.utexas.edu
 linux82.ma.utexas.edu
 linux83.ma.utexas.edu
 linux86.ma.utexas.edu
 linux91.ma.utexas.edu
 linux92.ma.utexas.edu
 linux96.ma.utexas.edu
 linux9.ma.utexas.edu
--- a/parallelization/generate_commands.py
+++ b/parallelization/generate_commands.py
@@ -0,0 +1,19 @@
 #!/usr/bin/python
 # Print one shell command line per grid point (rnum/rden, inum/iden).
 # rnum runs over -res..res and inum over 0..res; the origin (0, 0) is skipped.
 # Presumably the two fractions are the real and imaginary part of a complex
 # parameter for complex_anosov -- confirm against that program.
 wordlength = 16
 n = 101038
 res = 50
 radius = 1.0
 q = [1,1,1]
 denom = round(res/radius)
 cmd = "IDLIST=./output/idlist_{len} ./complex_anosov summary {n} {q1} {q2} {q3} {rnum}/{rden} {inum}/{iden}"
 # fields that are identical on every emitted line
 fixed = dict(len=wordlength, n=n, q1=q[0], q2=q[1], q3=q[2], rden=denom, iden=denom)
 for re_num in range(-res, res+1):
 	for im_num in range(res+1):
 		if (re_num, im_num) != (0, 0):
 			print(cmd.format(rnum=re_num, inum=im_num, **fixed))
--- a/parallelization/hostfile
+++ b/parallelization/hostfile
@@ -1,5 +0,0 @@
 linux50 slots=4
 linux52 slots=4
 linux57 slots=4
 linux110 slots=4
 linux115 slots=4
--- a/parallelization/hostfile_big
+++ b/parallelization/hostfile_big
@@ -1,48 +0,0 @@
 linux100 slots=4
 linux104 slots=4
 linux110 slots=4
 linux122 slots=4
 linux149 slots=4
 linux14 slots=4
 linux15 slots=2
 linux16 slots=4
 linux17 slots=2
 linux180 slots=4
 linux181 slots=2
 linux184 slots=4
 linux18 slots=4
 linux20 slots=4
 linux21 slots=4
 linux24 slots=4
 linux27 slots=4
 linux29 slots=4
 linux2 slots=4
 linux30 slots=4
 linux31 slots=4
 linux32 slots=4
 linux38 slots=2
 linux40 slots=4
 linux41 slots=4
 linux46 slots=4
 linux4 slots=4
 linux50 slots=4
 linux52 slots=4
 linux54 slots=4
 linux57 slots=4
 linux62 slots=2
 linux64 slots=4
 linux68 slots=4
 linux69 slots=4
 linux70 slots=4
 linux71 slots=4
 linux72 slots=4
 linux74 slots=4
 linux76 slots=4
 linux79 slots=4
 linux80 slots=4
 linux83 slots=4
 linux86 slots=4
 linux91 slots=4
 linux92 slots=4
 linux96 slots=4
 linux9 slots=2
--- a/parallelization/localnames
+++ b/parallelization/localnames
@@ -1,51 +0,0 @@
 linux100
 linux104
 linux110
 linux115
 linux122
 linux149
 linux14
 linux15
 linux164
 linux169
 linux16
 linux17
 linux180
 linux181
 linux184
 linux18
 linux20
 linux21
 linux24
 linux27
 linux29
 linux2
 linux30
 linux31
 linux32
 linux38
 linux40
 linux41
 linux46
 linux4
 linux50
 linux52
 linux54
 linux57
 linux62
 linux64
 linux68
 linux69
 linux70
 linux71
 linux72
 linux74
 linux76
 linux79
 linux80
 linux83
 linux86
 linux91
 linux92
 linux96
 linux9
--- a/parallelization/parallel.c
+++ b/parallelization/parallel.c
@@ -1,409 +0,0 @@
 #include "parallel.h"
 #include <mpi.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <string.h>
 #include <unistd.h>
 #include <malloc.h>
 #include <stdlib.h>
 // Logging helpers. DEBUG currently expands to nothing, so DEBUG(...) statements
 // (including their arguments) vanish at compile time; uncomment one of the
 // alternatives to enable them.
 //#define DEBUG INFO
 #define DEBUG(msg, ...)
 // INFO prints to stderr, prefixed with the MPI rank (0 while MPI is not active)
 // and the seconds elapsed since start_timer(). ##__VA_ARGS__ is a GNU extension.
 #define INFO(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
 //#define DEBUG(msg, ...) fprintf(stderr, "[   %10.3f] " msg, runtime(), ##__VA_ARGS__)
 //#define DEBUG_MPI(msg, node, ...) fprintf(stderr, "[%003d%10.3f] " msg, node, runtime(), ##__VA_ARGS__)
 // Access the int stored at address x as an lvalue; used for the "completed"
 // flag at the start of every record in the mapped restart file.
 #define DONE(x) *((int*)(x))
 // MPI message tags used between the main node (rank 0) and the other nodes.
 enum message_tag {
 	PARALLEL_ORDER = 0, // rank 0 -> node: job number followed by the job's input bytes
 	PARALLEL_RESULT, // node -> rank 0: job number followed by the job's output bytes
 	PARALLEL_SHUTDOWN, // rank 0 -> worker: empty message, makes parallel_job() return 1
 	PARALLEL_GLOBAL_DATA // not used in this file; global data is distributed with MPI_Bcast
 };
 // reference point for runtime(); set by start_timer()
 struct timespec starttime;
 // Return this process's rank in MPI_COMM_WORLD, or 0 as long as MPI has not
 // been activated. Passing a nonzero activate_mpi switches MPI lookups on for
 // this and all later calls (the flag is never reset).
 int mpi_rank(int activate_mpi)
 {
 	static int mpi_enabled = 0;
 	int my_rank = 0;
 	if(activate_mpi)
 		mpi_enabled = 1;
 	if(mpi_enabled)
 		MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	return my_rank;
 }
 // Record the current monotonic clock reading as the zero point for runtime().
 void start_timer()
 {
 	struct timespec now;
 	clock_gettime(CLOCK_MONOTONIC, &now);
 	starttime = now;
 }
 // Return the seconds elapsed on the monotonic clock since start_timer() was
 // called. If start_timer() was never called, starttime is still zero and the
 // result is the raw monotonic clock value.
 double runtime()
 {
 	struct timespec curtime;
 	clock_gettime(CLOCK_MONOTONIC, &curtime);
 	// the nanosecond difference may be negative; the sum is still correct
 	return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9;
 }
 parallel_context *parallel_init()
 {
 	parallel_context *ctx = malloc(sizeof(parallel_context));
 	if(!getenv("OMPI_COMM_WORLD_SIZE")) {
 		ctx->mpi_mode = 0;
 		DEBUG("Running standalone.\n");
 		return ctx;
 	}
 	ctx->mpi_mode = 1;
 	int result = MPI_Init(NULL, NULL);
 	MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
 	MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
 	MPI_Get_processor_name(ctx->processor_name, &ctx->processor_name_len);
 	mpi_rank(1); // display the rank in debug output from now on
 	if(ctx->rank == 0)
 		DEBUG("Running in mpi mode, %d nodes.\n", ctx->size);
 	return ctx;
 }
 // Release a context created by parallel_init(). In MPI mode this also frees
 // the two message datatypes and finalises MPI. Like free(), a NULL context is
 // accepted and ignored.
 //
 // NOTE(review): the datatypes are only created by
 // parallel_set_datasize_and_callbacks(); destroying an MPI-mode context
 // on which that function was never called frees invalid handles.
 void parallel_destroy(parallel_context* ctx)
 {
 	if(!ctx)
 		return;
 	if(ctx->mpi_mode) {
 		MPI_Type_free(&ctx->order_datatype);
 		MPI_Type_free(&ctx->result_datatype);
 		MPI_Finalize();
 	}
 	free(ctx);
 }
 // Store the user callbacks and the byte sizes of the four data areas in the
 // context. In MPI mode, additionally build and commit the two message
 // datatypes: an int (the job number) followed by input_size bytes for
 // orders, and an int followed by output_size bytes for results.
 void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size)
 {
 	ctx->global_data_size = global_data_size;
 	ctx->node_data_size = node_data_size;
 	ctx->input_size = input_size;
 	ctx->output_size = output_size;
 	ctx->init = init;
 	ctx->job = job;
 	ctx->destroy = destroy;
 	if(!ctx->mpi_mode)
 		return;
 	// both message kinds share the same layout and differ only in payload length
 	MPI_Datatype field_types[2] = {MPI_INT, MPI_BYTE};
 	MPI_Aint field_offsets[2] = {0, sizeof(int)};
 	int order_lengths[2] = {1, input_size};
 	int result_lengths[2] = {1, output_size};
 	MPI_Type_create_struct(2, order_lengths, field_offsets, field_types, &ctx->order_datatype);
 	MPI_Type_commit(&ctx->order_datatype);
 	MPI_Type_create_struct(2, result_lengths, field_offsets, field_types, &ctx->result_datatype);
 	MPI_Type_commit(&ctx->result_datatype);
 }
 // Look for a message from the main node (rank 0) and, if it is a job order,
 // receive it, run the job callback and send the result back to rank 0.
 //
 // block: nonzero waits for a message (MPI_Probe), zero only polls (MPI_Iprobe).
 //
 // Returns 1 if a shutdown message is pending (it is not received), 2 if no
 // message was present (non-blocking mode only), and 0 otherwise. The return
 // value of the job callback is ignored.
 int parallel_job(parallel_context *ctx, const void *global_data, void *node_data, int block)
 {
 	MPI_Status status;
 	double jobtime;
 	int message_present;
 	if(block) {
 		jobtime = -MPI_Wtime();
 		MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD,
 		          &status);
 		jobtime += MPI_Wtime();
 		INFO("TIMING: Probe() took %f seconds\n", jobtime);
 		message_present = 1;
 	} else {
 		MPI_Iprobe(0, MPI_ANY_TAG, MPI_COMM_WORLD,
 		          &message_present, &status);
 	}
 	if(!message_present)
 		return 2;
 	if(status.MPI_TAG == PARALLEL_SHUTDOWN) {
 		DEBUG("Shutting down\n");
 		return 1;
 	}
 	if(status.MPI_TAG != PARALLEL_ORDER)
 		return 0; // some other message from rank 0; it is left in the queue

 	// Buffers are only needed once an order is known to be waiting, so polling
 	// calls that find nothing do not allocate. Each buffer holds the job
 	// number followed by the payload; char* keeps the pointer arithmetic
 	// standard C (arithmetic on void* is a GNU extension).
 	char *input_and_job_nr = malloc(ctx->input_size + sizeof(int));
 	char *output_and_job_nr = malloc(ctx->output_size + sizeof(int));
 	if(!input_and_job_nr || !output_and_job_nr) {
 		INFO("out of memory allocating message buffers (%d + %d bytes)\n", ctx->input_size, ctx->output_size);
 		MPI_Abort(MPI_COMM_WORLD, 1);
 	}
 	int *job_nr = (int *)input_and_job_nr;
 	void *input = input_and_job_nr + sizeof(int);
 	int *output_job_nr = (int *)output_and_job_nr;
 	void *output = output_and_job_nr + sizeof(int);

 	MPI_Recv(input_and_job_nr,
 	         1, ctx->order_datatype,
 	         0, PARALLEL_ORDER, MPI_COMM_WORLD,
 	         &status);
 	DEBUG("Working on job %d\n", *job_nr);
 	jobtime = -MPI_Wtime();
 	// do the actual work
 	ctx->job(global_data, node_data, input, output);
 	jobtime += MPI_Wtime();
 	INFO("TIMING: job %d took %f seconds\n", *job_nr, jobtime);
 	// the result carries the job number so rank 0 can file it correctly
 	*output_job_nr = *job_nr;
 	jobtime = -MPI_Wtime();
 	MPI_Send(output_and_job_nr,
 	         1, ctx->result_datatype,
 	         0, PARALLEL_RESULT, MPI_COMM_WORLD);
 	jobtime += MPI_Wtime();
 	INFO("TIMING: Send() took %f seconds\n", jobtime);

 	free(input_and_job_nr);
 	free(output_and_job_nr);
 	return 0;
 }
 // Worker main loop: receive the global data broadcast by rank 0, run the init
 // callback, then process job orders until a shutdown message arrives, and
 // finally run the destroy callback. Does nothing in non-MPI mode.
 // Always returns 0; the init callback's return value is ignored.
 int parallel_work(parallel_context *ctx)
 {
 	if(!ctx->mpi_mode)
 		return 0;
 	void *shared = malloc(ctx->global_data_size);
 	void *local = malloc(ctx->node_data_size);
 	// blocks until rank 0 broadcasts the global data
 	MPI_Bcast(shared, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);
 	DEBUG("Global data received\n");
 	// per-node setup (and once-per-node computation)
 	ctx->init(shared, local);
 	DEBUG("Initialization completed\n");
 	// parallel_job() returns 1 only when a shutdown message is pending
 	while(parallel_job(ctx, shared, local, 1) != 1)
 		;
 	ctx->destroy(shared, local);
 	free(shared);
 	free(local);
 	return 0;
 }
 // Return the index of the first job >= start whose completed flag in the
 // restart mapping is unset, or njobs if there is none.
 static int next_unfinished(char *alljobs, size_t itemsize, unsigned int njobs, int start)
 {
 	while(start < njobs && DONE(alljobs + start*itemsize))
 		start++;
 	return start;
 }

 // Send job number `job` together with its input record to `node` as a
 // PARALLEL_ORDER message, using message_buffer as scratch space.
 static void send_order(parallel_context *ctx, char *message_buffer, const char *inputs, int job, int node)
 {
 	*((int*)message_buffer) = job;
 	memcpy(message_buffer + sizeof(int), inputs + (size_t)job*ctx->input_size, ctx->input_size);
 	MPI_Send(message_buffer, 1, ctx->order_datatype,
 	         node, PARALLEL_ORDER, MPI_COMM_WORLD);
 }

 // Run njobs independent jobs and collect their outputs.
 //
 // global_data:       read-only data passed to every callback
 // input_array:       njobs records of ctx->input_size bytes each
 // output_array:      receives njobs records of ctx->output_size bytes each
 // _restart_filename: restart file to continue from / create (MPI mode only);
 //                    NULL picks "restart/restart_<timestamp>"
 //
 // Non-MPI mode: runs init, then every job in order, then destroy; returns the
 // first nonzero callback result (remaining jobs are skipped), else 0.
 //
 // MPI mode (to be called on rank 0, with all other ranks in parallel_work()):
 // results are kept in a memory-mapped restart file holding, per job, an int
 // completed flag followed by the output record. Jobs already flagged as
 // completed are not dispatched again. Exits the process if the restart file
 // cannot be opened, resized or mapped. Returns 0.
 //
 // NOTE(review): rank 0 also sends orders and results to itself with blocking
 // MPI_Send; this relies on the MPI library buffering those messages.
 int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *_restart_filename)
 {
 	// byte-typed views of the caller's arrays (arithmetic on void* is a GNU extension)
 	const char *inputs = input_array;
 	char *outputs = output_array;

 	// in non-mpi-mode, just run init, forall(jobs) job
 	if(ctx->mpi_mode == 0) {
 		void *node_data = malloc(ctx->node_data_size);
 		int result = ctx->init(global_data, node_data);
 		if(result != 0)
 			goto cleanup_standalone;
 		for(unsigned int i = 0; i < njobs; i++) {
 			result = ctx->job(
 				global_data,
 				node_data,
 				inputs + (size_t)ctx->input_size*i,
 				outputs + (size_t)ctx->output_size*i);
 			if(result != 0)
 				goto cleanup_standalone;
 		}
 	cleanup_standalone:
 		ctx->destroy(global_data, node_data);
 		free(node_data); // same destroy-then-free order as parallel_work()
 		return result;
 	}

 	// if no restart file was specified, pick a filename
 	const char *restart_filename = _restart_filename;
 	char buffer[128];
 	if(restart_filename == NULL) {
 		time_t t = time(NULL);
 		struct tm *loctm = localtime(&t);
 		strftime(buffer, sizeof(buffer), "restart/restart_%y%m%d_%H%M%S", loctm);
 		restart_filename = buffer;
 	}

 	// open restart file if it exists, otherwise create it
 	int continuing = 1;
 	int restartf = open(restart_filename, O_RDWR);
 	if(restartf == -1 && errno == ENOENT) {
 		restartf = open(restart_filename, O_RDWR | O_CREAT, 0666);
 		continuing = 0;
 	}
 	if(restartf == -1) {
 		// INFO rather than DEBUG: DEBUG is compiled out and the exit would be silent
 		INFO("Error opening restart file: %s\n", strerror(errno));
 		exit(1);
 	}

 	// map restart file
 	size_t itemsize = ctx->output_size + sizeof(int); // for every job, store completed flag and output
 	size_t mapsize = (size_t)njobs*itemsize;
 	if(ftruncate(restartf, mapsize) == -1) {
 		INFO("Error resizing restart file: %s\n", strerror(errno));
 		exit(1);
 	}
 	char *alljobs = mmap(0, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED, restartf, 0);
 	if(alljobs == MAP_FAILED) {
 		INFO("Error mapping restart file: %s\n", strerror(errno));
 		exit(1);
 	}

 	// count completed jobs, or initialize jobs
 	int completed = 0;
 	if(continuing) {
 		for(unsigned int i = 0; i < njobs; i++)
 			if(DONE(alljobs + i*itemsize))
 				completed++;
 	} else {
 		// every output slot is seeded with the job's input data; a slot holds only
 		// output_size bytes, so never copy more than that
 		size_t seedsize = ctx->input_size < ctx->output_size ? ctx->input_size : ctx->output_size;
 		for(unsigned int i = 0; i < njobs; i++) {
 			DONE(alljobs + i*itemsize) = 0;
 			memcpy(alljobs + i*itemsize + sizeof(int), inputs + (size_t)i*ctx->input_size, seedsize);
 		}
 	}
 	fsync(restartf);

 	if(continuing) {
 		INFO("Continuing from restart file, %d/%u jobs completed, %d nodes\n", completed, njobs, ctx->size);
 	} else {
 		INFO("Starting from scratch, %u jobs, %d nodes\n", njobs, ctx->size);
 	}

 	// Even if every job is already completed we go through the broadcast and
 	// shutdown below: the workers are blocked in parallel_work() waiting for
 	// exactly that, and output_array still has to be filled from the file.

 	/* Send global data */
 	MPI_Bcast((void*)global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);
 	DEBUG("Global data sent\n");

 	// we want to be able to run jobs ourselves, so initialize node_data
 	void *node_data = malloc(ctx->node_data_size);
 	ctx->init(global_data, node_data);

 	char *input_message_buffer = malloc(ctx->input_size + sizeof(int));
 	char *output_message_buffer = malloc(ctx->output_size + sizeof(int));
 	int *active_jobs = calloc(ctx->size, sizeof(int)); // orders in flight, per node
 	int active_worker_nodes = ctx->size - 1;  // we don't count ourselves, since we can't shut ourselves down

 	// find next unfinished job
 	int current = next_unfinished(alljobs, itemsize, njobs, 0);

 	// assign initial jobs, 2 for each node (rank 0 included)
 	for(int i = 0; i < 2*ctx->size; i++) {
 		if(current >= njobs) // all jobs are assigned
 			break;
 		int node = i%ctx->size;
 		send_order(ctx, input_message_buffer, inputs, current, node);
 		DEBUG("Job %d sent to node %d\n", current, node);
 		active_jobs[node]++;
 		current = next_unfinished(alljobs, itemsize, njobs, current + 1);
 	}

 	// A worker without any initial job will never send a result, so the loop
 	// below would never get the chance to shut it down; do it right away.
 	// (A node can only be left without a job here if the jobs ran out.)
 	for(int node = 1; node < ctx->size; node++) {
 		if(active_jobs[node] == 0) {
 			MPI_Send(NULL, 0, MPI_BYTE, node, PARALLEL_SHUTDOWN, MPI_COMM_WORLD);
 			active_worker_nodes--;
 		}
 	}

 	MPI_Status status;
 	int message_present;
 	while(active_jobs[0] != 0 || active_worker_nodes != 0) {
 		MPI_Iprobe(MPI_ANY_SOURCE, PARALLEL_RESULT, MPI_COMM_WORLD, &message_present, &status);

 		if(!message_present) {
 			// if there are no more messages to process, we can run a job ourselves before returning to managing
 			DEBUG("Start running job myself\n");
 			int result = parallel_job(ctx, global_data, node_data, 0);
 			DEBUG("Finished running job myself, result = %d\n", result);
 			(void)result; // only used when DEBUG output is enabled
 			continue;
 		}

 		MPI_Recv(output_message_buffer, 1, ctx->result_datatype,
 		         MPI_ANY_SOURCE, PARALLEL_RESULT, MPI_COMM_WORLD, &status);
 		DEBUG("Got message tag %d from node %d\n", status.MPI_TAG, status.MPI_SOURCE);

 		int node = status.MPI_SOURCE;
 		int id = *((int*)output_message_buffer);
 		// store the output first and set the completed flag last
 		memcpy(alljobs + id*itemsize + sizeof(int), output_message_buffer + sizeof(int), ctx->output_size);
 		DONE(alljobs + id*itemsize) = 1;
 		active_jobs[node]--;

 		// todo: deal with unresponsive nodes
 		// strategy: when no jobs left, go through unfinished list again, incrementing oversubscribe counter
 		// if oversubscribe counter is at limit, shut node down instead

 		if(current >= njobs) { // all jobs are assigned, try to shut down node
 			// don't try to shut down ourselves, and only if it has no other jobs to do
 			if(node != 0 && active_jobs[node] == 0) {
 				MPI_Send(NULL, 0, MPI_BYTE, node, PARALLEL_SHUTDOWN, MPI_COMM_WORLD);
 				active_worker_nodes--;
 				INFO("job %d completed by node %d, shut down, %d workers remaining\n", id, node, active_worker_nodes);
 			}
 			continue;
 		}

 		int first = current;
 		send_order(ctx, input_message_buffer, inputs, first, node);
 		active_jobs[node]++;
 		current = next_unfinished(alljobs, itemsize, njobs, current + 1);

 		// top the node up to 2 queued jobs, but only if there is another job left
 		if(active_jobs[node] < 3 && current < njobs) {
 			int second = current;
 			send_order(ctx, input_message_buffer, inputs, second, node);
 			active_jobs[node]++;
 			current = next_unfinished(alljobs, itemsize, njobs, current + 1);
 			INFO("job %d completed by node %d, continues with %d and %d\n", id, node, first, second);
 		} else {
 			INFO("job %d completed by node %d, continues with %d\n", id, node, first);
 		}
 	}

 	// hand the collected outputs (including those from earlier runs) to the caller
 	for(unsigned int i = 0; i < njobs; i++) {
 		memcpy(outputs + (size_t)i*ctx->output_size, alljobs + i*itemsize + sizeof(int), ctx->output_size);
 	}

 	// same destroy-then-free order as parallel_work()
 	ctx->destroy(global_data, node_data);
 	free(input_message_buffer);
 	free(output_message_buffer);
 	free(node_data);
 	free(active_jobs);

 	munmap(alljobs, mapsize);
 	close(restartf);
 	return 0;
 }
--- a/parallelization/parallel.h
+++ b/parallelization/parallel.h
@@ -1,118 +0,0 @@
 #ifndef PARALLEL_H
 #define PARALLEL_H
 /*
  this is a library to parallelize workloads which can be split up naturally
  into a sequence of independent jobs, using MPI. A program will usually
  - do precomputation
  - fill array with input data
  - do the parallel work
  - print the output data
  we want to enable restarts, so that only unfinished jobs need to be repeated.
  Further, we want to be resilient to slow/unreliable network and to losing
  nodes. There is a main node and a number of workers. The main node does the
  precomputation and then retires to do administrative work, and the workers
  do the actual jobs. We also want to switch to serial mode if the program is
  called without MPI.
  The following data has to be transmitted between nodes:
  - results of the precomputation (read-only, shared between nodes)
  - job-specific input data, generated by main node before parallel part
  - output data for each job
  the parallel work shall be given as a callback function which takes
  input data and precomputation data as parameter
  the above program will look like this for us:
  - parallel_init
  - if we are a worker, do parallel_work(init_callback, job_callback), exit
  - do precomputation
  - fill array with input data
  - output_array = parallel_run(input_array)
  - print the output data
  parallel_init:
    - check if we're running as an mpi program
    - init mpi, check what kind of node we are
  parallel_work(init_callback1, init_callback2, job_callback):
    - receive global_precomp (???)
    - worker_precomp = init_callback2(global_precomp, worker_precomp)
    - infinite loop:
      - wait for job on network, receive input
      - output = job_callback(global_precomp, worker_precomp, input)
      - send output on network
      - exit loop on shutdown signal
  parallel_run(global_precomp, input_array, restart file, callbacks):
    - check if we're running as an MPI program
    - send global_precomp to all nodes (if MPI)
    - if(restart file given and exists) read restart file
    - else create new restart file
    - until(all jobs finished):
      - if MPI:
        - send next job & input to appropriate node
        - if all jobs are in work, reassign unfinished ones (up to limit)
        - collect outputs
      - if no MPI:
        - worker_precomp = init_callback1
        - worker_precomp = init_callback2(global_precomp, worker_precomp)
        - for(j in jobs)
          - output(j) = job_callback(global_precomp, worker_precomp, input(j))
    - delete restart file
    - return array of outputs
    parallel_destroy():
    - free everything
    have a context? probably yes: parallel_context
    plan:
    - make interface
    - implement no-MPI part
    - restructure singular_values.c to use interface
    - implement MPI part
 */
 #include <mpi.h>
 #include <time.h>
 // Callback types. All of them receive (global_data, node_data) first.
 // destroy: called once per node after its last job, before node_data is freed.
 typedef void (*parallel_callback_destroy)(const void*, void*);
 // init: called once per node before its first job; in non-MPI mode a nonzero
 // return value aborts the run.
 typedef int (*parallel_callback_init)(const void*,void*);
 // job: (global_data, node_data, input, output); in non-MPI mode a nonzero
 // return value aborts the run, in MPI mode the return value is ignored.
 typedef int (*parallel_callback_job)(const void*,void*,const void*,void*);
 // State shared by all parallel_* functions; created by parallel_init() and
 // released by parallel_destroy().
 typedef struct {
 	int mpi_mode; // 1 if running under MPI, 0 for standalone (serial) mode
 	struct timespec starttime; // NOTE(review): parallel.c keeps its own global starttime; this field looks unused
 	char processor_name[MPI_MAX_PROCESSOR_NAME]; // filled by MPI_Get_processor_name (MPI mode only)
 	int processor_name_len;
 	int rank; // rank in MPI_COMM_WORLD (MPI mode only)
 	int size; // number of processes in MPI_COMM_WORLD (MPI mode only)
 	MPI_Datatype order_datatype; // int job number + input_size bytes
 	MPI_Datatype result_datatype; // int job number + output_size bytes
 	parallel_callback_init init;
 	parallel_callback_job job;
 	parallel_callback_destroy destroy;
 	void *global_data; // NOTE(review): not assigned anywhere in parallel.c
 	void *node_data; // NOTE(review): not assigned anywhere in parallel.c
 	int global_data_size; // size in bytes of the broadcast global data
 	int node_data_size; // size in bytes of the per-node scratch area
 	int input_size; // size in bytes of one job's input record
 	int output_size; // size in bytes of one job's output record
 } parallel_context;
 // Create a context; initialises MPI when started through Open MPI.
 parallel_context *parallel_init(void);
 // Register the callbacks and the byte sizes of the data areas; must be called
 // before parallel_work() / parallel_run().
 void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size);
 // Worker loop: process jobs sent by rank 0 until shut down (no-op without MPI).
 int parallel_work(parallel_context *ctx);
 // Run all jobs (distributed in MPI mode, serially otherwise) and fill output_array.
 int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *restart_filename);
 // Release the context and finalise MPI if it was initialised.
 void parallel_destroy(parallel_context* ctx);
 // Rank of this process, or 0 while MPI is inactive; a nonzero argument
 // activates MPI lookups from now on. (The parameter is spelled out so the
 // declaration matches the definition and calls are type-checked.)
 int mpi_rank(int activate_mpi);
 // Set the reference point for runtime().
 void start_timer(void);
 // Seconds elapsed since start_timer().
 double runtime(void);
 #endif
--- a/parallelization/run_local
+++ b/parallelization/run_local
@@ -1,9 +0,0 @@
 #!/bin/bash
 # Run singular_values locally under mpirun with 4 processes.
 # An optional first script argument is appended to the command line.
 # nmax=895882 # up to reflection group word length 22 ( 555 group)
 nmax=700000 # up to reflection group word length 22 ( 444 group)
 # nmax=11575 # up to reflection group word length 14
 #time mpirun --mca opal_warn_on_missing_libcuda 0  -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
 # use $nmax instead of repeating the number; ${1:+"$1"} passes the argument
 # as one word when given and passes nothing at all when it is missing
 time mpirun --mca opal_warn_on_missing_libcuda 0 --mca mpi_yield_when_idle 1 -np 4 ./singular_values "$nmax" 4 4 4 1 1 100 1 100 100 ${1:+"$1"}
--- a/parallelization/run_utexas
+++ b/parallelization/run_utexas
@@ -1,14 +0,0 @@
 #!/bin/bash
 # Build singular_values and run it with 100 MPI processes on the hosts
 # listed in hostfile_big; the result goes to a timestamped output file.
 # stop here if the directory is missing instead of building and running elsewhere
 cd /home/stecker/svmpi/ || exit 1
 nmax=895882 # up to reflection group word length 22
 # nmax=11575 # up to reflection group word length 14
 outfile=result_$(date +%Y%m%d_%H%M%S).out
 unset DISPLAY
 make singular_values &&
 time mpirun -n 100 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs --hostfile hostfile_big ./singular_values "$nmax" utexas_cluster_restart "$outfile"
--- a/parallelization/runjobs.py
+++ b/parallelization/runjobs.py
@@ -0,0 +1,57 @@
 #!/usr/bin/python
 from mpi4py import MPI
 import os
 import re
 import math
 import subprocess
 import time
 # Run the shell commands listed in the file "commands" (one per line),
 # split evenly across the MPI ranks. Rank r appends the commands' stdout to
 # result_<r> and the indices of successful commands to done_<r>, so a later
 # run only repeats what has not succeeded yet.
 comm = MPI.COMM_WORLD
 rank = comm.Get_rank()
 nodes = comm.Get_size()
 # Collect the indices of all commands that succeeded in earlier runs. The
 # number of ranks may differ between runs, so every done_* file is read.
 done = set()
 for name in os.listdir('.'):
 	if re.search('^done_[0-9]+', name):
 		with open(name, "r") as fp:
 			for line in fp:
 				# skip blank lines instead of failing in int()
 				if line.strip():
 					done.add(int(line))
 # pair every remaining command with its 0-based line number in "commands"
 with open("commands", "r") as f:
 	todo = [(idx, c) for idx, c in enumerate(f) if idx not in done]
 # Contiguous, near-equal slice for this rank. Integer arithmetic is exact,
 # and the last rank automatically ends at len(todo).
 start = len(todo) * rank // nodes
 end = len(todo) * (rank + 1) // nodes
 print("{n:d} commands awaiting execution, {nnode:d} of them in node {rank:d}".format(n=len(todo),nnode=end-start,rank=rank))
 # Every rank must have read the done_* files before any rank starts appending
 # to them; otherwise the ranks would compute different todo lists and commands
 # could be skipped or run twice. A barrier guarantees this, a fixed sleep does not.
 comm.Barrier()
 outfilename = "result_{node:003d}".format(node=rank)
 donefilename = "done_{node:003d}".format(node=rank)
 with open(outfilename, "a") as outfile, open(donefilename, "a") as donefile:
 	for idx, command in todo[start:end]:
 		# the commands use shell syntax (VAR=value prefixes), hence shell=True;
 		# the "commands" file must therefore be trusted
 		result = subprocess.call(command, stdout=outfile, shell=True)
 		if result == 0:
 			donefile.write(str(idx) + '\n')
 		else:
 			print("Command failed: {cmd}".format(cmd=command))
 		# flush after every command so that an interrupted run loses nothing
 		outfile.flush()
 		donefile.flush()
--- a/parallelization/stampede.slurm
+++ b/parallelization/stampede.slurm
@@ -1,20 +0,0 @@
 #!/bin/bash
 #SBATCH -J ejp_trg
 #SBATCH -o logs/ejp_trg.o%j
 #SBATCH -e logs/ejp_trg.e%j
 #SBATCH -p skx-dev
 #SBATCH -N 1
 #SBATCH -n 48
 #SBATCH -t 00:05:00
 #SBATCH --mail-user=mail@florianstecker.net
 #SBATCH --mail-type=all
 # Slurm job script: runs singular_values via ibrun, with the restart file in
 # $SCRATCH and a timestamped result file in $WORK.
 export LD_LIBRARY_PATH="$WORK/mps/lib:$LD_LIBRARY_PATH"
 d=$(date +%Y%m%d_%H%M%S)
 nmax=895882 # up to reflection group word length 22
 # nmax=11575 # up to reflection group word length 14
 # quoted so that paths containing spaces or glob characters stay single arguments
 ibrun ./singular_values "$nmax" "$SCRATCH/ejp_trg_restart" "$WORK/ejp_trg/output/result_$d"
--- a/parallelization/sync_stampede
+++ b/parallelization/sync_stampede
@@ -1,8 +0,0 @@
 #!/bin/bash
 # Copy the sources, Makefile and job script to work/ejp_trg/ on host "stampede".
 # ./ in front of the globs keeps a file name starting with '-' from being
 # read as an rsync option
 rsync -vt ./*.c ./*.h Makefile stampede.slurm stampede:work/ejp_trg/
 #rsync -lvt /usr/lib/libmps.so* /usr/include/mps utexas:work/ejp_trg/libs/
 # now run it with a job script
 # get MPSolve from https://numpi.dm.unipi.it/_media/software/mpsolve/mpsolve-3.2.1.tar.bz2
--- a/parallelization/sync_utexas
+++ b/parallelization/sync_utexas
@@ -1,7 +0,0 @@
 #!/bin/bash
 # Copy sources, host lists, run scripts and the MPSolve library and headers
 # to svmpi/ on host "utexas".
 # ./ in front of the globs keeps a file name starting with '-' from being
 # read as an rsync option
 rsync -vt ./*.c ./*.h Makefile hostfile hostfile_big allhosts localnames run_utexas run_local utexas:svmpi/
 rsync -lvt /usr/lib/libmps.so* utexas:svmpi/libs/
 rsync -rvt /usr/include/mps utexas:svmpi/libs/
 # now run it with ssh utexas -t ssh linux50 svmpi/run_utexas