create MPI library; not very optimized yet

parent 2735281300
commit d2d91d68b3

.gitignore (vendored): 2 changed lines
@@ -12,3 +12,5 @@ billiard_words
 *.png
 *.hi
 gmon.out
+restart
+core

Makefile: 25 changed lines

@@ -1,14 +1,14 @@
-HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h
+HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h parallel.h
 
 #SPECIAL_OPTIONS=-O0 -g -D_DEBUG
-SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline
+#SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline
-#SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline
+SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline
 #SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline -mavx512f -mavx512cd -mavx512er -mavx512pf # KNL
 #SPECIAL_OPTIONS=
 
 OPTIONS=-I../mps/include -L../mps/lib -pthread -m64 -std=gnu99 -D_GNU_SOURCE $(SPECIAL_OPTIONS)
 
-all: singular_values special_element singular_values_mpi convert billiard_words
+all: singular_values special_element convert billiard_words
 
 convert: convert.hs
 	ghc --make -dynamic convert.hs
@@ -16,11 +16,11 @@ convert: convert.hs
 billiard_words: billiard_words.hs
 	ghc --make -dynamic billiard_words.hs
 
-singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o
-	gcc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps
+singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o parallel.o
+	mpicc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o parallel.o -lm -lgmp -lmps
 
-singular_values_mpi: singular_values_mpi.o coxeter.o mat.o
-	mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps
+#singular_values_mpi: singular_values_mpi.o coxeter.o mat.o
+# 	mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps
 
 special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_group.o
 	gcc $(OPTIONS) -o special_element coxeter.o linalg.o special_element.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps -lgsl -lcblas
@@ -28,8 +28,8 @@ special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_g
 singular_values.o: singular_values.c $(HEADERS)
 	gcc $(OPTIONS) -c singular_values.c
 
-singular_values_mpi.o: singular_values_mpi.c $(HEADERS)
-	mpicc $(OPTIONS) -c singular_values_mpi.c
+#singular_values_mpi.o: singular_values_mpi.c $(HEADERS)
+# 	mpicc $(OPTIONS) -c singular_values_mpi.c
 
 special_element.o: special_element.c $(HEADERS)
 	gcc $(OPTIONS) -c special_element.c
@@ -46,5 +46,8 @@ coxeter.o: coxeter.c $(HEADERS)
 mat.o: mat.c $(HEADERS)
 	gcc $(OPTIONS) -c mat.c
 
+parallel.o: parallel.c $(HEADERS)
+	gcc $(OPTIONS) -c parallel.c
+
 clean:
-	rm -f singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o
+	rm -f singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o parallel.o

parallel.c: 349 lines (new file)

@@ -0,0 +1,349 @@
#include "parallel.h"

#include <mpi.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <malloc.h>
#include <stdlib.h>

#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
//#define DEBUG(msg, ...) fprintf(stderr, "[ %10.3f] " msg, runtime(), ##__VA_ARGS__)
//#define DEBUG_MPI(msg, node, ...) fprintf(stderr, "[%003d%10.3f] " msg, node, runtime(), ##__VA_ARGS__)
#define DONE(x) *((int*)(x))

enum message_tag {
	PARALLEL_ORDER,
	PARALLEL_RESULT,
	PARALLEL_SHUTDOWN,
	PARALLEL_GLOBAL_DATA
};

struct timespec starttime;

int mpi_rank(int activate_mpi)
{
	static int active = 0;
	if(activate_mpi)
		active = 1;

	if(!active)
		return 0;
	else {
		int rank;
		MPI_Comm_rank(MPI_COMM_WORLD, &rank);
		return rank;
	}
}

void start_timer()
{
	clock_gettime(CLOCK_MONOTONIC, &starttime);
}

double runtime()
{
	struct timespec curtime;
	double diff;
	clock_gettime(CLOCK_MONOTONIC, &curtime);
	return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9;
}

parallel_context *parallel_init()
{
	parallel_context *ctx = malloc(sizeof(parallel_context));

	if(!getenv("OMPI_COMM_WORLD_SIZE")) {
		ctx->mpi_mode = 0;
		DEBUG("Running standalone.\n");
		return ctx;
	}

	ctx->mpi_mode = 1;
	int result = MPI_Init(NULL, NULL);
	MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
	MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
	MPI_Get_processor_name(ctx->processor_name, &ctx->processor_name_len);

	mpi_rank(1); // display the rank in debug output from now on

	if(ctx->rank == 0)
		DEBUG("Running in mpi mode, %d nodes.\n", ctx->size);

	return ctx;
}

void parallel_destroy(parallel_context* ctx)
{
	if(ctx->mpi_mode) {
		MPI_Type_free(&ctx->order_datatype);
		MPI_Type_free(&ctx->result_datatype);
		MPI_Finalize();
	}

	free(ctx);
}

void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size)
{
	ctx->init = init;
	ctx->destroy = destroy;
	ctx->job = job;
	ctx->global_data_size = global_data_size;
	ctx->node_data_size = node_data_size;
	ctx->input_size = input_size;
	ctx->output_size = output_size;

	if(ctx->mpi_mode) {
		// create a datatype for job orders, consisting of an integer (the job id) and a user-defined section
		int order_blocklengths[2] = {1, input_size};
		MPI_Aint order_displacements[2] = {0, sizeof(int)};
		MPI_Datatype order_types[2] = {MPI_INT, MPI_BYTE};
		MPI_Type_create_struct(2, order_blocklengths, order_displacements, order_types, &ctx->order_datatype);
		MPI_Type_commit(&ctx->order_datatype);

		int result_blocklengths[2] = {1, output_size};
		MPI_Aint result_displacements[2] = {0, sizeof(int)};
		MPI_Datatype result_types[2] = {MPI_INT, MPI_BYTE};
		MPI_Type_create_struct(2, result_blocklengths, result_displacements, result_types, &ctx->result_datatype);
		MPI_Type_commit(&ctx->result_datatype);
	}
}

int parallel_work(parallel_context *ctx)
{
	// do nothing in non-mpi mode
	if(ctx->mpi_mode == 0)
		return 0;

	MPI_Status status;
	void *global_data = malloc(ctx->global_data_size);
	void *node_data = malloc(ctx->node_data_size);
	void *input_and_job_nr = malloc(ctx->input_size + sizeof(int));
	void *input = input_and_job_nr + sizeof(int);
	int *job_nr = (int *)input_and_job_nr;
	void *output_and_job_nr = malloc(ctx->output_size + sizeof(int));
	void *output = output_and_job_nr + sizeof(int);
	int *output_job_nr = (int *)output_and_job_nr;
	double jobtime;

	// wait for global data
	MPI_Bcast(global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);

	DEBUG("Global data received\n");

	// initialize node_data (and do once-per-node computation)
	ctx->init(global_data, node_data);

	DEBUG("Initialization completed\n");

	while(1) {
		MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD,
			&status);

		DEBUG("Message received: source = %d, tag = %d\n", status.MPI_SOURCE, status.MPI_TAG);

		if(status.MPI_TAG == PARALLEL_SHUTDOWN) {
			DEBUG("Shutting down\n");
			break;
		} else if(status.MPI_TAG == PARALLEL_ORDER) {
			MPI_Recv(input_and_job_nr,
				1, ctx->order_datatype,
				0, PARALLEL_ORDER, MPI_COMM_WORLD,
				&status);

			DEBUG("Working on job %d\n", *job_nr);

			jobtime = -MPI_Wtime();

			// do the actual work
			ctx->job(global_data, node_data, input, output);

			jobtime += MPI_Wtime();

			DEBUG("Finished job %d in %f seconds\n", *job_nr, jobtime);

			*output_job_nr = *job_nr;
			MPI_Send(output_and_job_nr,
				1, ctx->result_datatype,
				0, PARALLEL_RESULT, MPI_COMM_WORLD);
		}
	}

	ctx->destroy(global_data, node_data);

	free(global_data);
	free(node_data);
	free(input_and_job_nr);
	free(output_and_job_nr);

	return 0;
}

int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *_restart_filename)
{
	// in non-mpi-mode, just run init1, init2, forall(jobs) job
	if(ctx->mpi_mode == 0) {
		int result;
		void *node_data = malloc(ctx->node_data_size);
		result = ctx->init(global_data, node_data);
		if(result != 0)
			goto cleanup_standalone;

		for(int i = 0; i < njobs; i++) {
			result = ctx->job(
				global_data,
				node_data,
				input_array + ctx->input_size*i,
				output_array + ctx->output_size*i);
			if(result != 0)
				goto cleanup_standalone;
		}

	cleanup_standalone:
		ctx->destroy(global_data, node_data);
		return result;
	} else {
		// if no restart file was specified, pick a filename
		char *restart_filename;
		char buffer[128];
		int restartf;
		if(_restart_filename == NULL) {
			time_t t = time(NULL);
			struct tm *loctm = localtime(&t);
			strftime(buffer, sizeof(buffer), "restart/restart_%y%m%d_%H%M%S", loctm);
			restart_filename = buffer;
		} else {
			restart_filename = (char *)_restart_filename;
		}

		// open restart file if it exists, otherwise create it
		int continuing = 1;
		restartf = open(restart_filename, O_RDWR);
		if(restartf == -1 && errno == ENOENT) {
			restartf = open(restart_filename, O_RDWR | O_CREAT, 0666);
			continuing = 0;
		}
		if(restartf == -1) {
			DEBUG("Error opening restart file: %s\n", strerror(errno));
			exit(1);
		}

		// map restart file
		int itemsize = (ctx->output_size + sizeof(int)); // for every job, store output, and completed flag
		ftruncate(restartf, njobs*itemsize);
		void *alljobs = mmap(0, njobs*itemsize, PROT_READ | PROT_WRITE, MAP_SHARED, restartf, 0);
		if(alljobs == MAP_FAILED) {
			DEBUG("Error mapping restart file: %s\n", strerror(errno));
			exit(1);
		}

		// count completed jobs, or initialize jobs
		int completed = 0;
		if(continuing) {
			for(int i = 0; i < njobs; i++)
				if(DONE(alljobs + i*itemsize))
					completed++;
		} else {
			for(int i = 0; i < njobs; i++) {
				DONE(alljobs + i*itemsize) = 0;
				memcpy(alljobs + i*itemsize + sizeof(int), input_array + i*ctx->input_size, ctx->input_size); // copy input data
			}
		}

		fsync(restartf);

		if(continuing) {
			DEBUG("Continuing from restart file, %d/%d jobs completed, %d nodes\n", completed, njobs, ctx->size);
		} else {
			DEBUG("Starting from scratch, %d jobs, %d nodes\n", njobs, ctx->size);
		}

		if(completed >= njobs)
			goto cleanup_mpi;

		/* Send global data */
		MPI_Bcast((void*)global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);

		DEBUG("Global data sent\n");

		void *input_message_buffer = malloc(ctx->input_size + sizeof(int));
		void *output_message_buffer = malloc(ctx->output_size + sizeof(int));

		// find next unfinished job
		int current = 0;
		while(current < njobs && DONE(alljobs + current*itemsize))
			current++;

		// assign initial jobs, 2 for each worker thread
		for(int i = 0; i < 2*(ctx->size-1); i++) {
			if(current >= njobs) // all jobs are assigned
				break;

			// send job id and input data
			// send to all nodes except ourself (node 0)
			*((int*)input_message_buffer) = current;
			memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size);
			MPI_Send(input_message_buffer, 1, ctx->order_datatype,
				i%(ctx->size-1)+1, PARALLEL_ORDER, MPI_COMM_WORLD);

			DEBUG("Job %d sent to node %d\n", current, i%(ctx->size-1)+1);
			current++;
		}

		MPI_Status status;
		int active_worker_nodes = ctx->size - 1;
		while(1) {
			MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
			if(status.MPI_TAG == PARALLEL_RESULT) {
				MPI_Recv(output_message_buffer, 1, ctx->result_datatype,
					MPI_ANY_SOURCE, PARALLEL_RESULT, MPI_COMM_WORLD, &status);

				int id = *((int*)output_message_buffer);
				memcpy(alljobs + id*itemsize + sizeof(int), output_message_buffer + sizeof(int), ctx->output_size);
				DONE(alljobs + id*itemsize) = 1;
				completed++;

				DEBUG("job %d completed by node %d\n", id, status.MPI_SOURCE);

				// todo: deal with unresponsive nodes
				// strategy: when no jobs left, go through unfinished list again, incrementing oversubscribe counter
				// if oversubscribe counter is at limit, shut node down instead
				//

				if(current >= njobs) { // all jobs are assigned, shut down node
					MPI_Send(NULL, 0, MPI_BYTE, status.MPI_SOURCE, PARALLEL_SHUTDOWN, MPI_COMM_WORLD);
					active_worker_nodes--;
					if(active_worker_nodes)
						continue;
					else
						break;
				}

				*((int*)input_message_buffer) = current;
				memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size);
				MPI_Send(input_message_buffer, 1, ctx->order_datatype,
					status.MPI_SOURCE, PARALLEL_ORDER, MPI_COMM_WORLD);

				DEBUG("Job %d sent to node %d\n", current, status.MPI_SOURCE);
				current++;
			}
		}

		for(int i = 0; i < njobs; i++) {
			memcpy(output_array + i*ctx->output_size, alljobs + i*itemsize + sizeof(int), ctx->output_size);
		}

		free(input_message_buffer);
		free(output_message_buffer);

	cleanup_mpi:
		munmap(alljobs, njobs*itemsize);
		close(restartf);
	}

	return 0;
}

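For orientation, and not part of the commit: the restart file that parallel_run() mmaps is simply an array of njobs fixed-size records, each one an int "completed" flag (accessed through the DONE() macro) followed by output_size bytes of result data, so itemsize = sizeof(int) + output_size. The toy program below uses made-up sizes and lays out the same records in an ordinary buffer, printing where each field lives.

#include <stdio.h>
#include <string.h>

int main(void)
{
	enum { OUTPUT_SIZE = 16, NJOBS = 3 };            /* invented sizes, for illustration only */
	int itemsize = sizeof(int) + OUTPUT_SIZE;        /* same itemsize formula as parallel_run() */
	unsigned char file_image[(sizeof(int) + OUTPUT_SIZE) * NJOBS];
	memset(file_image, 0, sizeof(file_image));

	/* mark job 1 as completed, just like DONE(alljobs + i*itemsize) = 1 */
	*(int *)(file_image + 1*itemsize) = 1;

	for(int i = 0; i < NJOBS; i++)
		printf("job %d: done=%d, output bytes start at offset %d\n",
			i, *(int *)(file_image + i*itemsize), (int)(i*itemsize + sizeof(int)));
	return 0;
}
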
parallel.h: 118 lines (new file)

@@ -0,0 +1,118 @@
#ifndef PARALLEL_H
#define PARALLEL_H

/*
	this is a library to parallelize workloads which can be split up naturally
	into a sequence of independent jobs, using MPI. A program will usually

	- do precomputation
	- fill array with input data
	- do the parallel work
	- print the output data

	we want to enable restarts, so that only unfinished jobs need to be repeated.
	Further, we want to be resilient to slow/unreliable network and to losing
	nodes. There is a main node and a number of workers. The main node does the
	precomputation and then retires to do administrative work, and the workers
	do the actual jobs. We also want to switch to serial mode if the program is
	called without MPI.

	The following data has to be transmitted between nodes:
	- results of the precomputation (read-only, shared between nodes)
	- job-specific input data, generated by main node before parallel part
	- output data for each job

	the parallel work shall be given as a callback function which takes
	input data and precomputation data as parameters

	the above program will look like this for us:

	- parallel_init
	- if we are a worker, do parallel_work(init_callback, job_callback), exit
	- do precomputation
	- fill array with input data
	- output_array = parallel_run(input_array)
	- print the output data

	parallel_init:
	- check if we're running as an mpi program
	- init mpi, check what kind of node we are

	parallel_work(init_callback1, init_callback2, job_callback):
	- receive global_precomp (???)
	- worker_precomp = init_callback2(global_precomp, worker_precomp)
	- infinite loop:
		- wait for job on network, receive input
		- output = job_callback(global_precomp, worker_precomp, input)
		- send output on network
		- exit loop on shutdown signal

	parallel_run(global_precomp, input_array, restart file, callbacks):
	- check if we're running as an MPI program
	- send global_precomp to all nodes (if MPI)
	- if(restart file given and exists) read restart file
	- else create new restart file
	- until(all jobs finished):
		- if MPI:
			- send next job & input to appropriate node
			- if all jobs are in work, reassign unfinished ones (up to limit)
			- collect outputs
		- if no MPI:
			- worker_precomp = init_callback1
			- worker_precomp = init_callback2(global_precomp, worker_precomp)
			- for(j in jobs)
				- output(j) = job_callback(global_precomp, worker_precomp, input(j))
	- delete restart file
	- return array of outputs

	parallel_destroy():
	- free everything

	have a context? probably yes: parallel_context

	plan:
	- make interface
	- implement no-MPI part
	- restructure singular_values.c to use interface
	- implement MPI part
*/

#include <mpi.h>
#include <time.h>

typedef void (*parallel_callback_destroy)(const void*, void*);
typedef int (*parallel_callback_init)(const void*,void*);
typedef int (*parallel_callback_job)(const void*,void*,const void*,void*);

typedef struct {
	int mpi_mode;
	struct timespec starttime;
	char processor_name[MPI_MAX_PROCESSOR_NAME];
	int processor_name_len;
	int rank;
	int size;
	MPI_Datatype order_datatype;
	MPI_Datatype result_datatype;
	parallel_callback_init init;
	parallel_callback_job job;
	parallel_callback_destroy destroy;
	void *global_data;
	void *node_data;
	int global_data_size;
	int node_data_size;
	int input_size;
	int output_size;
} parallel_context;

parallel_context *parallel_init();
void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size);
int parallel_work(parallel_context *ctx);
int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *restart_filename);
void parallel_destroy(parallel_context* ctx);

int mpi_rank();
void start_timer();
double runtime();

#endif

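To make the calling convention concrete, here is a minimal usage sketch against the interface declared above. It is not part of the commit: the struct names and the trivial "double an integer" job are invented for illustration, it assumes linking with parallel.c, and when run under mpirun it relies on the library's default restart path (a restart/ directory). The structure mirrors what singular_values.c does below: worker ranks branch into parallel_work() and exit, the main node fills an input array and calls parallel_run().

#include <stdio.h>
#include "parallel.h"

struct in  { int value; };
struct out { int doubled; };

static int  my_init(const void *global, void *node) { (void)global; (void)node; return 0; }
static void my_destroy(const void *global, void *node) { (void)global; (void)node; }

static int my_job(const void *global, void *node, const void *input, void *output)
{
	(void)global; (void)node;
	((struct out *)output)->doubled = 2 * ((const struct in *)input)->value;
	return 0;
}

int main(void)
{
	parallel_context *ctx = parallel_init();
	parallel_set_datasize_and_callbacks(ctx, my_init, my_job, my_destroy,
		sizeof(int),                       /* global data: a single dummy int */
		1,                                 /* node data: unused, 1 byte */
		sizeof(struct in), sizeof(struct out));

	if(ctx->mpi_mode && ctx->rank != 0) {      /* worker ranks only ever run jobs */
		parallel_work(ctx);
		parallel_destroy(ctx);
		return 0;
	}

	int global = 0;
	struct in  inputs[4]  = {{1}, {2}, {3}, {4}};
	struct out outputs[4];
	parallel_run(ctx, &global, inputs, outputs, 4, NULL);   /* NULL: auto-named restart file */

	for(int i = 0; i < 4; i++)
		printf("%d -> %d\n", inputs[i].value, outputs[i].doubled);

	parallel_destroy(ctx);
	return 0;
}
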
(shell run script; filename not shown in this view)

@@ -1,6 +1,9 @@
 #!/bin/bash
 
-nmax=895882 # up to reflection group word length 22
+# nmax=895882 # up to reflection group word length 22 ( 555 group)
+nmax=700000 # up to reflection group word length 22 ( 444 group)
 # nmax=11575 # up to reflection group word length 14
 
-time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
+#time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
+
+time mpirun --mca opal_warn_on_missing_libcuda 0 --mca mpi_yield_when_idle 1 -np 4 ./singular_values 700000 4 4 4 1 10 100 1 10 100

singular_values.c

@@ -2,21 +2,55 @@
 #include "linalg.h"
 #include "mat.h"
 #include "enumerate_triangle_group.h"
+#include "parallel.h"
+
 #include <time.h>
 
 #define SWAP(t,x,y) do { t _tmp = (x); (x) = (y); (y) = _tmp; } while (0);
 
-#define DEBUG(msg, ...) fprintf(stderr, "[%10.3f] " msg, runtime(), ##__VA_ARGS__);
+#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
 //#define DEBUG(msg, ...)
 
 struct result {
 	int id;
-	int count;
 	mpq_t tr;
 	mpq_t trinv;
 	double x;
 	double y;
+	double slope;
 };
 
+// we want as much as possible to be node data, except if it is only known to the main node
+// (command line arguments) or should only be computed once (id list)
+
+struct global_data {
+	// command line arguments
+	unsigned int nmax;
+	unsigned int p1, p2, p3;
+	unsigned int sstart, send, sdenom;
+	unsigned int qstart, qend, qdenom;
+
+	unsigned int *id_list;
+	unsigned int id_list_length;
+};
+
+struct node_data {
+	group_t *group;
+	mat* matrices;
+	struct result *invariants;
+	struct result **distinct_invariants;
+	int distinct_invariants_length;
+	mps_context *solver;
+};
+
+struct input_data {
+	unsigned int snum, sden;
+	unsigned int qnum, qden;
+};
+
+struct output_data {
+	int max_slope_id;
+	double max_slope;
+};
+
 static int compare_result(const void *a_, const void *b_)
@@ -75,20 +109,6 @@ static int compare_result_by_slope(const void *a_, const void *b_)
 	return slopea > slopeb ? -1 : slopea < slopeb ? 1 : 0;
 }
 
-struct timespec starttime;
-
-static void start_timer()
-{
-	clock_gettime(CLOCK_MONOTONIC, &starttime);
-}
-
-static double runtime()
-{
-	struct timespec curtime;
-	double diff;
-	clock_gettime(CLOCK_MONOTONIC, &curtime);
-	return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9;
-}
-
 int compute_invariants(group_t *group, mat *matrices, struct result **invariants, int *n, int unique)
 {
 	mpq_t tmp;
@@ -120,10 +140,8 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants
 	for(int i = 0; i < ntraces; i++) {
 		if(i == 0 || compare_result(&invariants[i], &invariants[nuniq-1]) != 0) {
 			invariants[nuniq] = invariants[i];
-			invariants[nuniq]->count = 1;
 			nuniq++;
 		} else {
-			invariants[nuniq-1]->count++;
 			int oldlength = group->elements[invariants[nuniq-1]->id].length;
 			int newlength = group->elements[invariants[i]->id].length;
 			if(newlength < oldlength)
@@ -141,7 +159,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants
 	max_slope = 0;
 	for(int i = 0; i < nuniq; i++) {
 		retval = solve_characteristic_polynomial(solver, poly, invariants[i]->tr, invariants[i]->trinv, evs);
-		retval = 0;evs[0] = 2;evs[1] = 1;evs[2] = 0.5; // fake solving the polynomial for memory leak test
 		if(retval == 1) {
 			fprintf(stderr, "Error! Could not solve polynomial.\n");
 			continue;
@@ -161,6 +179,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants
 
 		invariants[i]->x = x;
 		invariants[i]->y = y;
+		invariants[i]->slope = y/x;
 
 		if(y/x > max_slope + 1e-12 && (x > 0.1 || y > 0.1)) {
 			max_slope_id = invariants[i]->id;
@@ -194,180 +213,249 @@ long check_memory_usage(mat *matrices, int n)
 	return total;
 }
 
-int main(int argc, char *argv[])
+void destroy_node(const void *_g, void *_n)
 {
-	mpq_t s, q, t, tmp;
-	int p1, p2, p3;
-	int sstart, send, sdenom, qstart, qend, qdenom;
-	mat *matrices;
-	group_t *group;
-	int nmax, n;
-	int max_slope_id;
-	char buf[100];
-	char buf2[100];
-	struct result *invariants;
-	struct result **distinct_invariants;
-
-	start_timer();
-
-	mpq_inits(s, q, t, tmp, NULL);
-	if(argc < 11) {
-		fprintf(stderr, "Usage: %s <N> <p1> <p2> <p3> <s start> <s end> <s denom> <q start> <q end> <q denom>\n", argv[0]);
-		exit(1);
-	}
-	nmax = atoi(argv[1]);
-	p1 = atoi(argv[2]);
-	p2 = atoi(argv[3]);
-	p3 = atoi(argv[4]);
-	sstart = atoi(argv[5]);
-	send = atoi(argv[6]);
-	sdenom = atoi(argv[7]);
-	qstart = atoi(argv[8]);
-	qend = atoi(argv[9]);
-	qdenom = atoi(argv[10]);
+	struct global_data *g = (struct global_data *)_g;
+	struct node_data *n = (struct node_data *)_n;
+
+	for(int i = 0; i < g->nmax; i++) {
+		mpq_clear(n->invariants[i].tr);
+		mpq_clear(n->invariants[i].trinv);
+	}
+	free(n->invariants);
+	free(n->distinct_invariants);
+	for(int i = 0; i < g->nmax; i++)
+		mat_clear(n->matrices[i]);
+	free(n->matrices);
+	coxeter_clear(n->group);
+}
+
+int init_node(const void *_g, void *_n)
+{
+	struct global_data *g = (struct global_data *)_g;
+	struct node_data *n = (struct node_data *)_n;
 
 	DEBUG("Allocate\n");
-	matrices = malloc(nmax*sizeof(mat));
-	for(int i = 0; i < nmax; i++)
-		mat_init(matrices[i], 3);
-	invariants = malloc(nmax*sizeof(struct result));
-	distinct_invariants = malloc(nmax*sizeof(struct result));
-	for(int i = 0; i < nmax; i++) {
-		mpq_init(invariants[i].tr);
-		mpq_init(invariants[i].trinv);
+	g->id_list = (int*)(g+1); // pointers get scrambled by transmission, reconstruct
+	n->matrices = malloc(g->nmax*sizeof(mat));
+	for(int i = 0; i < g->nmax; i++)
+		mat_init(n->matrices[i], 3);
+	n->invariants = malloc(g->nmax*sizeof(struct result));
+	n->distinct_invariants = malloc(g->nmax*sizeof(struct result)); // we won't need that many, but just in case
+	for(int i = 0; i < g->nmax; i++) {
+		mpq_init(n->invariants[i].tr);
+		mpq_init(n->invariants[i].trinv);
+		n->invariants[i].id = i;
 	}
 
 	// order of the triangle reflection generators: a, b, c
 	// order of the rotation orders: bc, ac, ab
 	DEBUG("Generate group\n");
-	group = coxeter_init_triangle(p1, p2, p3, nmax);
+	n->group = coxeter_init_triangle(g->p1, g->p2, g->p3, g->nmax);
 
-	// first run; compute all matrices
-	for(int i = 0; i < group->size; i++)
-		group->elements[i].need_to_compute = 1;
+	return 0;
+}
+
+int do_computation(const void *_g, void *_n, const void *_in, void *_out)
+{
+	struct global_data *g = (struct global_data *)_g;
+	struct node_data *n = (struct node_data *)_n;
+	struct input_data *in = (struct input_data *)_in;
+	struct output_data *out = (struct output_data *)_out;
+
+	mpq_t s, q;
+
+	mpq_inits(s, q, NULL);
+	mpq_set_ui(s, in->snum, in->sden);
+	mpq_set_ui(q, in->qnum, in->qden);
+
+	DEBUG("Computing max slope element for s = %d/%d and q = %d/%d.\n",
+		in->snum, in->sden,
+		in->qnum, in->qden);
+
+	for(int i = 0; i < n->group->size; i++)
+		n->group->elements[i].need_to_compute = 0;
+	n->group->elements[0].need_to_compute = 1;
+
+	int needed_elements = 1;
+	for(int i = 0; i < g->id_list_length; i++)
+	{
+		int id = g->id_list[i];
+		n->distinct_invariants[i] = &n->invariants[id];
+		groupelement_t *cur = &n->group->elements[id];
+		while(cur->need_to_compute == 0) {
+			cur->need_to_compute = 1;
+			needed_elements++;
+			cur = cur->parent->parent; // also need to compute its even-length ancestors
+		}
+		cur = n->group->elements[id].inverse;
+		while(cur->need_to_compute == 0) {
+			cur->need_to_compute = 1;
+			needed_elements++;
+			cur = cur->parent->parent;
+		}
+	}
+	n->distinct_invariants_length = g->id_list_length;
+
+	DEBUG("Need to compute %d elements to get %d traces up to reflection length %d\n",
+		needed_elements, g->id_list_length, n->group->elements[n->group->size-1].length);
+
+	DEBUG("Compute matrices\n");
+	enumerate(n->group, n->matrices, g->p1, g->p2, g->p3, s, q);
+
+	DEBUG("Compute invariants\n");
+	out->max_slope_id = compute_invariants(
+		n->group, n->matrices,
+		n->distinct_invariants, &n->distinct_invariants_length, 1);
+	out->max_slope = n->invariants[out->max_slope_id].slope;
+
+	mpq_clears(s, q, NULL);
+
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	char buf[100];
+	char buf2[100];
+	struct global_data *g;
+	struct node_data n;
+
+	start_timer();
+
+	// parse command line arguments
+	if(argc < 11) {
+		fprintf(stderr, "Usage: %s <N> <p1> <p2> <p3> <s start> <s end> <s denom> <q start> <q end> <q denom>\n", argv[0]);
+		exit(1);
+	}
+	int nmax = atoi(argv[1]);
+	g = (struct global_data*)malloc(sizeof(struct global_data) + nmax*sizeof(int));
+	g->id_list = (int*)(g+1);
+	g->nmax = nmax;
+	g->p1 = atoi(argv[2]);
+	g->p2 = atoi(argv[3]);
+	g->p3 = atoi(argv[4]);
+	g->sstart = atoi(argv[5]);
+	g->send = atoi(argv[6]);
+	g->sdenom = atoi(argv[7]);
+	g->qstart = atoi(argv[8]);
+	g->qend = atoi(argv[9]);
+	g->qdenom = atoi(argv[10]);
+
+	// initialize
+	parallel_context *ctx = parallel_init();
+	parallel_set_datasize_and_callbacks(ctx, init_node, do_computation, destroy_node,
+		sizeof(struct global_data) + g->nmax*sizeof(int),
+		sizeof(struct node_data),
+		sizeof(struct input_data),
+		sizeof(struct output_data));
+	if(ctx->mpi_mode == 1 && ctx->rank != 0) {
+		// worker mode
+		parallel_work(ctx);
+		parallel_destroy(ctx);
+		exit(0);
+	}
+
+	init_node(g, &n);
 
 	// use very generic values for the pilot run unless sstart=send and qstart=qend
-	if(sstart == send && qstart == qend) {
-		mpq_set_ui(s, sstart, sdenom);
-		mpq_set_ui(q, qstart, qdenom);
-		DEBUG("Single run for s = %d/%d, q = %d/%d\n", sstart, sdenom, qstart, qdenom);
+	struct input_data pilot_input;
+	struct output_data pilot_output;
+	if(g->sstart == g->send && g->qstart == g->qend) {
+		pilot_input.snum = g->sstart;
+		pilot_input.sden = g->sdenom;
+		pilot_input.qnum = g->qstart;
+		pilot_input.qden = g->qdenom;
+		DEBUG("Single run for s = %d/%d, q = %d/%d\n", g->sstart, g->sdenom, g->qstart, g->qdenom);
 	} else {
-		mpq_set_ui(s, 4, 100);
-		mpq_set_ui(q, 7, 100);
+		pilot_input.snum = 4;
+		pilot_input.sden = 100;
+		pilot_input.qnum = 7;
+		pilot_input.qden = 100;
 		DEBUG("Initial run for s = %d/%d, q = %d/%d\n", 4, 100, 7, 100);
 	}
 
-	DEBUG("Compute matrices\n");
-	enumerate(group, matrices, p1, p2, p3, s, q);
-
-	// prepare array of ids
-	n = 0;
-	for(int i = 0; i < group->size; i++)
-	{
-		if(group->elements[i].length % 2 != 0 || !group->elements[i].inverse)
-			continue;
-		invariants[i].id = i;
-		distinct_invariants[n++] = &invariants[i];
-	}
-
-	DEBUG("Compute invariants\n");
-	max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 1);
-
-	// prepare for next time; don't need to change ids in distinct_invariants!
-	for(int i = 0; i < group->size; i++)
-		group->elements[i].need_to_compute = 0;
-	group->elements[0].need_to_compute = 1;
-	int multiplication_count = 1;
-	for(int i = 0; i < n; i++) {
-		groupelement_t *cur = &group->elements[distinct_invariants[i]->id];
-		while(cur->need_to_compute == 0) {
-			cur->need_to_compute = 1;
-			multiplication_count++;
-			cur = cur->parent->parent; // also need to compute its even-length ancestors
-		}
-		cur = group->elements[distinct_invariants[i]->id].inverse;
-		while(cur->need_to_compute == 0) {
-			cur->need_to_compute = 1;
-			multiplication_count++;
-			cur = cur->parent->parent;
-		}
-	}
-
-	DEBUG("Would have needed %d matrix multiplications for %d unique traces up to reflection length %d\n", multiplication_count, n, group->elements[group->size-1].length);
-
-	if(sstart != send || qstart != qend) {
-		for(int sloop = sstart; sloop <= send; sloop++) {
-			for(int qloop = qstart; qloop <= qend; qloop++) {
-				DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, sdenom, qloop, qdenom);
-				mpq_set_ui(s, sloop, sdenom);
-				mpq_set_ui(q, qloop, qdenom);
-				DEBUG("Compute matrices\n");
-				enumerate(group, matrices, p1, p2, p3, s, q);
-				DEBUG("Compute invariants\n");
-				max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 0);
-				// output
-				gmp_printf("%Qd %Qd %s\n", s, q,
-					print_word(&group->elements[max_slope_id], buf));
-				fflush(stdout);
-			}
-		}
+	g->id_list_length = 0;
+	for(int i = 0; i < n.group->size; i++)
+		if(n.group->elements[i].length % 2 == 0 && n.group->elements[i].inverse)
+			g->id_list[g->id_list_length++] = i;
+
+	do_computation(g, &n, &pilot_input, &pilot_output);
+
+	for(int i = 0; i < n.distinct_invariants_length; i++)
+		g->id_list[i] = n.distinct_invariants[i]->id;
+	g->id_list_length = n.distinct_invariants_length;
+
+	if(g->sstart != g->send || g->qstart != g->qend) {
+		struct input_data *inputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data));
+		struct output_data *outputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data));
+
+		int njobs = 0;
+		for(int sloop = g->sstart; sloop <= g->send; sloop++) {
+			for(int qloop = g->qstart; qloop <= g->qend; qloop++) {
+				inputs[njobs].sden = g->sdenom;
+				inputs[njobs].qden = g->qdenom;
+				inputs[njobs].snum = sloop;
+				inputs[njobs].qnum = qloop;
+				njobs++;
+			}
+		}
+
+		parallel_run(ctx, g, inputs, outputs, njobs, NULL);
+
+		// DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, g->sdenom, qloop, g->qdenom);
+
+		for(int i = 0; i < njobs; i++)
+		{
+			gmp_printf("%d/%d %d/%d %s %f\n",
+				inputs[i].snum, inputs[i].sden, inputs[i].qnum, inputs[i].qden,
+				print_word(&n.group->elements[outputs[i].max_slope_id], buf),
+				outputs[i].max_slope);
+		}
+
+		free(inputs);
+		free(outputs);
 	} else {
 		// output
-		for(int i = 0; i < n; i++) {
-			double slope = distinct_invariants[i]->y/distinct_invariants[i]->x;
-
+		for(int i = 0; i < n.distinct_invariants_length; i++) {
 			// exclude tr = trinv = 2/1/0/-1/3
+			mpq_t tmp;
+			mpq_init(tmp);
 			mpq_set_si(tmp, 2, 1);
-			if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
-				mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
+			if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
+				mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
 				continue;
 			mpq_set_si(tmp, 1, 1);
-			if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
-				mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
+			if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
+				mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
 				continue;
 			mpq_set_si(tmp, 0, 1);
-			if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
-				mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
+			if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
+				mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
 				continue;
 			mpq_set_si(tmp, -1, 1);
-			if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
-				mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
+			if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
+				mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
 				continue;
 			mpq_set_si(tmp, 3, 1);
-			if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
-				mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
+			if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
+				mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
 				continue;
+			mpq_clear(tmp);
 
-			gmp_printf("%d %d %s %f\n",
-				distinct_invariants[i]->id, distinct_invariants[i]->count,
-				print_word(&group->elements[distinct_invariants[i]->id], buf),
-				slope
-				);
-
-			/*
-			gmp_printf("%d %d %d %Qd %Qd %f %f %f %f %f %s\n",
-				distinct_invariants[i]->id, distinct_invariants[i]->count, cumulative,
-				distinct_invariants[i]->tr, distinct_invariants[i]->trinv,
-				log(fabs(mpq_get_d(distinct_invariants[i]->tr))), log(fabs(mpq_get_d(distinct_invariants[i]->trinv))),
-				distinct_invariants[i]->x, distinct_invariants[i]->y, slope,
-				print_word(&group->elements[distinct_invariants[i]->id], buf)
-				);
-			*/
+			double slope = n.distinct_invariants[i]->y/n.distinct_invariants[i]->x;
+			gmp_printf("%d %s %f\n",
+				n.distinct_invariants[i]->id,
+				print_word(&n.group->elements[n.distinct_invariants[i]->id], buf),
+				slope);
 		}
 	}
 
-	DEBUG("Clean up\n");
-	for(int i = 0; i < nmax; i++) {
-		mpq_clear(invariants[i].tr);
-		mpq_clear(invariants[i].trinv);
-	}
-	free(invariants);
-	free(distinct_invariants);
-	for(int i = 0; i < nmax; i++)
-		mat_clear(matrices[i]);
-	free(matrices);
-	coxeter_clear(group);
-	mpq_clears(s, q, t, tmp, NULL);
+	destroy_node(g, &n);
+	free(g);
+	parallel_destroy(ctx);
 }
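One detail of the new singular_values.c worth spelling out (sketch only, not from the commit): struct global_data is broadcast as a single flat block of sizeof(struct global_data) + nmax*sizeof(int) bytes, with the id list stored directly behind the struct. The embedded id_list pointer is therefore meaningless on the receiving side, which is why init_node() re-points it with g->id_list = (int*)(g+1). The toy program below shows the same trick with an invented packed_header struct.

#include <stdio.h>
#include <stdlib.h>

struct packed_header { int nmax; int *ids; };   /* invented for illustration */

int main(void)
{
	int nmax = 4;
	struct packed_header *h = malloc(sizeof(*h) + nmax * sizeof(int));
	h->nmax = nmax;
	h->ids = (int *)(h + 1);                    /* trailing array lives right after the struct */
	for(int i = 0; i < nmax; i++)
		h->ids[i] = 10 * i;

	/* after memcpy-ing or broadcasting the whole block, the receiver must redo: */
	/* h->ids = (int *)(h + 1); */
	printf("ids[2] = %d\n", h->ids[2]);
	free(h);
	return 0;
}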