From d2d91d68b3d04e7906938ff4920207bdf32cc776 Mon Sep 17 00:00:00 2001 From: Florian Stecker Date: Mon, 14 Feb 2022 20:34:06 -0500 Subject: [PATCH] create MPI library; not very optimized yet --- .gitignore | 2 + Makefile | 25 +-- parallel.c | 349 +++++++++++++++++++++++++++++++++++++++++ parallel.h | 118 ++++++++++++++ run_local | 7 +- singular_values.c | 392 ++++++++++++++++++++++++++++------------------ 6 files changed, 728 insertions(+), 165 deletions(-) create mode 100644 parallel.c create mode 100644 parallel.h diff --git a/.gitignore b/.gitignore index 1f23877..ebfcc8f 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ billiard_words *.png *.hi gmon.out +restart +core diff --git a/Makefile b/Makefile index 0c7d7ba..eec1e19 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,14 @@ -HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h +HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h parallel.h #SPECIAL_OPTIONS=-O0 -g -D_DEBUG -SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline -#SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline +#SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline +SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline #SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline -mavx512f -mavx512cd -mavx512er -mavx512pf # KNL #SPECIAL_OPTIONS= OPTIONS=-I../mps/include -L../mps/lib -pthread -m64 -std=gnu99 -D_GNU_SOURCE $(SPECIAL_OPTIONS) -all: singular_values special_element singular_values_mpi convert billiard_words +all: singular_values special_element convert billiard_words convert: convert.hs ghc --make -dynamic convert.hs @@ -16,11 +16,11 @@ convert: convert.hs billiard_words: billiard_words.hs ghc --make -dynamic billiard_words.hs -singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o - gcc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps +singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o parallel.o + mpicc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o parallel.o -lm -lgmp -lmps -singular_values_mpi: singular_values_mpi.o coxeter.o mat.o - mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps +#singular_values_mpi: singular_values_mpi.o coxeter.o mat.o +# mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_group.o gcc $(OPTIONS) -o special_element coxeter.o linalg.o special_element.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps -lgsl -lcblas @@ -28,8 +28,8 @@ special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_g singular_values.o: singular_values.c $(HEADERS) gcc $(OPTIONS) -c singular_values.c -singular_values_mpi.o: singular_values_mpi.c $(HEADERS) - mpicc $(OPTIONS) -c singular_values_mpi.c +#singular_values_mpi.o: singular_values_mpi.c $(HEADERS) +# mpicc $(OPTIONS) -c singular_values_mpi.c special_element.o: special_element.c $(HEADERS) gcc $(OPTIONS) -c special_element.c @@ -46,5 +46,8 @@ coxeter.o: coxeter.c $(HEADERS) mat.o: mat.c $(HEADERS) gcc $(OPTIONS) -c mat.c +parallel.o: parallel.c $(HEADERS) + gcc $(OPTIONS) -c parallel.c + clean: - rm -f singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o + rm -f 
singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o parallel.o diff --git a/parallel.c b/parallel.c new file mode 100644 index 0000000..84ca72c --- /dev/null +++ b/parallel.c @@ -0,0 +1,349 @@ +#include "parallel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__) +//#define DEBUG(msg, ...) fprintf(stderr, "[ %10.3f] " msg, runtime(), ##__VA_ARGS__) +//#define DEBUG_MPI(msg, node, ...) fprintf(stderr, "[%003d%10.3f] " msg, node, runtime(), ##__VA_ARGS__) +#define DONE(x) *((int*)(x)) + +enum message_tag { + PARALLEL_ORDER, + PARALLEL_RESULT, + PARALLEL_SHUTDOWN, + PARALLEL_GLOBAL_DATA +}; + +struct timespec starttime; + +int mpi_rank(int activate_mpi) +{ + static int active = 0; + if(activate_mpi) + active = 1; + + if(!active) + return 0; + else { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return rank; + } +} + +void start_timer() +{ + clock_gettime(CLOCK_MONOTONIC, &starttime); +} + +double runtime() +{ + struct timespec curtime; + double diff; + clock_gettime(CLOCK_MONOTONIC, &curtime); + return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9; +} + +parallel_context *parallel_init() +{ + parallel_context *ctx = malloc(sizeof(parallel_context)); + + if(!getenv("OMPI_COMM_WORLD_SIZE")) { + ctx->mpi_mode = 0; + DEBUG("Running standalone.\n"); + return ctx; + } + + ctx->mpi_mode = 1; + int result = MPI_Init(NULL, NULL); + MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); + MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); + MPI_Get_processor_name(ctx->processor_name, &ctx->processor_name_len); + + mpi_rank(1); // display the rank in debug output from now on + + if(ctx->rank == 0) + DEBUG("Running in mpi mode, %d nodes.\n", ctx->size); + + return ctx; +} + +void parallel_destroy(parallel_context* ctx) +{ + if(ctx->mpi_mode) { + MPI_Type_free(&ctx->order_datatype); + MPI_Type_free(&ctx->result_datatype); + MPI_Finalize(); + } + + free(ctx); +} + +void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size) +{ + ctx->init = init; + ctx->destroy = destroy; + ctx->job = job; + ctx->global_data_size = global_data_size; + ctx->node_data_size = node_data_size; + ctx->input_size = input_size; + ctx->output_size = output_size; + + if(ctx->mpi_mode) { + // create a datatype for job orders, consisting of an integer (the job id) and a user-defined section + int order_blocklengths[2] = {1, input_size}; + MPI_Aint order_displacements[2] = {0, sizeof(int)}; + MPI_Datatype order_types[2] = {MPI_INT, MPI_BYTE}; + MPI_Type_create_struct(2, order_blocklengths, order_displacements, order_types, &ctx->order_datatype); + MPI_Type_commit(&ctx->order_datatype); + + int result_blocklengths[2] = {1, output_size}; + MPI_Aint result_displacements[2] = {0, sizeof(int)}; + MPI_Datatype result_types[2] = {MPI_INT, MPI_BYTE}; + MPI_Type_create_struct(2, result_blocklengths, result_displacements, result_types, &ctx->result_datatype); + MPI_Type_commit(&ctx->result_datatype); + } +} + +int parallel_work(parallel_context *ctx) +{ + // do nothing in non-mpi mode + if(ctx->mpi_mode == 0) + return 0; + + 
MPI_Status status; + void *global_data = malloc(ctx->global_data_size); + void *node_data = malloc(ctx->node_data_size); + void *input_and_job_nr = malloc(ctx->input_size + sizeof(int)); + void *input = input_and_job_nr + sizeof(int); + int *job_nr = (int *)input_and_job_nr; + void *output_and_job_nr = malloc(ctx->output_size + sizeof(int)); + void *output = output_and_job_nr + sizeof(int); + int *output_job_nr = (int *)output_and_job_nr; + double jobtime; + + // wait for global data + MPI_Bcast(global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD); + + DEBUG("Global data received\n"); + + // initialize node_data (and do once-per-node computation) + ctx->init(global_data, node_data); + + DEBUG("Initialization completed\n"); + + while(1) { + MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, + &status); + + DEBUG("Message received: source = %d, tag = %d\n", status.MPI_SOURCE, status.MPI_TAG); + + if(status.MPI_TAG == PARALLEL_SHUTDOWN) { + DEBUG("Shutting down\n"); + break; + } else if(status.MPI_TAG == PARALLEL_ORDER) { + MPI_Recv(input_and_job_nr, + 1, ctx->order_datatype, + 0, PARALLEL_ORDER, MPI_COMM_WORLD, + &status); + + DEBUG("Working on job %d\n", *job_nr); + + jobtime = -MPI_Wtime(); + + // do the actual work + ctx->job(global_data, node_data, input, output); + + jobtime += MPI_Wtime(); + + DEBUG("Finished job %d in %f seconds\n", *job_nr, jobtime); + + *output_job_nr = *job_nr; + MPI_Send(output_and_job_nr, + 1, ctx->result_datatype, + 0, PARALLEL_RESULT, MPI_COMM_WORLD); + } + } + + ctx->destroy(global_data, node_data); + + free(global_data); + free(node_data); + free(input_and_job_nr); + free(output_and_job_nr); + + return 0; +} + +int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *_restart_filename) +{ + // in non-mpi-mode, just run init1, init2, forall(jobs) job + if(ctx->mpi_mode == 0) { + int result; + void *node_data = malloc(ctx->node_data_size); + result = ctx->init(global_data, node_data); + if(result != 0) + goto cleanup_standalone; + + for(int i = 0; i < njobs; i++) { + result = ctx->job( + global_data, + node_data, + input_array + ctx->input_size*i, + output_array + ctx->output_size*i); + if(result != 0) + goto cleanup_standalone; + } + + cleanup_standalone: + ctx->destroy(global_data, node_data); + return result; + } else { + // if no restart file was specified, pick a filename + char *restart_filename; + char buffer[128]; + int restartf; + if(_restart_filename == NULL) { + time_t t = time(NULL); + struct tm *loctm = localtime(&t); + strftime(buffer, sizeof(buffer), "restart/restart_%y%m%d_%H%M%S", loctm); + restart_filename = buffer; + } else { + restart_filename = (char *)_restart_filename; + } + + // open restart file if it exists, otherwise create it + int continuing = 1; + restartf = open(restart_filename, O_RDWR); + if(restartf == -1 && errno == ENOENT) { + restartf = open(restart_filename, O_RDWR | O_CREAT, 0666); + continuing = 0; + } + if(restartf == -1) { + DEBUG("Error opening restart file: %s\n", strerror(errno)); + exit(1); + } + + // map restart file + int itemsize = (ctx->output_size + sizeof(int)); // for every job, store output, and completed flag + ftruncate(restartf, njobs*itemsize); + void *alljobs = mmap(0, njobs*itemsize, PROT_READ | PROT_WRITE, MAP_SHARED, restartf, 0); + if(alljobs == MAP_FAILED) { + DEBUG("Error mapping restart file: %s\n", strerror(errno)); + exit(1); + } + + // count completed jobs, or initialize jobs + int completed = 0; + 
if(continuing) { + for(int i = 0; i < njobs; i++) + if(DONE(alljobs + i*itemsize)) + completed++; + } else { + for(int i = 0; i < njobs; i++) { + DONE(alljobs + i*itemsize) = 0; + memcpy(alljobs + i*itemsize + sizeof(int), input_array + i*ctx->input_size, ctx->input_size); // copy input data + } + } + + fsync(restartf); + + if(continuing) { + DEBUG("Continuing from restart file, %d/%d jobs completed, %d nodes\n", completed, njobs, ctx->size); + } else { + DEBUG("Starting from scratch, %d jobs, %d nodes\n", njobs, ctx->size); + } + + if(completed >= njobs) + goto cleanup_mpi; + + /* Send global data */ + MPI_Bcast((void*)global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD); + + DEBUG("Global data sent\n"); + + void *input_message_buffer = malloc(ctx->input_size + sizeof(int)); + void *output_message_buffer = malloc(ctx->output_size + sizeof(int)); + + // find next unfinished job + int current = 0; + while(current < njobs && DONE(alljobs + current*itemsize)) + current++; + + // assign initial jobs, 2 for each worker thread + for(int i = 0; i < 2*(ctx->size-1); i++) { + if(current >= njobs) // all jobs are assigned + break; + + // send job id and input data + // send to all nodes except ourself (node 0) + *((int*)input_message_buffer) = current; + memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size); + MPI_Send(input_message_buffer, 1, ctx->order_datatype, + i%(ctx->size-1)+1, PARALLEL_ORDER, MPI_COMM_WORLD); + + DEBUG("Job %d sent to node %d\n", current, i%(ctx->size-1)+1); + current++; + } + + MPI_Status status; + int active_worker_nodes = ctx->size - 1; + while(1) { + MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + if(status.MPI_TAG == PARALLEL_RESULT) { + MPI_Recv(output_message_buffer, 1, ctx->result_datatype, + MPI_ANY_SOURCE, PARALLEL_RESULT, MPI_COMM_WORLD, &status); + + int id = *((int*)output_message_buffer); + memcpy(alljobs + id*itemsize + sizeof(int), output_message_buffer + sizeof(int), ctx->output_size); + DONE(alljobs + id*itemsize) = 1; + completed++; + + DEBUG("job %d completed by node %d\n", id, status.MPI_SOURCE); + + // todo: deal with unresponsive nodes + // strategy: when no jobs left, go through unfinished list again, incrementing oversubscribe counter + // if oversubscribe counter is at limit, shut node down instead + // + + if(current >= njobs) { // all jobs are assigned, shut down node + MPI_Send(NULL, 0, MPI_BYTE, status.MPI_SOURCE, PARALLEL_SHUTDOWN, MPI_COMM_WORLD); + active_worker_nodes--; + if(active_worker_nodes) + continue; + else + break; + } + + *((int*)input_message_buffer) = current; + memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size); + MPI_Send(input_message_buffer, 1, ctx->order_datatype, + status.MPI_SOURCE, PARALLEL_ORDER, MPI_COMM_WORLD); + + DEBUG("Job %d sent to node %d\n", current, status.MPI_SOURCE); + current++; + } + } + + for(int i = 0; i < njobs; i++) { + memcpy(output_array + i*ctx->output_size, alljobs + i*itemsize + sizeof(int), ctx->output_size); + } + + free(input_message_buffer); + free(output_message_buffer); + + cleanup_mpi: + munmap(alljobs, njobs*itemsize); + close(restartf); + } + + return 0; +} diff --git a/parallel.h b/parallel.h new file mode 100644 index 0000000..4239875 --- /dev/null +++ b/parallel.h @@ -0,0 +1,118 @@ +#ifndef PARALLEL_H +#define PARALLEL_H + +/* + this is a library to parallelize workloads which can be split up naturally + into a sequence of independent jobs, using MPI. 
A program will usually
+
+   - do precomputation
+   - fill array with input data
+   - do the parallel work
+   - print the output data
+
+   we want to enable restarts, so that only unfinished jobs need to be repeated.
+   Further, we want to be resilient to slow/unreliable network and to losing
+   nodes. There is a main node and a number of workers. The main node does the
+   precomputation and then retires to do administrative work, and the workers
+   do the actual jobs. We also want to switch to serial mode if the program is
+   called without MPI.
+
+   The following data has to be transmitted between nodes:
+   - results of the precomputation (read-only, shared between nodes)
+   - job-specific input data, generated by main node before parallel part
+   - output data for each job
+
+   the parallel work shall be given as a callback function which takes
+   input data and precomputation data as parameters
+
+   the above program will look like this for us:
+
+   - parallel_init
+   - if we are a worker, do parallel_work(init_callback, job_callback), exit
+   - do precomputation
+   - fill array with input data
+   - output_array = parallel_run(input_array)
+   - print the output data
+
+   parallel_init:
+   - check if we're running as an mpi program
+   - init mpi, check what kind of node we are
+
+   parallel_work(init_callback1, init_callback2, job_callback):
+   - receive global_precomp (???)
+   - worker_precomp = init_callback2(global_precomp, worker_precomp)
+   - infinite loop:
+     - wait for job on network, receive input
+     - output = job_callback(global_precomp, worker_precomp, input)
+     - send output on network
+     - exit loop on shutdown signal
+
+   parallel_run(global_precomp, input_array, restart file, callbacks):
+   - check if we're running as an MPI program
+   - send global_precomp to all nodes (if MPI)
+   - if(restart file given and exists) read restart file
+   - else create new restart file
+   - until(all jobs finished):
+     - if MPI:
+       - send next job & input to appropriate node
+       - if all jobs are in work, reassign unfinished ones (up to limit)
+       - collect outputs
+     - if no MPI:
+       - worker_precomp = init_callback1
+       - worker_precomp = init_callback2(global_precomp, worker_precomp)
+       - for(j in jobs)
+         - output(j) = job_callback(global_precomp, worker_precomp, input(j))
+   - delete restart file
+   - return array of outputs
+
+   parallel_destroy():
+   - free everything
+
+   have a context?
+   probably yes: parallel_context
+
+   plan:
+   - make interface
+   - implement no-MPI part
+   - restructure singular_values.c to use interface
+   - implement MPI part
+*/
+
+#include <mpi.h>
+#include <time.h>
+
+typedef void (*parallel_callback_destroy)(const void*, void*);
+typedef int (*parallel_callback_init)(const void*,void*);
+typedef int (*parallel_callback_job)(const void*,void*,const void*,void*);
+
+typedef struct {
+	int mpi_mode;
+	struct timespec starttime;
+	char processor_name[MPI_MAX_PROCESSOR_NAME];
+	int processor_name_len;
+	int rank;
+	int size;
+	MPI_Datatype order_datatype;
+	MPI_Datatype result_datatype;
+	parallel_callback_init init;
+	parallel_callback_job job;
+	parallel_callback_destroy destroy;
+	void *global_data;
+	void *node_data;
+	int global_data_size;
+	int node_data_size;
+	int input_size;
+	int output_size;
+} parallel_context;
+
+parallel_context *parallel_init();
+void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size);
+int parallel_work(parallel_context *ctx);
+int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *restart_filename);
+void parallel_destroy(parallel_context* ctx);
+
+int mpi_rank(int activate_mpi);
+void start_timer();
+double runtime();
+
+
+#endif
diff --git a/run_local b/run_local
index caa99af..76dcf8c 100755
--- a/run_local
+++ b/run_local
@@ -1,6 +1,9 @@
 #!/bin/bash
-nmax=895882 # up to reflection group word length 22
+# nmax=895882 # up to reflection group word length 22 ( 555 group)
+nmax=700000 # up to reflection group word length 22 ( 444 group)
 # nmax=11575 # up to reflection group word length 14
-time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
+#time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
+
+time mpirun --mca opal_warn_on_missing_libcuda 0 --mca mpi_yield_when_idle 1 -np 4 ./singular_values 700000 4 4 4 1 10 100 1 10 100
diff --git a/singular_values.c b/singular_values.c
index 75e267a..85b248d 100644
--- a/singular_values.c
+++ b/singular_values.c
@@ -2,21 +2,55 @@
 #include "linalg.h"
 #include "mat.h"
 #include "enumerate_triangle_group.h"
+#include "parallel.h"
 #include
 #define SWAP(t,x,y) do { t _tmp = (x); (x) = (y); (y) = _tmp; } while (0);
-#define DEBUG(msg, ...) fprintf(stderr, "[%10.3f] " msg, runtime(), ##__VA_ARGS__);
+#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
 //#define DEBUG(msg, ...)
struct result { int id; - int count; mpq_t tr; mpq_t trinv; double x; double y; + double slope; +}; + +// we want as much as possible to be node data, except if it is only known to the main node +// (command line arguments) or should only be computed once (id list) + +struct global_data { + // command line arguments + unsigned int nmax; + unsigned int p1, p2, p3; + unsigned int sstart, send, sdenom; + unsigned int qstart, qend, qdenom; + + unsigned int *id_list; + unsigned int id_list_length; +}; + +struct node_data { + group_t *group; + mat* matrices; + struct result *invariants; + struct result **distinct_invariants; + int distinct_invariants_length; + mps_context *solver; +}; + +struct input_data { + unsigned int snum, sden; + unsigned int qnum, qden; +}; + +struct output_data { + int max_slope_id; + double max_slope; }; static int compare_result(const void *a_, const void *b_) @@ -75,20 +109,6 @@ static int compare_result_by_slope(const void *a_, const void *b_) return slopea > slopeb ? -1 : slopea < slopeb ? 1 : 0; } -struct timespec starttime; -static void start_timer() -{ - clock_gettime(CLOCK_MONOTONIC, &starttime); -} - -static double runtime() -{ - struct timespec curtime; - double diff; - clock_gettime(CLOCK_MONOTONIC, &curtime); - return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9; -} - int compute_invariants(group_t *group, mat *matrices, struct result **invariants, int *n, int unique) { mpq_t tmp; @@ -120,10 +140,8 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants for(int i = 0; i < ntraces; i++) { if(i == 0 || compare_result(&invariants[i], &invariants[nuniq-1]) != 0) { invariants[nuniq] = invariants[i]; - invariants[nuniq]->count = 1; nuniq++; } else { - invariants[nuniq-1]->count++; int oldlength = group->elements[invariants[nuniq-1]->id].length; int newlength = group->elements[invariants[i]->id].length; if(newlength < oldlength) @@ -141,7 +159,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants max_slope = 0; for(int i = 0; i < nuniq; i++) { retval = solve_characteristic_polynomial(solver, poly, invariants[i]->tr, invariants[i]->trinv, evs); - retval = 0;evs[0] = 2;evs[1] = 1;evs[2] = 0.5; // fake solving the polynomial for memory leak test + if(retval == 1) { fprintf(stderr, "Error! 
Could not solve polynomial.\n"); continue; @@ -161,6 +179,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants invariants[i]->x = x; invariants[i]->y = y; + invariants[i]->slope = y/x; if(y/x > max_slope + 1e-12 && (x > 0.1 || y > 0.1)) { max_slope_id = invariants[i]->id; @@ -194,180 +213,249 @@ long check_memory_usage(mat *matrices, int n) return total; } -int main(int argc, char *argv[]) + +void destroy_node(const void *_g, void *_n) { - mpq_t s, q, t, tmp; - int p1, p2, p3; - int sstart, send, sdenom, qstart, qend, qdenom; - mat *matrices; - group_t *group; - int nmax, n; - int max_slope_id; - char buf[100]; - char buf2[100]; - struct result *invariants; - struct result **distinct_invariants; + struct global_data *g = (struct global_data *)_g; + struct node_data *n = (struct node_data *)_n; - start_timer(); - - mpq_inits(s, q, t, tmp, NULL); - if(argc < 11) { - fprintf(stderr, "Usage: %s \n", argv[0]); - exit(1); + for(int i = 0; i < g->nmax; i++) { + mpq_clear(n->invariants[i].tr); + mpq_clear(n->invariants[i].trinv); } - nmax = atoi(argv[1]); - p1 = atoi(argv[2]); - p2 = atoi(argv[3]); - p3 = atoi(argv[4]); - sstart = atoi(argv[5]); - send = atoi(argv[6]); - sdenom = atoi(argv[7]); - qstart = atoi(argv[8]); - qend = atoi(argv[9]); - qdenom = atoi(argv[10]); + free(n->invariants); + free(n->distinct_invariants); + for(int i = 0; i < g->nmax; i++) + mat_clear(n->matrices[i]); + free(n->matrices); + coxeter_clear(n->group); +} + +int init_node(const void *_g, void *_n) +{ + struct global_data *g = (struct global_data *)_g; + struct node_data *n = (struct node_data *)_n; DEBUG("Allocate\n"); - matrices = malloc(nmax*sizeof(mat)); - for(int i = 0; i < nmax; i++) - mat_init(matrices[i], 3); - invariants = malloc(nmax*sizeof(struct result)); - distinct_invariants = malloc(nmax*sizeof(struct result)); - for(int i = 0; i < nmax; i++) { - mpq_init(invariants[i].tr); - mpq_init(invariants[i].trinv); + g->id_list = (int*)(g+1); // pointers get scrambled by transmission, reconstruct + n->matrices = malloc(g->nmax*sizeof(mat)); + for(int i = 0; i < g->nmax; i++) + mat_init(n->matrices[i], 3); + n->invariants = malloc(g->nmax*sizeof(struct result)); + n->distinct_invariants = malloc(g->nmax*sizeof(struct result)); // we won't need that many, but just in case + for(int i = 0; i < g->nmax; i++) { + mpq_init(n->invariants[i].tr); + mpq_init(n->invariants[i].trinv); + n->invariants[i].id = i; } // order of the triangle reflection generators: a, b, c // order of the rotation orders: bc, ac, ab DEBUG("Generate group\n"); - group = coxeter_init_triangle(p1, p2, p3, nmax); + n->group = coxeter_init_triangle(g->p1, g->p2, g->p3, g->nmax); - // first run; compute all matrices - for(int i = 0; i < group->size; i++) - group->elements[i].need_to_compute = 1; + return 0; +} + +int do_computation(const void *_g, void *_n, const void *_in, void *_out) +{ + struct global_data *g = (struct global_data *)_g; + struct node_data *n = (struct node_data *)_n; + struct input_data *in = (struct input_data *)_in; + struct output_data *out = (struct output_data *)_out; + + mpq_t s, q; + + mpq_inits(s, q, NULL); + mpq_set_ui(s, in->snum, in->sden); + mpq_set_ui(q, in->qnum, in->qden); + + DEBUG("Computing max slope element for s = %d/%d and q = %d/%d.\n", + in->snum, in->sden, + in->qnum, in->qden); + + for(int i = 0; i < n->group->size; i++) + n->group->elements[i].need_to_compute = 0; + n->group->elements[0].need_to_compute = 1; + + int needed_elements = 1; + for(int i = 0; i < 
g->id_list_length; i++) + { + int id = g->id_list[i]; + n->distinct_invariants[i] = &n->invariants[id]; + groupelement_t *cur = &n->group->elements[id]; + while(cur->need_to_compute == 0) { + cur->need_to_compute = 1; + needed_elements++; + cur = cur->parent->parent; // also need to compute its even-length ancestors + } + cur = n->group->elements[id].inverse; + while(cur->need_to_compute == 0) { + cur->need_to_compute = 1; + needed_elements++; + cur = cur->parent->parent; + } + } + n->distinct_invariants_length = g->id_list_length; + + DEBUG("Need to compute %d elements to get %d traces up to reflection length %d\n", + needed_elements, g->id_list_length, n->group->elements[n->group->size-1].length); + + DEBUG("Compute matrices\n"); + enumerate(n->group, n->matrices, g->p1, g->p2, g->p3, s, q); + + DEBUG("Compute invariants\n"); + out->max_slope_id = compute_invariants( + n->group, n->matrices, + n->distinct_invariants, &n->distinct_invariants_length, 1); + out->max_slope = n->invariants[out->max_slope_id].slope; + + mpq_clears(s, q, NULL); + + return 0; +} + +int main(int argc, char *argv[]) +{ + char buf[100]; + char buf2[100]; + struct global_data *g; + struct node_data n; + + start_timer(); + + // parse command line arguments + if(argc < 11) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + int nmax = atoi(argv[1]); + g = (struct global_data*)malloc(sizeof(struct global_data) + nmax*sizeof(int)); + g->id_list = (int*)(g+1); + g->nmax = nmax; + g->p1 = atoi(argv[2]); + g->p2 = atoi(argv[3]); + g->p3 = atoi(argv[4]); + g->sstart = atoi(argv[5]); + g->send = atoi(argv[6]); + g->sdenom = atoi(argv[7]); + g->qstart = atoi(argv[8]); + g->qend = atoi(argv[9]); + g->qdenom = atoi(argv[10]); + + // initialize + parallel_context *ctx = parallel_init(); + parallel_set_datasize_and_callbacks(ctx, init_node, do_computation, destroy_node, + sizeof(struct global_data) + g->nmax*sizeof(int), + sizeof(struct node_data), + sizeof(struct input_data), + sizeof(struct output_data)); + if(ctx->mpi_mode == 1 && ctx->rank != 0) { + // worker mode + parallel_work(ctx); + parallel_destroy(ctx); + exit(0); + } + + init_node(g, &n); // use very generic values for the pilot run unless sstart=send and qstart=qend - if(sstart == send && qstart == qend) { - mpq_set_ui(s, sstart, sdenom); - mpq_set_ui(q, qstart, qdenom); - DEBUG("Single run for s = %d/%d, q = %d/%d\n", sstart, sdenom, qstart, qdenom); + struct input_data pilot_input; + struct output_data pilot_output; + if(g->sstart == g->send && g->qstart == g->qend) { + pilot_input.snum = g->sstart; + pilot_input.sden = g->sdenom; + pilot_input.qnum = g->qstart; + pilot_input.qden = g->qdenom; + DEBUG("Single run for s = %d/%d, q = %d/%d\n", g->sstart, g->sdenom, g->qstart, g->qdenom); } else { - mpq_set_ui(s, 4, 100); - mpq_set_ui(q, 7, 100); + pilot_input.snum = 4; + pilot_input.sden = 100; + pilot_input.qnum = 7; + pilot_input.qden = 100; DEBUG("Initial run for s = %d/%d, q = %d/%d\n", 4, 100, 7, 100); } - DEBUG("Compute matrices\n"); - enumerate(group, matrices, p1, p2, p3, s, q); + g->id_list_length = 0; + for(int i = 0; i < n.group->size; i++) + if(n.group->elements[i].length % 2 == 0 && n.group->elements[i].inverse) + g->id_list[g->id_list_length++] = i; - // prepare array of ids - n = 0; - for(int i = 0; i < group->size; i++) - { - if(group->elements[i].length % 2 != 0 || !group->elements[i].inverse) - continue; - invariants[i].id = i; - distinct_invariants[n++] = &invariants[i]; - } + do_computation(g, &n, &pilot_input, &pilot_output); - 
DEBUG("Compute invariants\n"); - max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 1); + for(int i = 0; i < n.distinct_invariants_length; i++) + g->id_list[i] = n.distinct_invariants[i]->id; + g->id_list_length = n.distinct_invariants_length; - // prepare for next time; don't need to change ids in distinct_invariants! - for(int i = 0; i < group->size; i++) - group->elements[i].need_to_compute = 0; - group->elements[0].need_to_compute = 1; - int multiplication_count = 1; - for(int i = 0; i < n; i++) { - groupelement_t *cur = &group->elements[distinct_invariants[i]->id]; - while(cur->need_to_compute == 0) { - cur->need_to_compute = 1; - multiplication_count++; - cur = cur->parent->parent; // also need to compute its even-length ancestors - } - cur = group->elements[distinct_invariants[i]->id].inverse; - while(cur->need_to_compute == 0) { - cur->need_to_compute = 1; - multiplication_count++; - cur = cur->parent->parent; - } + if(g->sstart != g->send || g->qstart != g->qend) { - } + struct input_data *inputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data)); + struct output_data *outputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data)); - DEBUG("Would have needed %d matrix multiplications for %d unique traces up to reflection length %d\n", multiplication_count, n, group->elements[group->size-1].length); - - if(sstart != send || qstart != qend) { - for(int sloop = sstart; sloop <= send; sloop++) { - for(int qloop = qstart; qloop <= qend; qloop++) { - DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, sdenom, qloop, qdenom); - mpq_set_ui(s, sloop, sdenom); - mpq_set_ui(q, qloop, qdenom); - DEBUG("Compute matrices\n"); - enumerate(group, matrices, p1, p2, p3, s, q); - DEBUG("Compute invariants\n"); - max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 0); - // output - gmp_printf("%Qd %Qd %s\n", s, q, - print_word(&group->elements[max_slope_id], buf)); - fflush(stdout); + int njobs = 0; + for(int sloop = g->sstart; sloop <= g->send; sloop++) { + for(int qloop = g->qstart; qloop <= g->qend; qloop++) { + inputs[njobs].sden = g->sdenom; + inputs[njobs].qden = g->qdenom; + inputs[njobs].snum = sloop; + inputs[njobs].qnum = qloop; + njobs++; } } + + parallel_run(ctx, g, inputs, outputs, njobs, NULL); + +// DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, g->sdenom, qloop, g->qdenom); + + for(int i = 0; i < njobs; i++) + { + gmp_printf("%d/%d %d/%d %s %f\n", + inputs[i].snum, inputs[i].sden, inputs[i].qnum, inputs[i].qden, + print_word(&n.group->elements[outputs[i].max_slope_id], buf), + outputs[i].max_slope); + } + + free(inputs); + free(outputs); } else { // output - for(int i = 0; i < n; i++) { - double slope = distinct_invariants[i]->y/distinct_invariants[i]->x; - + for(int i = 0; i < n.distinct_invariants_length; i++) { // exclude tr = trinv = 2/1/0/-1/3 + mpq_t tmp; + mpq_init(tmp); mpq_set_si(tmp, 2, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, 1, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, 0, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - 
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, -1, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, 3, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; + mpq_clear(tmp); - gmp_printf("%d %d %s %f\n", - distinct_invariants[i]->id, distinct_invariants[i]->count, - print_word(&group->elements[distinct_invariants[i]->id], buf), - slope - ); + double slope = n.distinct_invariants[i]->y/n.distinct_invariants[i]->x; - /* - gmp_printf("%d %d %d %Qd %Qd %f %f %f %f %f %s\n", - distinct_invariants[i]->id, distinct_invariants[i]->count, cumulative, - distinct_invariants[i]->tr, distinct_invariants[i]->trinv, - log(fabs(mpq_get_d(distinct_invariants[i]->tr))), log(fabs(mpq_get_d(distinct_invariants[i]->trinv))), - distinct_invariants[i]->x, distinct_invariants[i]->y, slope, - print_word(&group->elements[distinct_invariants[i]->id], buf) - ); - */ + gmp_printf("%d %s %f\n", + n.distinct_invariants[i]->id, + print_word(&n.group->elements[n.distinct_invariants[i]->id], buf), + slope); } } - DEBUG("Clean up\n"); - for(int i = 0; i < nmax; i++) { - mpq_clear(invariants[i].tr); - mpq_clear(invariants[i].trinv); - } - free(invariants); - free(distinct_invariants); - for(int i = 0; i < nmax; i++) - mat_clear(matrices[i]); - free(matrices); - coxeter_clear(group); - mpq_clears(s, q, t, tmp, NULL); + destroy_node(g, &n); + free(g); + parallel_destroy(ctx); }
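
The header comment in parallel.h sketches the intended calling sequence. To make it concrete, here is a minimal, hypothetical caller of the library, mirroring how singular_values.c uses it in the patch. The payload structs, my_init/my_destroy and square_job are invented for this sketch; only the parallel_* functions, the parallel_context fields and the callback signatures come from parallel.h.

/* usage_sketch.c -- hypothetical minimal caller of the parallel library (not part of the patch) */
#include "parallel.h"
#include <stdio.h>

struct my_global { int dummy; };      /* broadcast once to every node */
struct my_node   { int scratch; };    /* per-node workspace, set up in my_init */
struct my_input  { double x; };       /* one record per job */
struct my_output { double y; };

static int  my_init(const void *global, void *node)    { return 0; }
static void my_destroy(const void *global, void *node) { }

/* job callback: turn one input record into one output record */
static int square_job(const void *global, void *node, const void *in_, void *out_)
{
	const struct my_input *in = in_;
	struct my_output *out = out_;
	out->y = in->x * in->x;
	return 0;
}

int main(void)
{
	start_timer();
	parallel_context *ctx = parallel_init();   /* detects MPI vs. standalone mode */
	parallel_set_datasize_and_callbacks(ctx, my_init, square_job, my_destroy,
		sizeof(struct my_global), sizeof(struct my_node),
		sizeof(struct my_input), sizeof(struct my_output));

	if(ctx->mpi_mode == 1 && ctx->rank != 0) {
		parallel_work(ctx);        /* workers loop here until they receive PARALLEL_SHUTDOWN */
		parallel_destroy(ctx);
		return 0;
	}

	/* main node (or standalone mode): fill inputs, run all jobs, print outputs */
	enum { NJOBS = 100 };
	struct my_global g = { 0 };
	struct my_input  inputs[NJOBS];
	struct my_output outputs[NJOBS];
	for(int i = 0; i < NJOBS; i++)
		inputs[i].x = i;

	parallel_run(ctx, &g, inputs, outputs, NJOBS, NULL); /* NULL: auto-named restart file under restart/ */

	for(int i = 0; i < NJOBS; i++)
		printf("%g\n", outputs[i].y);

	parallel_destroy(ctx);
	return 0;
}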
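
parallel_run records progress in a flat restart file: one fixed-size record per job, consisting of an int completed flag followed by output_size bytes (the byte slot initially holds the job's input and is overwritten with the output once the job finishes). Assuming exactly that layout, a standalone checker along the following lines could report how far an interrupted run has progressed; restart_check.c and its command-line interface are hypothetical and not part of the patch.

/* restart_check.c -- hypothetical progress checker for a parallel_run restart file.
   Usage: ./restart_check <restart_file> <output_size> <njobs> */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	if(argc < 4) {
		fprintf(stderr, "Usage: %s <restart_file> <output_size> <njobs>\n", argv[0]);
		return 1;
	}
	int output_size = atoi(argv[2]);
	int njobs = atoi(argv[3]);
	size_t itemsize = sizeof(int) + (size_t)output_size;  /* completed flag + output payload */

	FILE *f = fopen(argv[1], "rb");
	if(!f) { perror("fopen"); return 1; }

	void *record = malloc(itemsize);
	int completed = 0;
	for(int i = 0; i < njobs; i++) {
		if(fread(record, itemsize, 1, f) != 1)
			break;                 /* file shorter than expected */
		if(*(int *)record)             /* first int of each record is the done flag */
			completed++;
	}
	printf("%d/%d jobs completed\n", completed, njobs);

	free(record);
	fclose(f);
	return 0;
}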