From d2d91d68b3d04e7906938ff4920207bdf32cc776 Mon Sep 17 00:00:00 2001 From: Florian Stecker Date: Mon, 14 Feb 2022 20:34:06 -0500 Subject: [PATCH] create MPI library; not very optimized yet --- .gitignore | 2 + Makefile | 25 +-- parallel.c | 349 +++++++++++++++++++++++++++++++++++++++++ parallel.h | 118 ++++++++++++++ run_local | 7 +- singular_values.c | 392 ++++++++++++++++++++++++++++------------------ 6 files changed, 728 insertions(+), 165 deletions(-) create mode 100644 parallel.c create mode 100644 parallel.h diff --git a/.gitignore b/.gitignore index 1f23877..ebfcc8f 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ billiard_words *.png *.hi gmon.out +restart +core diff --git a/Makefile b/Makefile index 0c7d7ba..eec1e19 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,14 @@ -HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h +HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h parallel.h #SPECIAL_OPTIONS=-O0 -g -D_DEBUG -SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline -#SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline +#SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline +SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline #SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline -mavx512f -mavx512cd -mavx512er -mavx512pf # KNL #SPECIAL_OPTIONS= OPTIONS=-I../mps/include -L../mps/lib -pthread -m64 -std=gnu99 -D_GNU_SOURCE $(SPECIAL_OPTIONS) -all: singular_values special_element singular_values_mpi convert billiard_words +all: singular_values special_element convert billiard_words convert: convert.hs ghc --make -dynamic convert.hs @@ -16,11 +16,11 @@ convert: convert.hs billiard_words: billiard_words.hs ghc --make -dynamic billiard_words.hs -singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o - gcc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps +singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o parallel.o + mpicc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o parallel.o -lm -lgmp -lmps -singular_values_mpi: singular_values_mpi.o coxeter.o mat.o - mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps +#singular_values_mpi: singular_values_mpi.o coxeter.o mat.o +# mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_group.o gcc $(OPTIONS) -o special_element coxeter.o linalg.o special_element.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps -lgsl -lcblas @@ -28,8 +28,8 @@ special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_g singular_values.o: singular_values.c $(HEADERS) gcc $(OPTIONS) -c singular_values.c -singular_values_mpi.o: singular_values_mpi.c $(HEADERS) - mpicc $(OPTIONS) -c singular_values_mpi.c +#singular_values_mpi.o: singular_values_mpi.c $(HEADERS) +# mpicc $(OPTIONS) -c singular_values_mpi.c special_element.o: special_element.c $(HEADERS) gcc $(OPTIONS) -c special_element.c @@ -46,5 +46,8 @@ coxeter.o: coxeter.c $(HEADERS) mat.o: mat.c $(HEADERS) gcc $(OPTIONS) -c mat.c +parallel.o: parallel.c $(HEADERS) + gcc $(OPTIONS) -c parallel.c + clean: - rm -f singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o + rm -f 
singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o parallel.o diff --git a/parallel.c b/parallel.c new file mode 100644 index 0000000..84ca72c --- /dev/null +++ b/parallel.c @@ -0,0 +1,349 @@ +#include "parallel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__) +//#define DEBUG(msg, ...) fprintf(stderr, "[ %10.3f] " msg, runtime(), ##__VA_ARGS__) +//#define DEBUG_MPI(msg, node, ...) fprintf(stderr, "[%003d%10.3f] " msg, node, runtime(), ##__VA_ARGS__) +#define DONE(x) *((int*)(x)) + +enum message_tag { + PARALLEL_ORDER, + PARALLEL_RESULT, + PARALLEL_SHUTDOWN, + PARALLEL_GLOBAL_DATA +}; + +struct timespec starttime; + +int mpi_rank(int activate_mpi) +{ + static int active = 0; + if(activate_mpi) + active = 1; + + if(!active) + return 0; + else { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return rank; + } +} + +void start_timer() +{ + clock_gettime(CLOCK_MONOTONIC, &starttime); +} + +double runtime() +{ + struct timespec curtime; + double diff; + clock_gettime(CLOCK_MONOTONIC, &curtime); + return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9; +} + +parallel_context *parallel_init() +{ + parallel_context *ctx = malloc(sizeof(parallel_context)); + + if(!getenv("OMPI_COMM_WORLD_SIZE")) { + ctx->mpi_mode = 0; + DEBUG("Running standalone.\n"); + return ctx; + } + + ctx->mpi_mode = 1; + int result = MPI_Init(NULL, NULL); + MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); + MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); + MPI_Get_processor_name(ctx->processor_name, &ctx->processor_name_len); + + mpi_rank(1); // display the rank in debug output from now on + + if(ctx->rank == 0) + DEBUG("Running in mpi mode, %d nodes.\n", ctx->size); + + return ctx; +} + +void parallel_destroy(parallel_context* ctx) +{ + if(ctx->mpi_mode) { + MPI_Type_free(&ctx->order_datatype); + MPI_Type_free(&ctx->result_datatype); + MPI_Finalize(); + } + + free(ctx); +} + +void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size) +{ + ctx->init = init; + ctx->destroy = destroy; + ctx->job = job; + ctx->global_data_size = global_data_size; + ctx->node_data_size = node_data_size; + ctx->input_size = input_size; + ctx->output_size = output_size; + + if(ctx->mpi_mode) { + // create a datatype for job orders, consisting of an integer (the job id) and a user-defined section + int order_blocklengths[2] = {1, input_size}; + MPI_Aint order_displacements[2] = {0, sizeof(int)}; + MPI_Datatype order_types[2] = {MPI_INT, MPI_BYTE}; + MPI_Type_create_struct(2, order_blocklengths, order_displacements, order_types, &ctx->order_datatype); + MPI_Type_commit(&ctx->order_datatype); + + int result_blocklengths[2] = {1, output_size}; + MPI_Aint result_displacements[2] = {0, sizeof(int)}; + MPI_Datatype result_types[2] = {MPI_INT, MPI_BYTE}; + MPI_Type_create_struct(2, result_blocklengths, result_displacements, result_types, &ctx->result_datatype); + MPI_Type_commit(&ctx->result_datatype); + } +} + +int parallel_work(parallel_context *ctx) +{ + // do nothing in non-mpi mode + if(ctx->mpi_mode == 0) + return 0; + + 
MPI_Status status; + void *global_data = malloc(ctx->global_data_size); + void *node_data = malloc(ctx->node_data_size); + void *input_and_job_nr = malloc(ctx->input_size + sizeof(int)); + void *input = input_and_job_nr + sizeof(int); + int *job_nr = (int *)input_and_job_nr; + void *output_and_job_nr = malloc(ctx->output_size + sizeof(int)); + void *output = output_and_job_nr + sizeof(int); + int *output_job_nr = (int *)output_and_job_nr; + double jobtime; + + // wait for global data + MPI_Bcast(global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD); + + DEBUG("Global data received\n"); + + // initialize node_data (and do once-per-node computation) + ctx->init(global_data, node_data); + + DEBUG("Initialization completed\n"); + + while(1) { + MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, + &status); + + DEBUG("Message received: source = %d, tag = %d\n", status.MPI_SOURCE, status.MPI_TAG); + + if(status.MPI_TAG == PARALLEL_SHUTDOWN) { + DEBUG("Shutting down\n"); + break; + } else if(status.MPI_TAG == PARALLEL_ORDER) { + MPI_Recv(input_and_job_nr, + 1, ctx->order_datatype, + 0, PARALLEL_ORDER, MPI_COMM_WORLD, + &status); + + DEBUG("Working on job %d\n", *job_nr); + + jobtime = -MPI_Wtime(); + + // do the actual work + ctx->job(global_data, node_data, input, output); + + jobtime += MPI_Wtime(); + + DEBUG("Finished job %d in %f seconds\n", *job_nr, jobtime); + + *output_job_nr = *job_nr; + MPI_Send(output_and_job_nr, + 1, ctx->result_datatype, + 0, PARALLEL_RESULT, MPI_COMM_WORLD); + } + } + + ctx->destroy(global_data, node_data); + + free(global_data); + free(node_data); + free(input_and_job_nr); + free(output_and_job_nr); + + return 0; +} + +int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *_restart_filename) +{ + // in non-mpi-mode, just run init1, init2, forall(jobs) job + if(ctx->mpi_mode == 0) { + int result; + void *node_data = malloc(ctx->node_data_size); + result = ctx->init(global_data, node_data); + if(result != 0) + goto cleanup_standalone; + + for(int i = 0; i < njobs; i++) { + result = ctx->job( + global_data, + node_data, + input_array + ctx->input_size*i, + output_array + ctx->output_size*i); + if(result != 0) + goto cleanup_standalone; + } + + cleanup_standalone: + ctx->destroy(global_data, node_data); + return result; + } else { + // if no restart file was specified, pick a filename + char *restart_filename; + char buffer[128]; + int restartf; + if(_restart_filename == NULL) { + time_t t = time(NULL); + struct tm *loctm = localtime(&t); + strftime(buffer, sizeof(buffer), "restart/restart_%y%m%d_%H%M%S", loctm); + restart_filename = buffer; + } else { + restart_filename = (char *)_restart_filename; + } + + // open restart file if it exists, otherwise create it + int continuing = 1; + restartf = open(restart_filename, O_RDWR); + if(restartf == -1 && errno == ENOENT) { + restartf = open(restart_filename, O_RDWR | O_CREAT, 0666); + continuing = 0; + } + if(restartf == -1) { + DEBUG("Error opening restart file: %s\n", strerror(errno)); + exit(1); + } + + // map restart file + int itemsize = (ctx->output_size + sizeof(int)); // for every job, store output, and completed flag + ftruncate(restartf, njobs*itemsize); + void *alljobs = mmap(0, njobs*itemsize, PROT_READ | PROT_WRITE, MAP_SHARED, restartf, 0); + if(alljobs == MAP_FAILED) { + DEBUG("Error mapping restart file: %s\n", strerror(errno)); + exit(1); + } + + // count completed jobs, or initialize jobs + int completed = 0; + 
if(continuing) { + for(int i = 0; i < njobs; i++) + if(DONE(alljobs + i*itemsize)) + completed++; + } else { + for(int i = 0; i < njobs; i++) { + DONE(alljobs + i*itemsize) = 0; + memcpy(alljobs + i*itemsize + sizeof(int), input_array + i*ctx->input_size, ctx->input_size); // copy input data + } + } + + fsync(restartf); + + if(continuing) { + DEBUG("Continuing from restart file, %d/%d jobs completed, %d nodes\n", completed, njobs, ctx->size); + } else { + DEBUG("Starting from scratch, %d jobs, %d nodes\n", njobs, ctx->size); + } + + if(completed >= njobs) + goto cleanup_mpi; + + /* Send global data */ + MPI_Bcast((void*)global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD); + + DEBUG("Global data sent\n"); + + void *input_message_buffer = malloc(ctx->input_size + sizeof(int)); + void *output_message_buffer = malloc(ctx->output_size + sizeof(int)); + + // find next unfinished job + int current = 0; + while(current < njobs && DONE(alljobs + current*itemsize)) + current++; + + // assign initial jobs, 2 for each worker thread + for(int i = 0; i < 2*(ctx->size-1); i++) { + if(current >= njobs) // all jobs are assigned + break; + + // send job id and input data + // send to all nodes except ourself (node 0) + *((int*)input_message_buffer) = current; + memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size); + MPI_Send(input_message_buffer, 1, ctx->order_datatype, + i%(ctx->size-1)+1, PARALLEL_ORDER, MPI_COMM_WORLD); + + DEBUG("Job %d sent to node %d\n", current, i%(ctx->size-1)+1); + current++; + } + + MPI_Status status; + int active_worker_nodes = ctx->size - 1; + while(1) { + MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + if(status.MPI_TAG == PARALLEL_RESULT) { + MPI_Recv(output_message_buffer, 1, ctx->result_datatype, + MPI_ANY_SOURCE, PARALLEL_RESULT, MPI_COMM_WORLD, &status); + + int id = *((int*)output_message_buffer); + memcpy(alljobs + id*itemsize + sizeof(int), output_message_buffer + sizeof(int), ctx->output_size); + DONE(alljobs + id*itemsize) = 1; + completed++; + + DEBUG("job %d completed by node %d\n", id, status.MPI_SOURCE); + + // todo: deal with unresponsive nodes + // strategy: when no jobs left, go through unfinished list again, incrementing oversubscribe counter + // if oversubscribe counter is at limit, shut node down instead + // + + if(current >= njobs) { // all jobs are assigned, shut down node + MPI_Send(NULL, 0, MPI_BYTE, status.MPI_SOURCE, PARALLEL_SHUTDOWN, MPI_COMM_WORLD); + active_worker_nodes--; + if(active_worker_nodes) + continue; + else + break; + } + + *((int*)input_message_buffer) = current; + memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size); + MPI_Send(input_message_buffer, 1, ctx->order_datatype, + status.MPI_SOURCE, PARALLEL_ORDER, MPI_COMM_WORLD); + + DEBUG("Job %d sent to node %d\n", current, status.MPI_SOURCE); + current++; + } + } + + for(int i = 0; i < njobs; i++) { + memcpy(output_array + i*ctx->output_size, alljobs + i*itemsize + sizeof(int), ctx->output_size); + } + + free(input_message_buffer); + free(output_message_buffer); + + cleanup_mpi: + munmap(alljobs, njobs*itemsize); + close(restartf); + } + + return 0; +} diff --git a/parallel.h b/parallel.h new file mode 100644 index 0000000..4239875 --- /dev/null +++ b/parallel.h @@ -0,0 +1,118 @@ +#ifndef PARALLEL_H +#define PARALLEL_H + +/* + this is a library to parallelize workloads which can be split up naturally + into a sequence of independent jobs, using MPI. 
A program will usually
+
+   - do precomputation
+   - fill array with input data
+   - do the parallel work
+   - print the output data
+
+   we want to enable restarts, so that only unfinished jobs need to be repeated.
+   Further, we want to be resilient to slow/unreliable network and to losing
+   nodes. There is a main node and a number of workers. The main node does the
+   precomputation and then retires to do administrative work, and the workers
+   do the actual jobs. We also want to switch to serial mode if the program is
+   called without MPI.
+
+   The following data has to be transmitted between nodes:
+   - results of the precomputation (read-only, shared between nodes)
+   - job-specific input data, generated by main node before parallel part
+   - output data for each job
+
+   the parallel work shall be given as a callback function which takes
+   input data and precomputation data as parameters
+
+   the above program will look like this for us:
+
+   - parallel_init
+   - if we are a worker, do parallel_work(init_callback, job_callback), exit
+   - do precomputation
+   - fill array with input data
+   - output_array = parallel_run(input_array)
+   - print the output data
+
+   parallel_init:
+   - check if we're running as an mpi program
+   - init mpi, check what kind of node we are
+
+   parallel_work(init_callback1, init_callback2, job_callback):
+   - receive global_precomp (???)
+   - worker_precomp = init_callback2(global_precomp, worker_precomp)
+   - infinite loop:
+     - wait for job on network, receive input
+     - output = job_callback(global_precomp, worker_precomp, input)
+     - send output on network
+     - exit loop on shutdown signal
+
+   parallel_run(global_precomp, input_array, restart file, callbacks):
+   - check if we're running as an MPI program
+   - send global_precomp to all nodes (if MPI)
+   - if(restart file given and exists) read restart file
+   - else create new restart file
+   - until(all jobs finished):
+     - if MPI:
+       - send next job & input to appropriate node
+       - if all jobs are in work, reassign unfinished ones (up to limit)
+       - collect outputs
+     - if no MPI:
+       - worker_precomp = init_callback1
+       - worker_precomp = init_callback2(global_precomp, worker_precomp)
+       - for(j in jobs)
+         - output(j) = job_callback(global_precomp, worker_precomp, input(j))
+   - delete restart file
+   - return array of outputs
+
+   parallel_destroy():
+   - free everything
+
+   have a context?
+   probably yes: parallel_context
+
+   plan:
+   - make interface
+   - implement no-MPI part
+   - restructure singular_values.c to use interface
+   - implement MPI part
+*/
+
+#include <mpi.h>
+#include <time.h>
+
+typedef void (*parallel_callback_destroy)(const void*, void*);
+typedef int (*parallel_callback_init)(const void*,void*);
+typedef int (*parallel_callback_job)(const void*,void*,const void*,void*);
+
+typedef struct {
+	int mpi_mode;
+	struct timespec starttime;
+	char processor_name[MPI_MAX_PROCESSOR_NAME];
+	int processor_name_len;
+	int rank;
+	int size;
+	MPI_Datatype order_datatype;
+	MPI_Datatype result_datatype;
+	parallel_callback_init init;
+	parallel_callback_job job;
+	parallel_callback_destroy destroy;
+	void *global_data;
+	void *node_data;
+	int global_data_size;
+	int node_data_size;
+	int input_size;
+	int output_size;
+} parallel_context;
+
+parallel_context *parallel_init();
+void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size);
+int parallel_work(parallel_context *ctx);
+int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *restart_filename);
+void parallel_destroy(parallel_context* ctx);
+
+int mpi_rank(int activate_mpi);
+void start_timer();
+double runtime();
+
+
+#endif
diff --git a/run_local b/run_local
index caa99af..76dcf8c 100755
--- a/run_local
+++ b/run_local
@@ -1,6 +1,9 @@
 #!/bin/bash
-nmax=895882 # up to reflection group word length 22
+# nmax=895882 # up to reflection group word length 22 ( 555 group)
+nmax=700000 # up to reflection group word length 22 ( 444 group)
 # nmax=11575 # up to reflection group word length 14
-time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
+#time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
+
+time mpirun --mca opal_warn_on_missing_libcuda 0 --mca mpi_yield_when_idle 1 -np 4 ./singular_values 700000 4 4 4 1 10 100 1 10 100
diff --git a/singular_values.c b/singular_values.c
index 75e267a..85b248d 100644
--- a/singular_values.c
+++ b/singular_values.c
@@ -2,21 +2,55 @@
 #include "linalg.h"
 #include "mat.h"
 #include "enumerate_triangle_group.h"
+#include "parallel.h"
 #include
 #define SWAP(t,x,y) do { t _tmp = (x); (x) = (y); (y) = _tmp; } while (0);
-#define DEBUG(msg, ...) fprintf(stderr, "[%10.3f] " msg, runtime(), ##__VA_ARGS__);
+#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
 //#define DEBUG(msg, ...)
struct result { int id; - int count; mpq_t tr; mpq_t trinv; double x; double y; + double slope; +}; + +// we want as much as possible to be node data, except if it is only known to the main node +// (command line arguments) or should only be computed once (id list) + +struct global_data { + // command line arguments + unsigned int nmax; + unsigned int p1, p2, p3; + unsigned int sstart, send, sdenom; + unsigned int qstart, qend, qdenom; + + unsigned int *id_list; + unsigned int id_list_length; +}; + +struct node_data { + group_t *group; + mat* matrices; + struct result *invariants; + struct result **distinct_invariants; + int distinct_invariants_length; + mps_context *solver; +}; + +struct input_data { + unsigned int snum, sden; + unsigned int qnum, qden; +}; + +struct output_data { + int max_slope_id; + double max_slope; }; static int compare_result(const void *a_, const void *b_) @@ -75,20 +109,6 @@ static int compare_result_by_slope(const void *a_, const void *b_) return slopea > slopeb ? -1 : slopea < slopeb ? 1 : 0; } -struct timespec starttime; -static void start_timer() -{ - clock_gettime(CLOCK_MONOTONIC, &starttime); -} - -static double runtime() -{ - struct timespec curtime; - double diff; - clock_gettime(CLOCK_MONOTONIC, &curtime); - return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9; -} - int compute_invariants(group_t *group, mat *matrices, struct result **invariants, int *n, int unique) { mpq_t tmp; @@ -120,10 +140,8 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants for(int i = 0; i < ntraces; i++) { if(i == 0 || compare_result(&invariants[i], &invariants[nuniq-1]) != 0) { invariants[nuniq] = invariants[i]; - invariants[nuniq]->count = 1; nuniq++; } else { - invariants[nuniq-1]->count++; int oldlength = group->elements[invariants[nuniq-1]->id].length; int newlength = group->elements[invariants[i]->id].length; if(newlength < oldlength) @@ -141,7 +159,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants max_slope = 0; for(int i = 0; i < nuniq; i++) { retval = solve_characteristic_polynomial(solver, poly, invariants[i]->tr, invariants[i]->trinv, evs); - retval = 0;evs[0] = 2;evs[1] = 1;evs[2] = 0.5; // fake solving the polynomial for memory leak test + if(retval == 1) { fprintf(stderr, "Error! 
Could not solve polynomial.\n"); continue; @@ -161,6 +179,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants invariants[i]->x = x; invariants[i]->y = y; + invariants[i]->slope = y/x; if(y/x > max_slope + 1e-12 && (x > 0.1 || y > 0.1)) { max_slope_id = invariants[i]->id; @@ -194,180 +213,249 @@ long check_memory_usage(mat *matrices, int n) return total; } -int main(int argc, char *argv[]) + +void destroy_node(const void *_g, void *_n) { - mpq_t s, q, t, tmp; - int p1, p2, p3; - int sstart, send, sdenom, qstart, qend, qdenom; - mat *matrices; - group_t *group; - int nmax, n; - int max_slope_id; - char buf[100]; - char buf2[100]; - struct result *invariants; - struct result **distinct_invariants; + struct global_data *g = (struct global_data *)_g; + struct node_data *n = (struct node_data *)_n; - start_timer(); - - mpq_inits(s, q, t, tmp, NULL); - if(argc < 11) { - fprintf(stderr, "Usage: %s \n", argv[0]); - exit(1); + for(int i = 0; i < g->nmax; i++) { + mpq_clear(n->invariants[i].tr); + mpq_clear(n->invariants[i].trinv); } - nmax = atoi(argv[1]); - p1 = atoi(argv[2]); - p2 = atoi(argv[3]); - p3 = atoi(argv[4]); - sstart = atoi(argv[5]); - send = atoi(argv[6]); - sdenom = atoi(argv[7]); - qstart = atoi(argv[8]); - qend = atoi(argv[9]); - qdenom = atoi(argv[10]); + free(n->invariants); + free(n->distinct_invariants); + for(int i = 0; i < g->nmax; i++) + mat_clear(n->matrices[i]); + free(n->matrices); + coxeter_clear(n->group); +} + +int init_node(const void *_g, void *_n) +{ + struct global_data *g = (struct global_data *)_g; + struct node_data *n = (struct node_data *)_n; DEBUG("Allocate\n"); - matrices = malloc(nmax*sizeof(mat)); - for(int i = 0; i < nmax; i++) - mat_init(matrices[i], 3); - invariants = malloc(nmax*sizeof(struct result)); - distinct_invariants = malloc(nmax*sizeof(struct result)); - for(int i = 0; i < nmax; i++) { - mpq_init(invariants[i].tr); - mpq_init(invariants[i].trinv); + g->id_list = (int*)(g+1); // pointers get scrambled by transmission, reconstruct + n->matrices = malloc(g->nmax*sizeof(mat)); + for(int i = 0; i < g->nmax; i++) + mat_init(n->matrices[i], 3); + n->invariants = malloc(g->nmax*sizeof(struct result)); + n->distinct_invariants = malloc(g->nmax*sizeof(struct result)); // we won't need that many, but just in case + for(int i = 0; i < g->nmax; i++) { + mpq_init(n->invariants[i].tr); + mpq_init(n->invariants[i].trinv); + n->invariants[i].id = i; } // order of the triangle reflection generators: a, b, c // order of the rotation orders: bc, ac, ab DEBUG("Generate group\n"); - group = coxeter_init_triangle(p1, p2, p3, nmax); + n->group = coxeter_init_triangle(g->p1, g->p2, g->p3, g->nmax); - // first run; compute all matrices - for(int i = 0; i < group->size; i++) - group->elements[i].need_to_compute = 1; + return 0; +} + +int do_computation(const void *_g, void *_n, const void *_in, void *_out) +{ + struct global_data *g = (struct global_data *)_g; + struct node_data *n = (struct node_data *)_n; + struct input_data *in = (struct input_data *)_in; + struct output_data *out = (struct output_data *)_out; + + mpq_t s, q; + + mpq_inits(s, q, NULL); + mpq_set_ui(s, in->snum, in->sden); + mpq_set_ui(q, in->qnum, in->qden); + + DEBUG("Computing max slope element for s = %d/%d and q = %d/%d.\n", + in->snum, in->sden, + in->qnum, in->qden); + + for(int i = 0; i < n->group->size; i++) + n->group->elements[i].need_to_compute = 0; + n->group->elements[0].need_to_compute = 1; + + int needed_elements = 1; + for(int i = 0; i < 
g->id_list_length; i++) + { + int id = g->id_list[i]; + n->distinct_invariants[i] = &n->invariants[id]; + groupelement_t *cur = &n->group->elements[id]; + while(cur->need_to_compute == 0) { + cur->need_to_compute = 1; + needed_elements++; + cur = cur->parent->parent; // also need to compute its even-length ancestors + } + cur = n->group->elements[id].inverse; + while(cur->need_to_compute == 0) { + cur->need_to_compute = 1; + needed_elements++; + cur = cur->parent->parent; + } + } + n->distinct_invariants_length = g->id_list_length; + + DEBUG("Need to compute %d elements to get %d traces up to reflection length %d\n", + needed_elements, g->id_list_length, n->group->elements[n->group->size-1].length); + + DEBUG("Compute matrices\n"); + enumerate(n->group, n->matrices, g->p1, g->p2, g->p3, s, q); + + DEBUG("Compute invariants\n"); + out->max_slope_id = compute_invariants( + n->group, n->matrices, + n->distinct_invariants, &n->distinct_invariants_length, 1); + out->max_slope = n->invariants[out->max_slope_id].slope; + + mpq_clears(s, q, NULL); + + return 0; +} + +int main(int argc, char *argv[]) +{ + char buf[100]; + char buf2[100]; + struct global_data *g; + struct node_data n; + + start_timer(); + + // parse command line arguments + if(argc < 11) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + int nmax = atoi(argv[1]); + g = (struct global_data*)malloc(sizeof(struct global_data) + nmax*sizeof(int)); + g->id_list = (int*)(g+1); + g->nmax = nmax; + g->p1 = atoi(argv[2]); + g->p2 = atoi(argv[3]); + g->p3 = atoi(argv[4]); + g->sstart = atoi(argv[5]); + g->send = atoi(argv[6]); + g->sdenom = atoi(argv[7]); + g->qstart = atoi(argv[8]); + g->qend = atoi(argv[9]); + g->qdenom = atoi(argv[10]); + + // initialize + parallel_context *ctx = parallel_init(); + parallel_set_datasize_and_callbacks(ctx, init_node, do_computation, destroy_node, + sizeof(struct global_data) + g->nmax*sizeof(int), + sizeof(struct node_data), + sizeof(struct input_data), + sizeof(struct output_data)); + if(ctx->mpi_mode == 1 && ctx->rank != 0) { + // worker mode + parallel_work(ctx); + parallel_destroy(ctx); + exit(0); + } + + init_node(g, &n); // use very generic values for the pilot run unless sstart=send and qstart=qend - if(sstart == send && qstart == qend) { - mpq_set_ui(s, sstart, sdenom); - mpq_set_ui(q, qstart, qdenom); - DEBUG("Single run for s = %d/%d, q = %d/%d\n", sstart, sdenom, qstart, qdenom); + struct input_data pilot_input; + struct output_data pilot_output; + if(g->sstart == g->send && g->qstart == g->qend) { + pilot_input.snum = g->sstart; + pilot_input.sden = g->sdenom; + pilot_input.qnum = g->qstart; + pilot_input.qden = g->qdenom; + DEBUG("Single run for s = %d/%d, q = %d/%d\n", g->sstart, g->sdenom, g->qstart, g->qdenom); } else { - mpq_set_ui(s, 4, 100); - mpq_set_ui(q, 7, 100); + pilot_input.snum = 4; + pilot_input.sden = 100; + pilot_input.qnum = 7; + pilot_input.qden = 100; DEBUG("Initial run for s = %d/%d, q = %d/%d\n", 4, 100, 7, 100); } - DEBUG("Compute matrices\n"); - enumerate(group, matrices, p1, p2, p3, s, q); + g->id_list_length = 0; + for(int i = 0; i < n.group->size; i++) + if(n.group->elements[i].length % 2 == 0 && n.group->elements[i].inverse) + g->id_list[g->id_list_length++] = i; - // prepare array of ids - n = 0; - for(int i = 0; i < group->size; i++) - { - if(group->elements[i].length % 2 != 0 || !group->elements[i].inverse) - continue; - invariants[i].id = i; - distinct_invariants[n++] = &invariants[i]; - } + do_computation(g, &n, &pilot_input, &pilot_output); - 
DEBUG("Compute invariants\n"); - max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 1); + for(int i = 0; i < n.distinct_invariants_length; i++) + g->id_list[i] = n.distinct_invariants[i]->id; + g->id_list_length = n.distinct_invariants_length; - // prepare for next time; don't need to change ids in distinct_invariants! - for(int i = 0; i < group->size; i++) - group->elements[i].need_to_compute = 0; - group->elements[0].need_to_compute = 1; - int multiplication_count = 1; - for(int i = 0; i < n; i++) { - groupelement_t *cur = &group->elements[distinct_invariants[i]->id]; - while(cur->need_to_compute == 0) { - cur->need_to_compute = 1; - multiplication_count++; - cur = cur->parent->parent; // also need to compute its even-length ancestors - } - cur = group->elements[distinct_invariants[i]->id].inverse; - while(cur->need_to_compute == 0) { - cur->need_to_compute = 1; - multiplication_count++; - cur = cur->parent->parent; - } + if(g->sstart != g->send || g->qstart != g->qend) { - } + struct input_data *inputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data)); + struct output_data *outputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data)); - DEBUG("Would have needed %d matrix multiplications for %d unique traces up to reflection length %d\n", multiplication_count, n, group->elements[group->size-1].length); - - if(sstart != send || qstart != qend) { - for(int sloop = sstart; sloop <= send; sloop++) { - for(int qloop = qstart; qloop <= qend; qloop++) { - DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, sdenom, qloop, qdenom); - mpq_set_ui(s, sloop, sdenom); - mpq_set_ui(q, qloop, qdenom); - DEBUG("Compute matrices\n"); - enumerate(group, matrices, p1, p2, p3, s, q); - DEBUG("Compute invariants\n"); - max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 0); - // output - gmp_printf("%Qd %Qd %s\n", s, q, - print_word(&group->elements[max_slope_id], buf)); - fflush(stdout); + int njobs = 0; + for(int sloop = g->sstart; sloop <= g->send; sloop++) { + for(int qloop = g->qstart; qloop <= g->qend; qloop++) { + inputs[njobs].sden = g->sdenom; + inputs[njobs].qden = g->qdenom; + inputs[njobs].snum = sloop; + inputs[njobs].qnum = qloop; + njobs++; } } + + parallel_run(ctx, g, inputs, outputs, njobs, NULL); + +// DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, g->sdenom, qloop, g->qdenom); + + for(int i = 0; i < njobs; i++) + { + gmp_printf("%d/%d %d/%d %s %f\n", + inputs[i].snum, inputs[i].sden, inputs[i].qnum, inputs[i].qden, + print_word(&n.group->elements[outputs[i].max_slope_id], buf), + outputs[i].max_slope); + } + + free(inputs); + free(outputs); } else { // output - for(int i = 0; i < n; i++) { - double slope = distinct_invariants[i]->y/distinct_invariants[i]->x; - + for(int i = 0; i < n.distinct_invariants_length; i++) { // exclude tr = trinv = 2/1/0/-1/3 + mpq_t tmp; + mpq_init(tmp); mpq_set_si(tmp, 2, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, 1, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, 0, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - 
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, -1, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; mpq_set_si(tmp, 3, 1); - if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 && - mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0) + if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 && + mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0) continue; + mpq_clear(tmp); - gmp_printf("%d %d %s %f\n", - distinct_invariants[i]->id, distinct_invariants[i]->count, - print_word(&group->elements[distinct_invariants[i]->id], buf), - slope - ); + double slope = n.distinct_invariants[i]->y/n.distinct_invariants[i]->x; - /* - gmp_printf("%d %d %d %Qd %Qd %f %f %f %f %f %s\n", - distinct_invariants[i]->id, distinct_invariants[i]->count, cumulative, - distinct_invariants[i]->tr, distinct_invariants[i]->trinv, - log(fabs(mpq_get_d(distinct_invariants[i]->tr))), log(fabs(mpq_get_d(distinct_invariants[i]->trinv))), - distinct_invariants[i]->x, distinct_invariants[i]->y, slope, - print_word(&group->elements[distinct_invariants[i]->id], buf) - ); - */ + gmp_printf("%d %s %f\n", + n.distinct_invariants[i]->id, + print_word(&n.group->elements[n.distinct_invariants[i]->id], buf), + slope); } } - DEBUG("Clean up\n"); - for(int i = 0; i < nmax; i++) { - mpq_clear(invariants[i].tr); - mpq_clear(invariants[i].trinv); - } - free(invariants); - free(distinct_invariants); - for(int i = 0; i < nmax; i++) - mat_clear(matrices[i]); - free(matrices); - coxeter_clear(group); - mpq_clears(s, q, t, tmp, NULL); + destroy_node(g, &n); + free(g); + parallel_destroy(ctx); }
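
The header comment in parallel.h sketches the intended calling sequence. To make it concrete, here is a minimal, hypothetical caller of the library, mirroring how singular_values.c uses it in the patch. The payload structs, my_init/my_destroy and square_job are invented for this sketch; only the parallel_* functions, the parallel_context fields and the callback signatures come from parallel.h.

/* usage_sketch.c -- hypothetical minimal caller of the parallel library (not part of the patch) */
#include "parallel.h"
#include <stdio.h>

struct my_global { int dummy; };      /* broadcast once to every node */
struct my_node   { int scratch; };    /* per-node workspace, set up in my_init */
struct my_input  { double x; };       /* one record per job */
struct my_output { double y; };

static int  my_init(const void *global, void *node)    { return 0; }
static void my_destroy(const void *global, void *node) { }

/* job callback: turn one input record into one output record */
static int square_job(const void *global, void *node, const void *in_, void *out_)
{
	const struct my_input *in = in_;
	struct my_output *out = out_;
	out->y = in->x * in->x;
	return 0;
}

int main(void)
{
	start_timer();
	parallel_context *ctx = parallel_init();   /* detects MPI vs. standalone mode */
	parallel_set_datasize_and_callbacks(ctx, my_init, square_job, my_destroy,
		sizeof(struct my_global), sizeof(struct my_node),
		sizeof(struct my_input), sizeof(struct my_output));

	if(ctx->mpi_mode == 1 && ctx->rank != 0) {
		parallel_work(ctx);        /* workers loop here until they receive PARALLEL_SHUTDOWN */
		parallel_destroy(ctx);
		return 0;
	}

	/* main node (or standalone mode): fill inputs, run all jobs, print outputs */
	enum { NJOBS = 100 };
	struct my_global g = { 0 };
	struct my_input  inputs[NJOBS];
	struct my_output outputs[NJOBS];
	for(int i = 0; i < NJOBS; i++)
		inputs[i].x = i;

	parallel_run(ctx, &g, inputs, outputs, NJOBS, NULL); /* NULL: auto-named restart file under restart/ */

	for(int i = 0; i < NJOBS; i++)
		printf("%g\n", outputs[i].y);

	parallel_destroy(ctx);
	return 0;
}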
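
parallel_run records progress in a flat restart file: one fixed-size record per job, consisting of an int completed flag followed by output_size bytes (the byte slot initially holds the job's input and is overwritten with the output once the job finishes). Assuming exactly that layout, a standalone checker along the following lines could report how far an interrupted run has progressed; restart_check.c and its command-line interface are hypothetical and not part of the patch.

/* restart_check.c -- hypothetical progress checker for a parallel_run restart file.
   Usage: ./restart_check <restart_file> <output_size> <njobs> */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	if(argc < 4) {
		fprintf(stderr, "Usage: %s <restart_file> <output_size> <njobs>\n", argv[0]);
		return 1;
	}
	int output_size = atoi(argv[2]);
	int njobs = atoi(argv[3]);
	size_t itemsize = sizeof(int) + (size_t)output_size;  /* completed flag + output payload */

	FILE *f = fopen(argv[1], "rb");
	if(!f) { perror("fopen"); return 1; }

	void *record = malloc(itemsize);
	int completed = 0;
	for(int i = 0; i < njobs; i++) {
		if(fread(record, itemsize, 1, f) != 1)
			break;                 /* file shorter than expected */
		if(*(int *)record)             /* first int of each record is the done flag */
			completed++;
	}
	printf("%d/%d jobs completed\n", completed, njobs);

	free(record);
	fclose(f);
	return 0;
}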