create MPI library; not very optimized yet

Florian Stecker 2022-02-14 20:34:06 -05:00
parent 2735281300
commit d2d91d68b3
6 changed files with 728 additions and 165 deletions

.gitignore (vendored, 2 changes)

@@ -12,3 +12,5 @@ billiard_words
*.png
*.hi
gmon.out
restart
core

Makefile

@@ -1,14 +1,14 @@
HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h
HEADERS=linalg.h mat.h coxeter.h enumerate_triangle_group.h parallel.h
#SPECIAL_OPTIONS=-O0 -g -D_DEBUG
SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline
#SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline
#SPECIAL_OPTIONS=-O3 -pg -g -funroll-loops -fno-inline
SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline
#SPECIAL_OPTIONS=-O3 -flto -funroll-loops -Winline -mavx512f -mavx512cd -mavx512er -mavx512pf # KNL
#SPECIAL_OPTIONS=
OPTIONS=-I../mps/include -L../mps/lib -pthread -m64 -std=gnu99 -D_GNU_SOURCE $(SPECIAL_OPTIONS)
all: singular_values special_element singular_values_mpi convert billiard_words
all: singular_values special_element convert billiard_words
convert: convert.hs
ghc --make -dynamic convert.hs
@@ -16,11 +16,11 @@ convert: convert.hs
billiard_words: billiard_words.hs
ghc --make -dynamic billiard_words.hs
singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o
gcc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps
singular_values: singular_values.o coxeter.o mat.o enumerate_triangle_group.o parallel.o
mpicc $(OPTIONS) -o singular_values coxeter.o singular_values.o mat.o enumerate_triangle_group.o parallel.o -lm -lgmp -lmps
singular_values_mpi: singular_values_mpi.o coxeter.o mat.o
mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps
#singular_values_mpi: singular_values_mpi.o coxeter.o mat.o
# mpicc $(OPTIONS) -o singular_values_mpi coxeter.o singular_values_mpi.o mat.o -lm -lgmp -lmps
special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_group.o
gcc $(OPTIONS) -o special_element coxeter.o linalg.o special_element.o mat.o enumerate_triangle_group.o -lm -lgmp -lmps -lgsl -lcblas
@@ -28,8 +28,8 @@ special_element: special_element.o coxeter.o linalg.o mat.o enumerate_triangle_g
singular_values.o: singular_values.c $(HEADERS)
gcc $(OPTIONS) -c singular_values.c
singular_values_mpi.o: singular_values_mpi.c $(HEADERS)
mpicc $(OPTIONS) -c singular_values_mpi.c
#singular_values_mpi.o: singular_values_mpi.c $(HEADERS)
# mpicc $(OPTIONS) -c singular_values_mpi.c
special_element.o: special_element.c $(HEADERS)
gcc $(OPTIONS) -c special_element.c
@@ -46,5 +46,8 @@ coxeter.o: coxeter.c $(HEADERS)
mat.o: mat.c $(HEADERS)
gcc $(OPTIONS) -c mat.c
parallel.o: parallel.c $(HEADERS)
gcc $(OPTIONS) -c parallel.c
clean:
rm -f singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o
rm -f singular_values special_element singular_values_mpi coxeter.o linalg.o singular_values.o singular_values_mpi.o mat.o special_element.o convert.hi convert.o convert billiard_words.hi billiard_words.o billiard_words enumerate_triangle_group.o parallel.o

parallel.c (new file, 349 lines)

@@ -0,0 +1,349 @@
#include "parallel.h"
#include <mpi.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <malloc.h>
#include <stdlib.h>
#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
//#define DEBUG(msg, ...) fprintf(stderr, "[ %10.3f] " msg, runtime(), ##__VA_ARGS__)
//#define DEBUG_MPI(msg, node, ...) fprintf(stderr, "[%003d%10.3f] " msg, node, runtime(), ##__VA_ARGS__)
#define DONE(x) *((int*)(x))
enum message_tag {
PARALLEL_ORDER,
PARALLEL_RESULT,
PARALLEL_SHUTDOWN,
PARALLEL_GLOBAL_DATA
};
struct timespec starttime;
int mpi_rank(int activate_mpi)
{
static int active = 0;
if(activate_mpi)
active = 1;
if(!active)
return 0;
else {
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
return rank;
}
}
void start_timer()
{
clock_gettime(CLOCK_MONOTONIC, &starttime);
}
double runtime()
{
struct timespec curtime;
double diff;
clock_gettime(CLOCK_MONOTONIC, &curtime);
return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9;
}
parallel_context *parallel_init()
{
parallel_context *ctx = malloc(sizeof(parallel_context));
if(!getenv("OMPI_COMM_WORLD_SIZE")) {
ctx->mpi_mode = 0;
DEBUG("Running standalone.\n");
return ctx;
}
ctx->mpi_mode = 1;
int result = MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
MPI_Get_processor_name(ctx->processor_name, &ctx->processor_name_len);
mpi_rank(1); // display the rank in debug output from now on
if(ctx->rank == 0)
DEBUG("Running in mpi mode, %d nodes.\n", ctx->size);
return ctx;
}
void parallel_destroy(parallel_context* ctx)
{
if(ctx->mpi_mode) {
MPI_Type_free(&ctx->order_datatype);
MPI_Type_free(&ctx->result_datatype);
MPI_Finalize();
}
free(ctx);
}
void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size)
{
ctx->init = init;
ctx->destroy = destroy;
ctx->job = job;
ctx->global_data_size = global_data_size;
ctx->node_data_size = node_data_size;
ctx->input_size = input_size;
ctx->output_size = output_size;
if(ctx->mpi_mode) {
// create a datatype for job orders, consisting of an integer (the job id) and a user-defined section
int order_blocklengths[2] = {1, input_size};
MPI_Aint order_displacements[2] = {0, sizeof(int)};
MPI_Datatype order_types[2] = {MPI_INT, MPI_BYTE};
MPI_Type_create_struct(2, order_blocklengths, order_displacements, order_types, &ctx->order_datatype);
MPI_Type_commit(&ctx->order_datatype);
int result_blocklengths[2] = {1, output_size};
MPI_Aint result_displacements[2] = {0, sizeof(int)};
MPI_Datatype result_types[2] = {MPI_INT, MPI_BYTE};
MPI_Type_create_struct(2, result_blocklengths, result_displacements, result_types, &ctx->result_datatype);
MPI_Type_commit(&ctx->result_datatype);
}
}
int parallel_work(parallel_context *ctx)
{
// do nothing in non-mpi mode
if(ctx->mpi_mode == 0)
return 0;
MPI_Status status;
void *global_data = malloc(ctx->global_data_size);
void *node_data = malloc(ctx->node_data_size);
void *input_and_job_nr = malloc(ctx->input_size + sizeof(int));
void *input = input_and_job_nr + sizeof(int);
int *job_nr = (int *)input_and_job_nr;
void *output_and_job_nr = malloc(ctx->output_size + sizeof(int));
void *output = output_and_job_nr + sizeof(int);
int *output_job_nr = (int *)output_and_job_nr;
double jobtime;
// wait for global data
MPI_Bcast(global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);
DEBUG("Global data received\n");
// initialize node_data (and do once-per-node computation)
ctx->init(global_data, node_data);
DEBUG("Initialization completed\n");
while(1) {
MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD,
&status);
DEBUG("Message received: source = %d, tag = %d\n", status.MPI_SOURCE, status.MPI_TAG);
if(status.MPI_TAG == PARALLEL_SHUTDOWN) {
DEBUG("Shutting down\n");
break;
} else if(status.MPI_TAG == PARALLEL_ORDER) {
MPI_Recv(input_and_job_nr,
1, ctx->order_datatype,
0, PARALLEL_ORDER, MPI_COMM_WORLD,
&status);
DEBUG("Working on job %d\n", *job_nr);
jobtime = -MPI_Wtime();
// do the actual work
ctx->job(global_data, node_data, input, output);
jobtime += MPI_Wtime();
DEBUG("Finished job %d in %f seconds\n", *job_nr, jobtime);
*output_job_nr = *job_nr;
MPI_Send(output_and_job_nr,
1, ctx->result_datatype,
0, PARALLEL_RESULT, MPI_COMM_WORLD);
}
}
ctx->destroy(global_data, node_data);
free(global_data);
free(node_data);
free(input_and_job_nr);
free(output_and_job_nr);
return 0;
}
int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *_restart_filename)
{
// in non-mpi-mode, just run init1, init2, forall(jobs) job
if(ctx->mpi_mode == 0) {
int result;
void *node_data = malloc(ctx->node_data_size);
result = ctx->init(global_data, node_data);
if(result != 0)
goto cleanup_standalone;
for(int i = 0; i < njobs; i++) {
result = ctx->job(
global_data,
node_data,
input_array + ctx->input_size*i,
output_array + ctx->output_size*i);
if(result != 0)
goto cleanup_standalone;
}
cleanup_standalone:
ctx->destroy(global_data, node_data);
return result;
} else {
// if no restart file was specified, pick a filename
char *restart_filename;
char buffer[128];
int restartf;
if(_restart_filename == NULL) {
time_t t = time(NULL);
struct tm *loctm = localtime(&t);
strftime(buffer, sizeof(buffer), "restart/restart_%y%m%d_%H%M%S", loctm);
restart_filename = buffer;
} else {
restart_filename = (char *)_restart_filename;
}
// open restart file if it exists, otherwise create it
int continuing = 1;
restartf = open(restart_filename, O_RDWR);
if(restartf == -1 && errno == ENOENT) {
restartf = open(restart_filename, O_RDWR | O_CREAT, 0666);
continuing = 0;
}
if(restartf == -1) {
DEBUG("Error opening restart file: %s\n", strerror(errno));
exit(1);
}
// map restart file
int itemsize = (ctx->output_size + sizeof(int)); // for every job, store output, and completed flag
ftruncate(restartf, njobs*itemsize);
void *alljobs = mmap(0, njobs*itemsize, PROT_READ | PROT_WRITE, MAP_SHARED, restartf, 0);
if(alljobs == MAP_FAILED) {
DEBUG("Error mapping restart file: %s\n", strerror(errno));
exit(1);
}
// count completed jobs, or initialize jobs
int completed = 0;
if(continuing) {
for(int i = 0; i < njobs; i++)
if(DONE(alljobs + i*itemsize))
completed++;
} else {
for(int i = 0; i < njobs; i++) {
DONE(alljobs + i*itemsize) = 0;
memcpy(alljobs + i*itemsize + sizeof(int), input_array + i*ctx->input_size, ctx->input_size); // copy input data; note that each slot only reserves output_size bytes, so this assumes input_size <= output_size
}
}
fsync(restartf);
if(continuing) {
DEBUG("Continuing from restart file, %d/%d jobs completed, %d nodes\n", completed, njobs, ctx->size);
} else {
DEBUG("Starting from scratch, %d jobs, %d nodes\n", njobs, ctx->size);
}
if(completed >= njobs)
goto cleanup_mpi;
/* Send global data */
MPI_Bcast((void*)global_data, ctx->global_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);
DEBUG("Global data sent\n");
void *input_message_buffer = malloc(ctx->input_size + sizeof(int));
void *output_message_buffer = malloc(ctx->output_size + sizeof(int));
// find next unfinished job
int current = 0;
while(current < njobs && DONE(alljobs + current*itemsize))
current++;
// assign initial jobs, 2 for each worker node
for(int i = 0; i < 2*(ctx->size-1); i++) {
if(current >= njobs) // all jobs are assigned
break;
// send job id and input data
// send to all nodes except ourself (node 0)
*((int*)input_message_buffer) = current;
memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size);
MPI_Send(input_message_buffer, 1, ctx->order_datatype,
i%(ctx->size-1)+1, PARALLEL_ORDER, MPI_COMM_WORLD);
DEBUG("Job %d sent to node %d\n", current, i%(ctx->size-1)+1);
current++;
}
MPI_Status status;
int active_worker_nodes = ctx->size - 1;
while(1) {
MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if(status.MPI_TAG == PARALLEL_RESULT) {
MPI_Recv(output_message_buffer, 1, ctx->result_datatype,
MPI_ANY_SOURCE, PARALLEL_RESULT, MPI_COMM_WORLD, &status);
int id = *((int*)output_message_buffer);
memcpy(alljobs + id*itemsize + sizeof(int), output_message_buffer + sizeof(int), ctx->output_size);
DONE(alljobs + id*itemsize) = 1;
completed++;
DEBUG("job %d completed by node %d\n", id, status.MPI_SOURCE);
// todo: deal with unresponsive nodes
// strategy: when no jobs left, go through unfinished list again, incrementing oversubscribe counter
// if oversubscribe counter is at limit, shut node down instead
//
if(current >= njobs) { // all jobs are assigned, shut down node
MPI_Send(NULL, 0, MPI_BYTE, status.MPI_SOURCE, PARALLEL_SHUTDOWN, MPI_COMM_WORLD);
active_worker_nodes--;
if(active_worker_nodes)
continue;
else
break;
}
*((int*)input_message_buffer) = current;
memcpy(input_message_buffer + sizeof(int), input_array + current*ctx->input_size, ctx->input_size);
MPI_Send(input_message_buffer, 1, ctx->order_datatype,
status.MPI_SOURCE, PARALLEL_ORDER, MPI_COMM_WORLD);
DEBUG("Job %d sent to node %d\n", current, status.MPI_SOURCE);
current++;
}
}
for(int i = 0; i < njobs; i++) {
memcpy(output_array + i*ctx->output_size, alljobs + i*itemsize + sizeof(int), ctx->output_size);
}
free(input_message_buffer);
free(output_message_buffer);
cleanup_mpi:
munmap(alljobs, njobs*itemsize);
close(restartf);
}
return 0;
}
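For orientation, two ad-hoc memory layouts are used above: order/result messages consist of a job id followed immediately by the raw input or output bytes (matching the displacements passed to MPI_Type_create_struct), and the mmap'ed restart file is a flat array of per-job slots holding a completed flag and the job's output. The sketch below only restates those offsets with illustrative helper names; it is not part of parallel.c.

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* An order message for order_datatype: one MPI_INT (the job id) followed
 * immediately by input_size raw bytes (displacement sizeof(int), no padding). */
static void *pack_order(int job_id, const void *input, int input_size)
{
    void *buf = malloc(sizeof(int) + input_size);
    memcpy(buf, &job_id, sizeof(int));
    memcpy((char *)buf + sizeof(int), input, input_size);
    return buf;
}

/* A restart-file slot: a "done" flag followed by the job's output.  The input
 * copied in when the file is created occupies the same bytes and is
 * overwritten by the output once the job completes. */
static size_t slot_size(size_t output_size)
{
    return sizeof(int) + output_size;
}

static size_t output_offset(size_t output_size, unsigned int job)
{
    return (size_t)job * slot_size(output_size) + sizeof(int);
}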

parallel.h (new file, 118 lines)

@@ -0,0 +1,118 @@
#ifndef PARALLEL_H
#define PARALLEL_H
/*
this is a library to parallelize workloads which can be split up naturally
into a sequence of independent jobs, using MPI. A program will usually
- do precomputation
- fill array with input data
- do the parallel work
- print the output data
we want to enable restarts, so that only unfinished jobs need to be repeated.
Further, we want to be resilient to slow/unreliable networks and to losing
nodes. There is a main node and a number of workers. The main node does the
precomputation and then retires to do administrative work, and the workers
do the actual jobs. We also want to switch to serial mode if the program is
called without MPI.
The following data has to be transmitted between nodes:
- results of the precomputation (read-only, shared between nodes)
- job-specific input data, generated by main node before parallel part
- output data for each job
the parallel work shall be given as a callback function which takes
input data and precomputation data as parameters
the above program will look like this for us:
- parallel_init
- if we are a worker, do parallel_work(init_callback, job_callback), exit
- do precomputation
- fill array with input data
- output_array = parallel_run(input_array)
- print the output data
parallel_init:
- check if we're running as an mpi program
- init mpi, check what kind of node we are
parallel_work(init_callback1, init_callback2, job_callback):
- receive global_precomp (???)
- worker_precomp = init_callback2(global_precomp, worker_precomp)
- infinite loop:
- wait for job on network, receive input
- output = job_callback(global_precomp, worker_precomp, input)
- send output on network
- exit loop on shutdown signal
parallel_run(global_precomp, input_array, restart file, callbacks):
- check if we're running as an MPI program
- send global_precomp to all nodes (if MPI)
- if(restart file given and exists) read restart file
- else create new restart file
- until(all jobs finished):
- if MPI:
- send next job & input to appropriate node
- if all jobs are in work, reassign unfinished ones (up to limit)
- collect outputs
- if no MPI:
- worker_precomp = init_callback1
- worker_precomp = init_callback2(global_precomp, worker_precomp)
- for(j in jobs)
- output(j) = job_callback(global_precomp, worker_precomp, input(j))
- delete restart file
- return array of outputs
parallel_destroy():
- free everything
have a context? probably yes: parallel_context
plan:
- make interface
- implement no-MPI part
- restructure singular_values.c to use interface
- implement MPI part
*/
#include <mpi.h>
#include <time.h>
typedef void (*parallel_callback_destroy)(const void*, void*);
typedef int (*parallel_callback_init)(const void*,void*);
typedef int (*parallel_callback_job)(const void*,void*,const void*,void*);
typedef struct {
int mpi_mode;
struct timespec starttime;
char processor_name[MPI_MAX_PROCESSOR_NAME];
int processor_name_len;
int rank;
int size;
MPI_Datatype order_datatype;
MPI_Datatype result_datatype;
parallel_callback_init init;
parallel_callback_job job;
parallel_callback_destroy destroy;
void *global_data;
void *node_data;
int global_data_size;
int node_data_size;
int input_size;
int output_size;
} parallel_context;
parallel_context *parallel_init();
void parallel_set_datasize_and_callbacks(parallel_context *ctx, parallel_callback_init init, parallel_callback_job job, parallel_callback_destroy destroy, int global_data_size, int node_data_size, int input_size, int output_size);
int parallel_work(parallel_context *ctx);
int parallel_run(parallel_context *ctx, const void *global_data, const void *input_array, void *output_array, unsigned int njobs, const char *restart_filename);
void parallel_destroy(parallel_context* ctx);
int mpi_rank(int activate_mpi);
void start_timer();
double runtime();
#endif
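To make the calling convention concrete, here is a minimal driver sketch against this interface, following the outline in the header comment above. The my_* types and callbacks are illustrative placeholders, not part of the library; error handling is omitted, and under MPI the restart file is created under restart/ because no filename is passed.

#include "parallel.h"
#include <stdio.h>
#include <stdlib.h>

struct my_global { int scale; };   /* broadcast to every node */
struct my_input  { int value; };   /* one per job */
struct my_output { int result; };  /* one per job */

static int my_init(const void *global, void *node) { (void)global; (void)node; return 0; }
static void my_destroy(const void *global, void *node) { (void)global; (void)node; }

static int my_job(const void *global_, void *node, const void *in_, void *out_)
{
    const struct my_global *global = global_;
    const struct my_input *in = in_;
    struct my_output *out = out_;
    (void)node;
    out->result = global->scale * in->value;
    return 0;
}

int main(void)
{
    enum { NJOBS = 10 };
    struct my_global global = { .scale = 3 };
    struct my_input inputs[NJOBS];
    struct my_output outputs[NJOBS];

    start_timer();
    parallel_context *ctx = parallel_init();
    parallel_set_datasize_and_callbacks(ctx, my_init, my_job, my_destroy,
        sizeof(global), 1 /* no per-node state needed */,
        sizeof(struct my_input), sizeof(struct my_output));

    /* workers only execute jobs and then exit */
    if(ctx->mpi_mode && ctx->rank != 0) {
        parallel_work(ctx);
        parallel_destroy(ctx);
        return 0;
    }

    for(int i = 0; i < NJOBS; i++)
        inputs[i].value = i;

    /* the main node (or the only process, when run without mpirun) distributes the jobs;
     * NULL lets the library pick a restart file under restart/ */
    parallel_run(ctx, &global, inputs, outputs, NJOBS, NULL);

    for(int i = 0; i < NJOBS; i++)
        printf("%d -> %d\n", inputs[i].value, outputs[i].result);

    parallel_destroy(ctx);
    return 0;
}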


@@ -1,6 +1,9 @@
#!/bin/bash
nmax=895882 # up to reflection group word length 22
# nmax=895882 # up to reflection group word length 22 ( 555 group)
nmax=700000 # up to reflection group word length 22 ( 444 group)
# nmax=11575 # up to reflection group word length 14
time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
#time mpirun --mca opal_warn_on_missing_libcuda 0 -x LD_LIBRARY_PATH=/home/stecker/svmpi/libs ./singular_values $nmax ejp_trg_restart test.out
time mpirun --mca opal_warn_on_missing_libcuda 0 --mca mpi_yield_when_idle 1 -np 4 ./singular_values 700000 4 4 4 1 10 100 1 10 100

singular_values.c

@@ -2,21 +2,55 @@
#include "linalg.h"
#include "mat.h"
#include "enumerate_triangle_group.h"
#include "parallel.h"
#include <time.h>
#define SWAP(t,x,y) do { t _tmp = (x); (x) = (y); (y) = _tmp; } while (0);
#define DEBUG(msg, ...) fprintf(stderr, "[%10.3f] " msg, runtime(), ##__VA_ARGS__);
#define DEBUG(msg, ...) fprintf(stderr, "[%003d%10.3f] " msg, mpi_rank(0), runtime(), ##__VA_ARGS__)
//#define DEBUG(msg, ...)
struct result {
int id;
int count;
mpq_t tr;
mpq_t trinv;
double x;
double y;
double slope;
};
// we want as much as possible to be node data, except if it is only known to the main node
// (command line arguments) or should only be computed once (id list)
struct global_data {
// command line arguments
unsigned int nmax;
unsigned int p1, p2, p3;
unsigned int sstart, send, sdenom;
unsigned int qstart, qend, qdenom;
unsigned int *id_list;
unsigned int id_list_length;
};
struct node_data {
group_t *group;
mat* matrices;
struct result *invariants;
struct result **distinct_invariants;
int distinct_invariants_length;
mps_context *solver;
};
struct input_data {
unsigned int snum, sden;
unsigned int qnum, qden;
};
struct output_data {
int max_slope_id;
double max_slope;
};
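As the comment above explains, only global_data is known to the main node and has to be broadcast; since a pointer does not survive transmission as raw bytes, the id_list is stored directly behind the struct in the same allocation and the pointer is rebuilt on the receiving side (init_node below does exactly that). A minimal sketch of this allocation pattern, with an illustrative helper name:

/* Sketch only: allocate global_data with the id_list appended to the same
 * block, so the whole thing can be broadcast as one run of raw bytes. */
static struct global_data *alloc_global_data(unsigned int nmax)
{
    struct global_data *g = malloc(sizeof(struct global_data) + nmax*sizeof(int));
    g->id_list = (unsigned int *)(g + 1);  /* rebuilt again after transmission */
    g->id_list_length = 0;
    g->nmax = nmax;
    return g;
}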
static int compare_result(const void *a_, const void *b_)
@@ -75,20 +109,6 @@ static int compare_result_by_slope(const void *a_, const void *b_)
return slopea > slopeb ? -1 : slopea < slopeb ? 1 : 0;
}
struct timespec starttime;
static void start_timer()
{
clock_gettime(CLOCK_MONOTONIC, &starttime);
}
static double runtime()
{
struct timespec curtime;
double diff;
clock_gettime(CLOCK_MONOTONIC, &curtime);
return (curtime.tv_sec - starttime.tv_sec) + (curtime.tv_nsec - starttime.tv_nsec) / 1e9;
}
int compute_invariants(group_t *group, mat *matrices, struct result **invariants, int *n, int unique)
{
mpq_t tmp;
@@ -120,10 +140,8 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants
for(int i = 0; i < ntraces; i++) {
if(i == 0 || compare_result(&invariants[i], &invariants[nuniq-1]) != 0) {
invariants[nuniq] = invariants[i];
invariants[nuniq]->count = 1;
nuniq++;
} else {
invariants[nuniq-1]->count++;
int oldlength = group->elements[invariants[nuniq-1]->id].length;
int newlength = group->elements[invariants[i]->id].length;
if(newlength < oldlength)
@@ -141,7 +159,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants
max_slope = 0;
for(int i = 0; i < nuniq; i++) {
retval = solve_characteristic_polynomial(solver, poly, invariants[i]->tr, invariants[i]->trinv, evs);
retval = 0; evs[0] = 2; evs[1] = 1; evs[2] = 0.5; // fake solving the polynomial for memory leak test
if(retval == 1) {
fprintf(stderr, "Error! Could not solve polynomial.\n");
continue;
@@ -161,6 +179,7 @@ int compute_invariants(group_t *group, mat *matrices, struct result **invariants
invariants[i]->x = x;
invariants[i]->y = y;
invariants[i]->slope = y/x;
if(y/x > max_slope + 1e-12 && (x > 0.1 || y > 0.1)) {
max_slope_id = invariants[i]->id;
@@ -194,180 +213,249 @@ long check_memory_usage(mat *matrices, int n)
return total;
}
int main(int argc, char *argv[])
void destroy_node(const void *_g, void *_n)
{
mpq_t s, q, t, tmp;
int p1, p2, p3;
int sstart, send, sdenom, qstart, qend, qdenom;
mat *matrices;
group_t *group;
int nmax, n;
int max_slope_id;
char buf[100];
char buf2[100];
struct result *invariants;
struct result **distinct_invariants;
struct global_data *g = (struct global_data *)_g;
struct node_data *n = (struct node_data *)_n;
start_timer();
mpq_inits(s, q, t, tmp, NULL);
if(argc < 11) {
fprintf(stderr, "Usage: %s <N> <p1> <p2> <p3> <s start> <s end> <s denom> <q start> <q end> <q denom>\n", argv[0]);
exit(1);
for(int i = 0; i < g->nmax; i++) {
mpq_clear(n->invariants[i].tr);
mpq_clear(n->invariants[i].trinv);
}
nmax = atoi(argv[1]);
p1 = atoi(argv[2]);
p2 = atoi(argv[3]);
p3 = atoi(argv[4]);
sstart = atoi(argv[5]);
send = atoi(argv[6]);
sdenom = atoi(argv[7]);
qstart = atoi(argv[8]);
qend = atoi(argv[9]);
qdenom = atoi(argv[10]);
free(n->invariants);
free(n->distinct_invariants);
for(int i = 0; i < g->nmax; i++)
mat_clear(n->matrices[i]);
free(n->matrices);
coxeter_clear(n->group);
}
int init_node(const void *_g, void *_n)
{
struct global_data *g = (struct global_data *)_g;
struct node_data *n = (struct node_data *)_n;
DEBUG("Allocate\n");
matrices = malloc(nmax*sizeof(mat));
for(int i = 0; i < nmax; i++)
mat_init(matrices[i], 3);
invariants = malloc(nmax*sizeof(struct result));
distinct_invariants = malloc(nmax*sizeof(struct result));
for(int i = 0; i < nmax; i++) {
mpq_init(invariants[i].tr);
mpq_init(invariants[i].trinv);
g->id_list = (unsigned int*)(g+1); // pointers get scrambled by transmission, reconstruct
n->matrices = malloc(g->nmax*sizeof(mat));
for(int i = 0; i < g->nmax; i++)
mat_init(n->matrices[i], 3);
n->invariants = malloc(g->nmax*sizeof(struct result));
n->distinct_invariants = malloc(g->nmax*sizeof(struct result)); // we won't need that many, but just in case
for(int i = 0; i < g->nmax; i++) {
mpq_init(n->invariants[i].tr);
mpq_init(n->invariants[i].trinv);
n->invariants[i].id = i;
}
// order of the triangle reflection generators: a, b, c
// order of the rotation orders: bc, ac, ab
DEBUG("Generate group\n");
group = coxeter_init_triangle(p1, p2, p3, nmax);
n->group = coxeter_init_triangle(g->p1, g->p2, g->p3, g->nmax);
// first run; compute all matrices
for(int i = 0; i < group->size; i++)
group->elements[i].need_to_compute = 1;
return 0;
}
int do_computation(const void *_g, void *_n, const void *_in, void *_out)
{
struct global_data *g = (struct global_data *)_g;
struct node_data *n = (struct node_data *)_n;
struct input_data *in = (struct input_data *)_in;
struct output_data *out = (struct output_data *)_out;
mpq_t s, q;
mpq_inits(s, q, NULL);
mpq_set_ui(s, in->snum, in->sden);
mpq_set_ui(q, in->qnum, in->qden);
DEBUG("Computing max slope element for s = %d/%d and q = %d/%d.\n",
in->snum, in->sden,
in->qnum, in->qden);
for(int i = 0; i < n->group->size; i++)
n->group->elements[i].need_to_compute = 0;
n->group->elements[0].need_to_compute = 1;
int needed_elements = 1;
for(int i = 0; i < g->id_list_length; i++)
{
int id = g->id_list[i];
n->distinct_invariants[i] = &n->invariants[id];
groupelement_t *cur = &n->group->elements[id];
while(cur->need_to_compute == 0) {
cur->need_to_compute = 1;
needed_elements++;
cur = cur->parent->parent; // also need to compute its even-length ancestors
}
cur = n->group->elements[id].inverse;
while(cur->need_to_compute == 0) {
cur->need_to_compute = 1;
needed_elements++;
cur = cur->parent->parent;
}
}
n->distinct_invariants_length = g->id_list_length;
DEBUG("Need to compute %d elements to get %d traces up to reflection length %d\n",
needed_elements, g->id_list_length, n->group->elements[n->group->size-1].length);
DEBUG("Compute matrices\n");
enumerate(n->group, n->matrices, g->p1, g->p2, g->p3, s, q);
DEBUG("Compute invariants\n");
out->max_slope_id = compute_invariants(
n->group, n->matrices,
n->distinct_invariants, &n->distinct_invariants_length, 1);
out->max_slope = n->invariants[out->max_slope_id].slope;
mpq_clears(s, q, NULL);
return 0;
}
int main(int argc, char *argv[])
{
char buf[100];
char buf2[100];
struct global_data *g;
struct node_data n;
start_timer();
// parse command line arguments
if(argc < 11) {
fprintf(stderr, "Usage: %s <N> <p1> <p2> <p3> <s start> <s end> <s denom> <q start> <q end> <q denom>\n", argv[0]);
exit(1);
}
int nmax = atoi(argv[1]);
g = (struct global_data*)malloc(sizeof(struct global_data) + nmax*sizeof(int));
g->id_list = (unsigned int*)(g+1);
g->nmax = nmax;
g->p1 = atoi(argv[2]);
g->p2 = atoi(argv[3]);
g->p3 = atoi(argv[4]);
g->sstart = atoi(argv[5]);
g->send = atoi(argv[6]);
g->sdenom = atoi(argv[7]);
g->qstart = atoi(argv[8]);
g->qend = atoi(argv[9]);
g->qdenom = atoi(argv[10]);
// initialize
parallel_context *ctx = parallel_init();
parallel_set_datasize_and_callbacks(ctx, init_node, do_computation, destroy_node,
sizeof(struct global_data) + g->nmax*sizeof(int),
sizeof(struct node_data),
sizeof(struct input_data),
sizeof(struct output_data));
if(ctx->mpi_mode == 1 && ctx->rank != 0) {
// worker mode
parallel_work(ctx);
parallel_destroy(ctx);
exit(0);
}
init_node(g, &n);
// use very generic values for the pilot run unless sstart=send and qstart=qend
if(sstart == send && qstart == qend) {
mpq_set_ui(s, sstart, sdenom);
mpq_set_ui(q, qstart, qdenom);
DEBUG("Single run for s = %d/%d, q = %d/%d\n", sstart, sdenom, qstart, qdenom);
struct input_data pilot_input;
struct output_data pilot_output;
if(g->sstart == g->send && g->qstart == g->qend) {
pilot_input.snum = g->sstart;
pilot_input.sden = g->sdenom;
pilot_input.qnum = g->qstart;
pilot_input.qden = g->qdenom;
DEBUG("Single run for s = %d/%d, q = %d/%d\n", g->sstart, g->sdenom, g->qstart, g->qdenom);
} else {
mpq_set_ui(s, 4, 100);
mpq_set_ui(q, 7, 100);
pilot_input.snum = 4;
pilot_input.sden = 100;
pilot_input.qnum = 7;
pilot_input.qden = 100;
DEBUG("Initial run for s = %d/%d, q = %d/%d\n", 4, 100, 7, 100);
}
DEBUG("Compute matrices\n");
enumerate(group, matrices, p1, p2, p3, s, q);
g->id_list_length = 0;
for(int i = 0; i < n.group->size; i++)
if(n.group->elements[i].length % 2 == 0 && n.group->elements[i].inverse)
g->id_list[g->id_list_length++] = i;
// prepare array of ids
n = 0;
for(int i = 0; i < group->size; i++)
do_computation(g, &n, &pilot_input, &pilot_output);
for(int i = 0; i < n.distinct_invariants_length; i++)
g->id_list[i] = n.distinct_invariants[i]->id;
g->id_list_length = n.distinct_invariants_length;
if(g->sstart != g->send || g->qstart != g->qend) {
struct input_data *inputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct input_data));
struct output_data *outputs = malloc((g->send - g->sstart + 1)*(g->qend - g->qstart + 1)*sizeof(struct output_data));
int njobs = 0;
for(int sloop = g->sstart; sloop <= g->send; sloop++) {
for(int qloop = g->qstart; qloop <= g->qend; qloop++) {
inputs[njobs].sden = g->sdenom;
inputs[njobs].qden = g->qdenom;
inputs[njobs].snum = sloop;
inputs[njobs].qnum = qloop;
njobs++;
}
}
parallel_run(ctx, g, inputs, outputs, njobs, NULL);
// DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, g->sdenom, qloop, g->qdenom);
for(int i = 0; i < njobs; i++)
{
if(group->elements[i].length % 2 != 0 || !group->elements[i].inverse)
continue;
invariants[i].id = i;
distinct_invariants[n++] = &invariants[i];
gmp_printf("%d/%d %d/%d %s %f\n",
inputs[i].snum, inputs[i].sden, inputs[i].qnum, inputs[i].qden,
print_word(&n.group->elements[outputs[i].max_slope_id], buf),
outputs[i].max_slope);
}
DEBUG("Compute invariants\n");
max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 1);
// prepare for next time; don't need to change ids in distinct_invariants!
for(int i = 0; i < group->size; i++)
group->elements[i].need_to_compute = 0;
group->elements[0].need_to_compute = 1;
int multiplication_count = 1;
for(int i = 0; i < n; i++) {
groupelement_t *cur = &group->elements[distinct_invariants[i]->id];
while(cur->need_to_compute == 0) {
cur->need_to_compute = 1;
multiplication_count++;
cur = cur->parent->parent; // also need to compute its even-length ancestors
}
cur = group->elements[distinct_invariants[i]->id].inverse;
while(cur->need_to_compute == 0) {
cur->need_to_compute = 1;
multiplication_count++;
cur = cur->parent->parent;
}
}
DEBUG("Would have needed %d matrix multiplications for %d unique traces up to reflection length %d\n", multiplication_count, n, group->elements[group->size-1].length);
if(sstart != send || qstart != qend) {
for(int sloop = sstart; sloop <= send; sloop++) {
for(int qloop = qstart; qloop <= qend; qloop++) {
DEBUG("Loop for s = %d/%d, q = %d/%d\n", sloop, sdenom, qloop, qdenom);
mpq_set_ui(s, sloop, sdenom);
mpq_set_ui(q, qloop, qdenom);
DEBUG("Compute matrices\n");
enumerate(group, matrices, p1, p2, p3, s, q);
DEBUG("Compute invariants\n");
max_slope_id = compute_invariants(group, matrices, distinct_invariants, &n, 0);
// output
gmp_printf("%Qd %Qd %s\n", s, q,
print_word(&group->elements[max_slope_id], buf));
fflush(stdout);
}
}
free(inputs);
free(outputs);
} else {
// output
for(int i = 0; i < n; i++) {
double slope = distinct_invariants[i]->y/distinct_invariants[i]->x;
for(int i = 0; i < n.distinct_invariants_length; i++) {
// exclude tr = trinv = 2/1/0/-1/3
mpq_t tmp;
mpq_init(tmp);
mpq_set_si(tmp, 2, 1);
if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
continue;
mpq_set_si(tmp, 1, 1);
if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
continue;
mpq_set_si(tmp, 0, 1);
if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
continue;
mpq_set_si(tmp, -1, 1);
if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
continue;
mpq_set_si(tmp, 3, 1);
if(mpq_cmp(distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(distinct_invariants[i]->trinv, tmp) == 0)
if(mpq_cmp(n.distinct_invariants[i]->tr, tmp) == 0 &&
mpq_cmp(n.distinct_invariants[i]->trinv, tmp) == 0)
continue;
mpq_clear(tmp);
gmp_printf("%d %d %s %f\n",
distinct_invariants[i]->id, distinct_invariants[i]->count,
print_word(&group->elements[distinct_invariants[i]->id], buf),
slope
);
double slope = n.distinct_invariants[i]->y/n.distinct_invariants[i]->x;
/*
gmp_printf("%d %d %d %Qd %Qd %f %f %f %f %f %s\n",
distinct_invariants[i]->id, distinct_invariants[i]->count, cumulative,
distinct_invariants[i]->tr, distinct_invariants[i]->trinv,
log(fabs(mpq_get_d(distinct_invariants[i]->tr))), log(fabs(mpq_get_d(distinct_invariants[i]->trinv))),
distinct_invariants[i]->x, distinct_invariants[i]->y, slope,
print_word(&group->elements[distinct_invariants[i]->id], buf)
);
*/
gmp_printf("%d %s %f\n",
n.distinct_invariants[i]->id,
print_word(&n.group->elements[n.distinct_invariants[i]->id], buf),
slope);
}
}
DEBUG("Clean up\n");
for(int i = 0; i < nmax; i++) {
mpq_clear(invariants[i].tr);
mpq_clear(invariants[i].trinv);
}
free(invariants);
free(distinct_invariants);
for(int i = 0; i < nmax; i++)
mat_clear(matrices[i]);
free(matrices);
coxeter_clear(group);
mpq_clears(s, q, t, tmp, NULL);
destroy_node(g, &n);
free(g);
parallel_destroy(ctx);
}