/* total.c
 *
 * total - summarizes columnar data
 * By Jon Rifkin <jon.rifkin@uconn.edu>
 * Copyright 1999-2005 Jonathan Rifkin
 *
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */


/*
------------------------------------------------------------------------
History
------------------------------------------------------------------------
*/
/*
Jan 23, 2000
	Started
Oct 19, 2001
	Added Queue option
2005-11-03
   Went crazy refactoring the entire program.
	- Added check_manual() to allow user to specify data_action_inc and
	  data_action_out independently.
	- Added s,i,I options to key field to store keys as string (default)
	  or as 4 byte ip address with normal (i) or zero-padded (I) output.
	- Added automatic ungzip, and forced -g (ungzip) or -p (plain text).
*/

/*

------------------------------------------------------------------------
Compile Switches
------------------------------------------------------------------------
*/
#define PARSE_COLUMN
#define USE_SORT
#define GET_DATA_COL


/*
------------------------------------------------------------------------
Include files
------------------------------------------------------------------------
*/
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "hash.h"
#include "gzfile.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif



/*
------------------------------------------------------------------------
Usage
------------------------------------------------------------------------
*/
/*  Help text, one array entry per output line.  The list ends with the
 *  VERSION string followed by a 0 (null pointer) sentinel.  VERSION is
 *  not defined in this file; presumably it comes from config.h or the
 *  build command line - confirm.
 *  NOTE(review): main() also accepts -m (manual rules), -f, -c and -i;
 *  the synopsis line below only names some of the options.  */
char *Usage_m[] = {
"",
"   total -dmqsvFN <key-col> <data-col> <file>",
"",
"     -d     Debug",
"     -f M   Print first M records (must also use sort option)",
"     -q N   Store at most N records, printing overflow as they're deleted",
"     -s I[r][,J[r][,...]]",
"            Sort records by columns I,J,..; in reverse order if 'r'",
"     -v     Print version info and exit",
"     -F c   Use character c as field delimiter",
"     -N H   Use H number of slots in hash table (-v option prints default)",
"     -c d   Use delimeter d (for instance ',' or '.') in integer format",
"     -i     Use 'quick' number parser, saves 10% if reading only integers",
"",
"   Read text data <file>.  Each line contains a record consisting",
"   of space delimited fields, some of which are numeric.  Use",
"   <key-col> (comma delimited list of columns starting at 1, or '-'",
"   for no key) as record keys, and for each unique set of record",
"   keys calculate the column values requested in <data-col>.",
"",
"   <key-col>  comma delimited list of column numbers OR just '-'",
"              to indicate no keys, this groups all file records together.",
"   <data-col> comma delimited list of column numbers follow by",
"                   s  Sum (the default).",
"                   a  Average.",
"                   d  Standard deviation.",
"                   m  Minimum.",
"                   x  Maximum.",
"                   n  Number of rows.",
"                   f  Column value from first row.",
"                   l  Column value from last row.",
"                   z  If sum is non-zero, set value to 1.",
"",
"   Examples:",
"",
"     Input file:    burt dog  10 5",
"                    burt fish  1 1",
"                    bill dog   2 2",
"                    burt dog   5 5",
"                    bill fish  3 1",
"",
"     Command:       total 1,2 3,4 input.fil",
"",
"     Output:        burt dog  15 10",
"                    burt fish  1  1",
"                    bill dog   2  2",
"                    bill fish  3  1",
"",
"     Command:       total 2 4,3,n input.fil",
"",
"     Output:        dog  12 17 3",
"                    fish  2  4 2",
"",
"     Command:       total - n input.fil",
"                    5",
"",
VERSION,
0
};


						 
/*
------------------------------------------------------------------------
DEFINES
------------------------------------------------------------------------
*/
/*  Size of the fixed arrays that are indexed by column: the field
 *  pointer array in main() and buf/src/dst/action inside action_t  */
#define MAX_COL 256

/*  This is the maximum number of characters in an input line and also
 *  in a key (size of the line buffer and key buffer in main()).  */
#define NSTR    4192

/*  Byte type; main() casts key and data buffers to (U_CHAR *) when
 *  handing them to the hash table routines  */
#define U_CHAR unsigned char

/*  Default number of slots in hash table (the -N option overrides it)  */
#define N_HASH_SLOTS  500000

/*  Returned by find_action_col() when no matching action exists  */
#define FOUND_FALSE -1

/*  Values for the 'firsttime' argument of convert_data_inc()  */
#define FIRST_FALSE 0
#define FIRST_TRUE  1


/*  General file type - chooses between stdin, text file or gzipped file.
 *  NOTE(review): none of the G_* names are referenced in the visible
 *  part of this file; presumably they belong with the gzfile layer -
 *  confirm before removing.  */
#define G_STDIN 0
#define G_PLAIN 1
#define G_GZIP  2
#define G_BUFLEN 65536


/*
------------------------------------------------------------------------
Debugging Macros
------------------------------------------------------------------------
*/
/*  Trace helpers.  Each one prints the current source file and line to
 *  stdout and then flushes stdout so the message is not lost if the
 *  program dies right afterwards.  (The flush used to target stdin,
 *  which is undefined for an input stream and left the trace buffered.)
 *
 *  WRITETXT(TXT)            print location and the string TXT
 *  WRITEMSG                 print location only
 *  WRITEVAR(VAR,FMT)        print location, the expression text and its
 *                           value; FMT is a bare printf conversion such
 *                           as %d or %s (it is stringified)
 *
 *  Every macro expands to several statements and already ends in ';',
 *  so never use one as the unbraced body of an if/else or loop.  */
#define WRITETXT(TXT) \
	printf ("FILE %s LINE %i: \"%s\"\n", __FILE__, __LINE__, TXT); \
	fflush (stdout);

#define WRITEMSG \
	printf ("In file %s at line %i.\n", __FILE__, __LINE__); \
	fflush (stdout);

#define WRITEVAR(VAR_NAME,VAR_TYPE) \
	printf ("FILE %s LINE %i: ", __FILE__, __LINE__); \
	printf ("%s <", #VAR_NAME); \
	printf (#VAR_TYPE, (VAR_NAME) ); \
	printf (">\n"); \
	fflush (stdout);


/*
------------------------------------------------------------------------
Extern variables
------------------------------------------------------------------------
*/
/*  used by getopt() function  */
extern char *optarg;

/*
------------------------------------------------------------------------
Types
------------------------------------------------------------------------
*/

/*   This structure holds several kinds of information.  In its most
 *   general use it describes a transformation of one array to another.
 *   Elements of src[] and dst[] define positions in an input and an
 *   output array, while action[] is a character code identifying the
 *   operation that maps src[] to dst[].
 *
 *   It is also used to describe a list of columns to sort on (in which
 *   case only src[] and action[] are consulted: see dcmp()).
 *
 *   Only the first n entries of src[], dst[] and action[] are valid.  */
typedef struct {
	/*  Scratch space used by convert_data_out() to build the output
	 *  row before copying it back over the stored data  */
	double buf[MAX_COL];
	/*  For each operation, map input column src[] to output column dst[]
	 *  using action[].  Positions are zero based; parse_column() stores
	 *  the user's column number minus one, so -1 means "no column".  */
	int src[MAX_COL];
	int dst[MAX_COL];
	int action[MAX_COL];
	/*  Number of actions stored  */
	int n;
	/*  Number of columns written (largest dst[] plus one); set by
	 *  parse_manual() and get_inc_from_out(), not by parse_column()  */
	int nout;
} action_t;


/*
------------------------------------------------------------------------
File variables
------------------------------------------------------------------------
*/
/*  Input lines whose first non-blank character equals the first
 *  character of this string are skipped (see is_comment())  */
char *comment_char_m = "#";
/*  Set to 1 by the -d option; main() then prints hash table debug info  */
int  debug_m = 0;

/*  Information about which fields to sort on  */
action_t sort_action_m;

/*  Information about the input fields that are combined to determine
 *  the hash key used for each record  */
action_t key_action_m;

/*  Information about how to convert input data to stored data */
action_t data_action_inc_m;

/*  Information about how to convert stored data to program output  */
action_t data_action_out_m;

/*  Zero when integer grouping is off; otherwise the grouping character
 *  itself (first character of the -c argument), used by prcomma()  */
int use_comma_m = 0;

/*  Flag to select 'quick' number parser (set by -i)  */
int use_quick_m = 0;


/*
------------------------------------------------------------------------
Local Function Prototypes
------------------------------------------------------------------------
*/
int is_whitespace(int);
int is_comment(char *);
int find_action_col (action_t *, int, int);
int get_fields (char *s, char **fptr, int max_field, int field_sep);
void prformat(double);
void prcomma(double d);
int isnumber (char *);
void print_element(helem_t  *, int field_sep);
void print_key    (helem_t  *, int field_sep);
int dcmp (const void *aptr, const void *bptr);
int get_max_index ( helem_t **, int, int (*cmp)(const void *, const void *));
void parse_column       (char *, int, action_t *);
void get_inc_from_out (action_t *,   action_t *);

/*  Find maximum position stored in src and dst arrays in action_t  */
int  get_max_src (action_t *, int);
int  get_max_dst (action_t *, int);

/*  Make key  */
void make_key(char *keybuff, int *nkeybuff, char **fptr, action_t *key, int nlen);
void Usage(void);

/*  Convert data to be stored  */
void convert_data_inc(action_t *proc, double *data, char **fptr, int firsttime);

/*  Convert data before it is written out  */
void convert_data_out(action_t *proc, double *data);

int  copychars(char *p, char *buf, int nbuf, int is_digits);
void parse_manual (char *arg, int def_char, action_t *t);
void parse_auto   (char *arg, action_t *t1,  action_t *t2);
void check_manual (action_t *, action_t *);

void ip2char (char *p, char *buf);
void char2ip (char *buf, char *str, int pad);

void debug_action(char *msg, action_t *a);

double atof_quick(const char *);

/*
------------------------------------------------------------------------
Main function
------------------------------------------------------------------------
*/
/*  Program entry point.
 *
 *  1. Parse options, then the key column list and the data column
 *     list(s) that follow them on the command line.
 *  2. Read every remaining argument as an input file, one line at a
 *     time, accumulating per-key data in a hash table.
 *  3. Convert the stored data to output form and print it, optionally
 *     sorted (-s) and limited to the first M records (-f).
 *
 *  Returns 0 on success (and after printing usage or version), 1 on a
 *  bad option or an allocation failure.
 */
int main (int argc, char *argv[]) {
	int nfield;
	int i;
	char *keystr, *datastr;
	char buffer [NSTR];
	size_t buflen;
	int    nfirstdata;
	double *firstdata = NULL;
	double *rdata   = NULL;
	char keybuff[NSTR];
	int  nkeybuff;
	char *fptr[MAX_COL];
	htable_t *ht = NULL;
	helem_t  *t        = NULL;
	int max_col;
	int optchar;
	int nhashslots = N_HASH_SLOTS;
	int max_queue_length=0;
	int max_output=0;
	int use_sort  =0;
	int use_top_sort = 0;
	int use_manual   = 0;
	helem_t **index = NULL;
	int field_sep = ' ';
	int max_index;
	char *read_result;
	gfile_t gp;

	/*  Read command line options.
	 *  NOTE(review): 'g' and 'p' are accepted by the option string but
	 *  have no case below, so they end up in the default branch.  */
	while (-1 != (optchar=getopt(argc,argv,"F:N:c:df:gimpq:s:v"))) {
		switch (optchar) {
		case '?':
			return 1;
		/*  Debugging option  */
		case 'd':
			debug_m = 1;
			printf ("Debugging mode is on\n");
			break;
		case 'v':
			printf ("Version %s (compiled %s)\n", VERSION,__DATE__);
			printf ("Default number of hash slots = %d\n", N_HASH_SLOTS);
			return 0;
			break;
		case 'N':
			nhashslots = atoi(optarg);
			if (nhashslots<1) {
				printf (
				"ERROR: Number of hash slots (-N%d) must be one or greater\n", 
				nhashslots);
				exit(1);
			}
			break;
		case 'F':
			field_sep = optarg[0];
			break;
		case 'q':
			max_queue_length = atoi(optarg);
			if (max_queue_length<0) max_queue_length=0;
			break;
		case 'f':
			max_output = atoi(optarg);
			/*  A negative limit used to reach calloc() as a count;
			 *  treat it like -q does, as "no limit"  */
			if (max_output<0) max_output=0;
			break;
		case 's':
			use_sort   = 1;
			parse_column (optarg, 'f', &sort_action_m);
			break;
		case 'm':
			use_manual = 1;
			break;
		case 'c':
			use_comma_m = optarg[0];
			break;
		case 'i':
			use_quick_m = 1;
			break;
		default:
			printf ("option (%c) not yet implemented\n", optchar);
			return 1;
		}
	}

	/*  Print Usage  */
	/*  Are there enough remaining arguments on the command line?
	 *  (key, data and file; manual mode needs two data arguments)  */
	if ( (use_manual && (argc-optind)<4) || (argc-optind)<3 ) {
		Usage();
		return 0;
	}
	
	/*  Read user supplied keystr to determine what input fields to use
	 *  as data element keys.  The argument is copied because the
	 *  parsers split it in place with strtok().  */
	keystr = strdup (argv[optind++]);
	if (NULL==keystr) {
		fprintf (stderr, "ERROR:  Cannot allocate memory for key columns\n");
		return 1;
	}
	parse_column (keystr, 's', &key_action_m);

	/*  Use low-level 'manual' method of obtaining data_action* where
	 *  user supplies two strings to define the two structures  */
	if (use_manual) {
		datastr = strdup (argv[optind++]);
		if (NULL==datastr) {
			fprintf (stderr, "ERROR:  Cannot allocate memory for data columns\n");
			return 1;
		}
		parse_manual (datastr, 's', &data_action_inc_m);
		datastr = strdup (argv[optind++]);
		if (NULL==datastr) {
			fprintf (stderr, "ERROR:  Cannot allocate memory for data columns\n");
			return 1;
		}
		parse_manual (datastr, 's', &data_action_out_m);
		check_manual (&data_action_inc_m, &data_action_out_m);

	/*  Use 'automatic' method of obtaining data_action* where user
	 *  supplies one string describing desired output and the program
	 *  determines the action taken at input time and the actions taken
	 *  at output time  */
	} else {
		datastr = strdup (argv[optind++]);
		if (NULL==datastr) {
			fprintf (stderr, "ERROR:  Cannot allocate memory for data columns\n");
			return 1;
		}
		parse_column (datastr, 's', &data_action_out_m);
		get_inc_from_out (&data_action_out_m, &data_action_inc_m);
	}


	/*  Find maximum column that must be read from each input line.  This
	 *  is determined from the key and data columns requested.
	 *  By knowing the max column, and if the max column needed is less
	 *  than the total number of columns in the input line, we can save
	 *  significant work by splitting only the needed number of fields
	 *  instead of splitting all the fields.  */
	max_col = -1;
	max_col = get_max_src(&key_action_m, max_col);
	max_col = get_max_src(&data_action_inc_m, max_col);
	
	/*  Allocate storage for input data when adding a new key to hash.  */
	/*  Determine the maximum number of data elements to store per hash
	 *  element, whether storing input data or processing output data.
	 *  The firstdata[] array is copied to each hash element at creation
	 *  time, so the size of firstdata[] will determine the size of the
	 *  hash elements.  */
	nfirstdata = get_max_dst(&data_action_inc_m,0);
	nfirstdata = get_max_dst(&data_action_out_m,nfirstdata);
	nfirstdata = nfirstdata + 1;
	firstdata  = (double *) calloc (nfirstdata, sizeof(double));
	if (NULL==firstdata) {
		fprintf (stderr, "ERROR:  Cannot allocate memory for record data\n");
		return 1;
	}

	/*  Initialize hash table  */
	ht = ht_init(nhashslots, max_queue_length ? HT_HISTORY : 0);
	if (NULL==ht) {
		fprintf (stderr, "ERROR:  Cannot allocate hash table\n");
		return 1;
	}

	/*  Loop over all input files  */
	for (;optind<argc;optind++) {

		/*  Open generalised file (either standard in, plain text file or
		 *  gzipped file).
		 *  NOTE(review): the result of gopen() is not checked here; its
		 *  failure convention is not visible in this file.  */
		gopen (&gp, argv[optind], "r");

		/*  Read file  */
		while ((read_result = ggets(buffer, NSTR, &gp))) {

			/*  Remove trailing new line, but only if there is one.  The
			 *  last line of a file may lack it, a line longer than the
			 *  buffer arrives in pieces, and an empty string must not
			 *  be indexed at -1.  */
			buflen = strlen(buffer);
			if (buflen>0 && buffer[buflen-1]=='\n')
				buffer[buflen-1] = '\0';

			/*  Skip comments  */
			if (is_comment(buffer))  continue;

			/*  Read fields from input line; lines with too few fields
			 *  are skipped  */
			nfield = get_fields (buffer, fptr, max_col+1, field_sep);
			if (max_col>=nfield) continue;

			/*  Form key from input columns  */
			make_key(keybuff, &nkeybuff, fptr, &key_action_m, NSTR);

			/*  If key found in hash table, update the stored data  */
			t=ht_findelem (ht,(U_CHAR *)&keybuff, nkeybuff);
			if (t) { 

				/*  Get pointer to data for this hash element  */
				rdata = (double *) t->data;

				convert_data_inc (&data_action_inc_m, rdata, fptr, FIRST_FALSE);
		
				/*  Make current key the newest, so data output can be newest
				 *  first  */
				ht_makenewest (ht, t);
				
			/*  No key found, create new entry  */
			} else {

				convert_data_inc (&data_action_inc_m, firstdata, fptr, FIRST_TRUE);

				/*  Store new data with new key  */
				ht_storekey (ht, (U_CHAR *) &keybuff, nkeybuff,
					(U_CHAR *) firstdata, nfirstdata*sizeof(firstdata[0]));

			}

			/*  If number of keys exceeds limit specified by -q, print and
			 *  remove the oldest  */
			if (max_queue_length && ht_getcount(ht)>max_queue_length) {
				t = ht_getoldest(ht);
				convert_data_out (&data_action_out_m, (double *) t->data);
				print_element (t, field_sep);
				ht_freeelem(ht,t);
			}

		}   /*  read file  */
			
		gclose(&gp);
	}

	/*  Do final output conversion on all stored data  */
	ht_initwalk(ht);
	while ( (t = ht_getnext(ht)) ) {
		convert_data_out (&data_action_out_m, (double *) t->data);
	}

	/*  If no elements, then suppress sort  */
	if (ht_getcount(ht)==0) {
		use_sort = 0;
		max_output = 0;
	}

	/*  When only printing the first N elements, a full sort is
	 *  relatively inefficient.  Instead, keep a running set of the best
	 *  N elements if N is less than 1/3 of the total number.
	 *  This value of 1/3 is a very rough efficiency estimate, feel free
	 *  to change this if you have a better idea.
	 */
	if (use_sort && max_output && max_output<ht_getcount(ht)/3)
		use_top_sort = 1;

	/*  Sort and print top max_output only (the -s and -f options)  */
	if (use_top_sort) {

		/*  Allocate index storage for number of top data  */
		index = (helem_t **) calloc(max_output, sizeof(helem_t *));
		if (NULL==index) {
			printf ("ERROR:  Cannot allocate memory for top_sort index\n");
			return 1;
		}

		/*  Place the first max_output elements into index (there are
		 *  more than 3*max_output elements, so none of these is NULL)  */
		ht_initwalk(ht);
		for (i=0;i<max_output;i++) {
			index[i] = ht_getnext(ht);
		}

		/*  Read through rest of list; whenever an element sorts before
		 *  the current worst entry of index[], it replaces that entry  */
		max_index = get_max_index(index,max_output,dcmp);
		while ( (t = ht_getnext(ht)) ) {
			if (dcmp(index+max_index,&t)>0) {
				index[max_index] = t;
				max_index = get_max_index(index,max_output,dcmp);
			}
		}

		/*  Now have pointers to top max_output elements 
		 *  next sort and print
		 */
		qsort (index, max_output, sizeof(helem_t*), dcmp);
		for (i=0;i<max_output;i++) {
			t = index[i];
			print_element (t, field_sep);
		}
		free (index);


	/*  Sort and print all elements  */
	} else if (use_sort) {
		ht_initwalk(ht);
		index = (helem_t **) calloc (ht_getcount(ht), sizeof(helem_t *));
		if (NULL==index) {
			printf ("ERROR:  Cannot allocate memory for sort index\n");
			return 1;
		}
		/*  Fill index[] from the back; the count is non-zero here
		 *  because use_sort is cleared for an empty table  */
		i=ht_getcount(ht);
		do {
			i--;
			index[i] = ht_getnext(ht);
		} while (i);
		qsort (index, ht_getcount(ht), sizeof(helem_t*), dcmp);
		for (i=0;i<ht_getcount(ht);i++) {
			t = index[i];
			if (max_output && i>=max_output)
				break;
			print_element (t, field_sep);
		}
		free (index);

	/*  Print all elements unsorted  */
	} else {
	
		/*  Write results  */
		ht_initwalk (ht);
		while ((t=ht_getnext(ht))) {
			/*  Print key and data  */
			print_element (t, field_sep);
		}
	
	}

	/*  Print debug information if asked  */
	if (debug_m) ht_debuginfo(ht);
		
	/*  Don't bother freeing table, it takes a lot of time because we free
	 *  each node individually, just let OS free it up, it goes *much* faster
	 */
	/*  Free table  */
	/*
	ht_free (ht);
	*/

	return 0;
}
	


/*
------------------------------------------------------------------------
Local functions
------------------------------------------------------------------------
*/
/*  Return 1 if character c occurs in the string delimiter, else 0.
 *  The search is done with strchr(), which also matches the string's
 *  terminating '\0', so c==0 yields 1.  */
int is_delimiter(int c, char *delimiter) {
	char *hit = strchr(delimiter, c);

	if (hit == NULL) {
		return 0;
	}
	return 1;
}

/*  Return 1 if c is a blank, tab, newline or carriage return, else 0.
 *  The end-of-string character '\0' is NOT whitespace: a strchr() based
 *  test reports it as a match (strchr finds the terminator), which lets
 *  a caller looping "while (is_whitespace(*s)) s++" run off the end of
 *  its string.  */
int is_whitespace(int c) {
	return (c==' ' || c=='\t' || c=='\n' || c=='\r');
}

/*  Return 1 if the first non-whitespace character of s is the comment
 *  character (first character of comment_char_m), else 0.  A line that
 *  is empty or all whitespace is not a comment.  */
int is_comment(char *s) {
	char *p;

	/*  Advance to the first character that is not whitespace, or to the
	 *  end of the string  */
	for (p = s; *p != '\0'; p++) {
		if (!is_whitespace(*p)) {
			break;
		}
	}
	return (comment_char_m[0] == *p);
}

/*  Split string s in place into at most max_field fields.
 *
 *  s          line to split; separators are overwritten with '\0'
 *  fptr       receives a pointer to the start of each field
 *  max_field  stop after this many fields (the rest of s is left alone)
 *  field_sep  ' ' means "runs of blanks and tabs separate fields and
 *             leading blanks are ignored"; any other value is a single
 *             character separator, so two adjacent separators enclose
 *             an empty field
 *
 *  Returns the number of fields stored in fptr.  */
int get_fields (char *s, char **fptr, int max_field, int field_sep) {
	int count = 0;
	char *cur = s;

	/*  Single character separator  */
	if (field_sep != ' ') {
		while (*cur != '\0' && count < max_field) {
			fptr[count] = cur;
			count++;
			/*  Walk to the separator, cut the string there and step
			 *  past it; stop at end of string otherwise  */
			for (; *cur != '\0'; cur++) {
				if (*cur == field_sep) {
					*cur = '\0';
					cur++;
					break;
				}
			}
		}
		return count;
	}

	/*  Blank/tab separated: drop leading blanks first  */
	cur += strspn(cur, " \t");
	while (*cur != '\0' && count < max_field) {
		fptr[count] = cur;
		count++;
		/*  End of this token  */
		cur += strcspn(cur, " \t");
		/*  Terminate the token unless the line ends here  */
		if (*cur != '\0') {
			*cur = '\0';
			cur++;
		}
		/*  Swallow the rest of the blank run  */
		cur += strspn(cur, " \t");
	}
	return count;
}


/*  Search the first proc->n entries of proc for one whose source column
 *  is 'col' and whose action code is 'action'.  Returns the index of
 *  the first such entry, or FOUND_FALSE if there is none.
 */
int find_action_col (action_t *proc, int col, int action) {
	int idx = 0;

	while (idx < proc->n) {
		if (proc->action[idx]==action && proc->src[idx]==col) {
			return idx;
		}
		idx++;
	}
	return FOUND_FALSE;
}


/*  Print one value preceded by a blank.  Values with no fractional part
 *  are printed as integers (through prcomma() when use_comma_m is
 *  set); everything else is printed in %e notation.  */
void prformat(double d) {
	/*  Not a whole number  */
	if (rint(d) != d) {
		printf (" %e",   d);
		return;
	}
	/*  Whole number, plain  */
	if (!use_comma_m) {
		printf (" %.0f", d);
		return;
	}
	/*  Whole number with digit grouping  */
	prcomma(d);
}

/*  Print a whole-number double preceded by a blank, with the character
 *  held in use_comma_m inserted between every group of three digits
 *  (1234567 -> " 1,234,567").
 *
 *  Differences from a naive in-place shuffle:
 *   - the text buffers are large enough for any double ("%.0f" of
 *     DBL_MAX is 309 digits) and the conversion is bounded by snprintf;
 *   - a leading '-' is copied first and not counted as a digit, so
 *     -123 prints as "-123" and not "-,123";
 *   - text that is not made of digits (such as "inf") is printed
 *     unchanged.  */
void prcomma(double d) {
	char digits[512];      /*  sign + up to 309 digits + NUL  */
	char out[768];         /*  digits plus one separator per 3 digits  */
	const char *p = digits;
	int ndigits, i, j = 0;

	ndigits = snprintf (digits, sizeof digits, "%.0f", d);
	if (ndigits < 0 || (size_t) ndigits >= sizeof digits) {
		/*  Cannot happen for a finite double; print ungrouped  */
		printf (" %.0f", d);
		return;
	}

	/*  Keep the sign out of the digit grouping  */
	if (*p == '-') {
		out[j++] = *p++;
		ndigits--;
	}

	/*  Infinity and the like: nothing to group  */
	if (*p < '0' || *p > '9') {
		printf (" %s", digits);
		return;
	}

	/*  A separator goes in front of every digit that has a multiple of
	 *  three digits (itself included) remaining, except the first  */
	for (i=0;i<ndigits;i++) {
		if (i > 0 && (ndigits-i) % 3 == 0)
			out[j++] = (char) use_comma_m;
		out[j++] = p[i];
	}
	out[j] = '\0';
	printf (" %s", out);
}


/*  Return 1 if every character of str is a decimal digit '0'..'9',
 *  else 0.  Signs, decimal points and blanks make the string
 *  non-numeric; the empty string counts as numeric.  */
int isnumber (char *str) {
	const char *p;

	for (p = str; *p != '\0'; p++) {
		if (!(*p >= '0' && *p <= '9')) {
			return 0;
		}
	}
	return 1;
}


/*  Print one output line for hash element t: its key (via print_key)
 *  followed by the first data_action_out_m.nout stored values, each
 *  formatted by prformat().  A NULL element prints an empty line.  */
void print_element (helem_t *t, int field_sep) {
	if (t != NULL) {
		double *values;
		int    col;

		/*  Key first  */
		print_key(t, field_sep);

		/*  Then every requested output column  */
		values = (double *) t->data;
		for (col=0;col<data_action_out_m.nout;col++) {
			prformat (values[col]);
		}
	}
	printf ("\n");
}


/*  Turn the stored (accumulated) values of one record into its output
 *  values, in place.
 *
 *  out    output rules: rule k reads stored slot src[k] and writes
 *         output column dst[k] according to action[k]
 *  rdata  stored values on entry, output values on return (the first
 *         out->nout entries are overwritten)
 *
 *  The output row is built in out->buf, which starts as all zeros, so
 *  the arithmetic rules (+ - * /) combine with whatever earlier rules
 *  put into the same output column.  Rules with an unrecognised action
 *  code are ignored.  */
void convert_data_out(action_t *out, double *rdata) {
	int k;

	/*  Start from an all-zero output row  */
	for (k=0;k<out->nout;k++) {
		out->buf[k] = 0.0;
	}

	for (k=0;k<out->n;k++) {
		const int from = out->src[k];
		const int to   = out->dst[k];
		const int act  = out->action[k];

		if (act=='m' || act=='x' || act=='f' ||
		    act=='l' || act=='s' || act=='n') {
			/*  Stored value is already the answer  */
			out->buf[to] = rdata[from];
		} else if (act=='a') {
			/*  Average: stored sum over the row count kept in slot 0  */
			if (rdata[0]==0.0) {
				out->buf[to] = 0;
			} else {
				out->buf[to] = rdata[from]/rdata[0];
			}
		} else if (act=='z') {
			/*  1 when the stored value is zero, else 0  */
			out->buf[to] = (rdata[from]==0) ? 1 : 0;
		} else if (act=='Z') {
			/*  0 when the stored value is zero, else 1  */
			out->buf[to] = (rdata[from]==0) ? 0 : 1;
		} else if (act=='+') {
			out->buf[to] += rdata[from];
		} else if (act=='-') {
			out->buf[to] -= rdata[from];
		} else if (act=='*') {
			out->buf[to] *= rdata[from];
		} else if (act=='/') {
			/*  Division by zero yields zero  */
			if (rdata[from]==0.0) {
				out->buf[to] = 0.0;
			} else {
				out->buf[to] /= rdata[from];
			}
		}
	}

	/*  Copy the finished row back over the stored data  */
	for (k=0;k<out->nout;k++) {
		rdata[k] = out->buf[k];
	}
}


/*  Callback function called by qsort() to order output records  */
int dcmp (const void *aptr, const void *bptr) {
	helem_t *a = * ( (helem_t **) aptr );
	helem_t *b = * ( (helem_t **) bptr );
	double *ad = (double *) a->data;
	double *bd = (double *) b->data;
	double diff;
	int i,j,idiff;

	for (i=0;i<sort_action_m.n;i++) {
		j = sort_action_m.src[i];
		diff = ad[j] - bd[j];
		if (diff==0.0) 
			continue;
		idiff = diff>0 ? 1 : -1;
		if (sort_action_m.action[i]=='r')
			return -idiff;
		else
			return idiff;
	}
	return 0;
}


/*  
Parse a comma separated column list such as "1,3a,n".

arg               string being parsed for data column and actions; it is
                  split in place with strtok()
def_char          action code used for items that end in a digit
t                 receives one src/dst/action entry per item; t->n is
                  the number of entries (t->nout is not set here)

Each item is a one-based column number optionally followed by a single
action character.  Both src[] and dst[] are set to the zero-based
column.

Note that an item with no digits (legal action code 'n', or '-' for "no
key") will result in column number -1.  Need to take precautions.

Exits with an error when more than MAX_COL items are given or when a
column number exceeds MAX_COL (main() stores field pointers in an array
of MAX_COL entries).
*/
void parse_column (char *arg, int def_char, action_t *t) {
	char *ptr;
	char lastchr;
	size_t len;
	int  col;
	t->n=0;
	ptr=strtok(arg,",");
	while (ptr) {
		/*  Print error if too many data columns.  The arrays hold
		 *  MAX_COL entries, so index MAX_COL is already out of range.  */
		if (t->n>=MAX_COL) {
			fprintf (stderr,
				"ERROR:  To many data columns specified.  Max allowed is %d\n", 
				MAX_COL);
			exit (1);
		}
		/*  Get trailing character if a,m,n,s,x (strtok never returns
		 *  an empty token, so len is at least 1)  */
		len = strlen(ptr);
		lastchr = ptr[len-1];
		if (!strchr("0123456789", lastchr)) {
			t->action[t->n] = lastchr;
			ptr[len-1] = '\0';
		} else {
			t->action[t->n] = def_char;
		}
		/*  Read column number for this data item  */
		col = atoi(ptr)-1;
		if (col>=MAX_COL) {
			fprintf (stderr,
				"ERROR:  Column number %d is too large.  Max allowed is %d\n", 
				col+1, MAX_COL);
			exit (1);
		}
		t->src[(t->n)  ] = col;
		t->dst[(t->n)++] = col;
		ptr=strtok(NULL,",");
	}
}

/*  Copy the leading run of characters of one class from p into buf.
 *
 *  p          source string
 *  buf        destination, always '\0' terminated on return
 *  nbuf       size of buf; at most nbuf-1 characters are copied
 *  is_digits  non-zero: copy leading decimal digits;
 *             zero:     copy leading non-digit characters
 *
 *  Returns the number of characters copied.  */
int copychars(char *p, char *buf, int nbuf, int is_digits) {
	int len = 0;
	int want_digit = (is_digits != 0);

	while (len < nbuf-1 && p[len] != '\0') {
		int got_digit = (p[len] >= '0' && p[len] <= '9');

		/*  Stop at the first character of the other class  */
		if (got_digit != want_digit) {
			break;
		}
		buf[len] = p[len];
		len++;
	}
	buf[len] = '\0';
	return len;
}

/*  Parse parameter which maps columns read from input file to output
 *  data  */
/*  Under construction  */
#if 0
void parse_auto (char *arg, action_t *inc, action_t *out) {
	int c;
	/*  Replace trailing '\0' by ','  */
	if (*arg) {
		arg[strlen(arg)] = ',';
	} else {
		return;
	}
	/*  First input action is to count fields  */
	inc->src[0] = 0;
	inc->dst[0] = 0;
	inc->action[0] = 'n';
	inc->n = 1;
	/*  Break argument up on commas  */
	next_intermediate = 1;
	/*  Keep reading until end of arg string when we break out of while
	 *  loop  */
	out->n = 0;
	while (1) {
		/*  Read input column number  */
		n = 0;
		while (*arg>='0' && *arg<='9') {
			n = 10*n + *(arg++) - '0';
		}
		/*  Stop if end of string  */
		if (*arg=='\0') break;
		/*  Read operation value */
		op = c = *(arg++);
		/*  If found a comma insead of operation then use '+'  */
		if (op==',') {
			op = '+';
		}
		/*  Have we recorded this input->intermediate action before?  */
		this_intermediate = find_action_col (inc, n, op);
		if (this_intermediate!=-1) {
			/*  Set input column  */
			inc->src[inc->n] = n-1;
			/*  Set intermediate column  */
			inc->dst[inc->n] = next_intermediate;
			/*  Set action  */
			inc->action[inc->n] = op;
			inc->n++;
			this_intermediate = next_intermediate;
			next_intermediate++;
		}
		/*  Update output action  */
		out->src[out->n]    = this_intermediate;
		out->dst[out->n]    = this_out;
		out->action[out->n] = op;
		out->n++;
		/*  If at a comma then set output action  */
		if (c==',')  this_out++;
	}
}
#endif

/*  Parse parameter which maps either from input data to intermediate
 *  storage or from intermediate storage to output.
 *
 *  arg       comma separated rules, each "<src>[<action>[<dst>]]" where
 *            <src> and <dst> are one-based positions and <action> is a
 *            single non-digit character; split in place with strtok()
 *  def_char  action code used when a rule has no action character
 *  t         receives the rules; t->n is the rule count and t->nout is
 *            set to the largest destination plus one
 *
 *  A rule without <dst> writes to the same position it reads from.
 *  Exits with an error when there are more than MAX_COL rules or when a
 *  position is zero, missing, or larger than MAX_COL (positions index
 *  arrays of MAX_COL entries elsewhere in the program).  */
void parse_manual (char *arg, int def_char, action_t *t) {
	char *ptr;
	char buf[100];  /*  Ridiculously oversized option char buffer  */
	int  nbuf = 100;
	int  nread1,nread2;
	int  i;

	t->n=0;
	ptr=strtok(arg,",");
	while (ptr) {
		/*  Print error if too many data columns.  The arrays hold
		 *  MAX_COL entries, so index MAX_COL is already out of range.  */
		if (t->n>=MAX_COL) {
			fprintf (stderr,
				"ERROR:  To many data columns specified.  Max allowed is %d\n", 
				MAX_COL);
			exit (1);
		}
		/*  Get first digits  */
		nread1 = copychars (ptr, buf, nbuf, 1);
		t->src[t->n] = atoi(buf)-1;
		/*  Read symbol if present, otherwise use the caller's default  */
		nread2 = copychars (ptr+nread1, buf, nbuf, 0);
		if (buf[0]=='\0')
			t->action[t->n] = def_char;
		else
			t->action[t->n] = buf[0];
		/*  Read last digits if present  */
		copychars (ptr+nread1+nread2, buf, nbuf, 1);
		if (buf[0]=='\0') 
			t->dst[t->n] = t->src[t->n];
		else
			t->dst[t->n] = atoi(buf)-1;
		if (t->src[t->n]<0 || t->dst[t->n]<0) {
			printf ("ERROR:  Cannot use zero for a manual column\n");
			exit(1);
		}
		if (t->src[t->n]>=MAX_COL || t->dst[t->n]>=MAX_COL) {
			fprintf (stderr,
				"ERROR:  Manual column number is too large.  Max allowed is %d\n", 
				MAX_COL);
			exit (1);
		}
		/*  Increment number of stored actions  */
		t->n++;
		ptr=strtok(NULL,",");
	}

	/*  Number of columns to which this set of rules writes  */
	t->nout = -1;
	for (i=0;i<t->n;i++) {
		if ( t->nout < t->dst[i] )  
			t->nout = t->dst[i];
	}
	t->nout++;
}

/*  Check for consistency between input actions and output actions:
 *  every slot an output rule reads (out->src) must be a slot that some
 *  input rule writes (inc->dst).  Prints an error and exits with
 *  status 1 at the first output rule that fails the check.  */
void check_manual (action_t *inc, action_t *out) {
	int o;

	for (o=0;o<out->n;o++) {
		int w = 0;

		/*  Look for an input rule writing the slot this rule reads  */
		while (w<inc->n && inc->dst[w]!=out->src[o]) {
			w++;
		}
		/*  Ran off the end: nobody writes that slot  */
		if (w>=inc->n) {
			printf ("ERROR: Your manual output rule reads from a non-existant input elemenet\n");
			exit(1);
		}
	}
}


/*  Search the n pointers in index[] for the element that sorts last
 *  according to cmp and return its position.  On ties the earliest
 *  position wins.  Returns 0 when n is 1 or less.  cmp receives the
 *  addresses of two array slots, exactly as qsort() would pass them.  */
int get_max_index (
		helem_t **index, 
		int n,
		int (*cmp)(const void *, const void *))
{
	int best = 0;
	int k = 1;

	while (k<n) {
		/*  Current best sorts before candidate k: k is the new best  */
		if ( cmp( &index[best], &index[k] )<0 ) {
			best = k;
		}
		k++;
	}
	return best;
}


/*  Read integer from string using simple quick algorithm.  If it turns
 *  out that string is not a simple integer (anything but the digits
 *  '0'..'9', for instance a sign, a decimal point or a leading blank),
 *  or if the digits would overflow a long, call the standard library
 *  routine atof() on the whole string instead.  An empty string gives
 *  0.  */
double atof_quick(const char *pin) {
	const char *p;
	long sum = 0;

	for (p = pin; *p != '\0'; p++) {
		int digit;

		/*  If string not simple integer then call standard library atof() */
		if (*p<'0' || *p>'9') return atof(pin);
		digit = *p - '0';

		/*  sum*10+digit would exceed LONG_MAX (signed overflow is
		 *  undefined), so let atof() handle this large value  */
		if (sum > (LONG_MAX - digit)/10) return atof(pin);

		/*  Multiply current sum by 10 and add current digit  */
		sum = sum*10 + digit;
	}
	return (double) sum;
}

#if 0
/*  Disabled (#if 0): first-row variant of applying an action and input value to a stored value  */
void convert_data_inc_first (action_t *inc, double *dout, char **fptr) {
	int i;
	double *dptr, val=0;
	for (i=0;i<inc->n;i++) {
		if (inc->action[i]!='n') {
			if (use_quick_m) {
				val  = atof_quick(fptr[inc->src[i]]);
			} else {
				val  = atof(fptr[inc->src[i]]);
			}
		}
		dptr = dout + inc->dst[i];
		switch (inc->action[i]) {
			/*  Store sum  */
			case 'a':
			case 's':
			case '+':
			case '*':
			case 'm':
			case 'x':
			case 'f':
			case 'l':
				*dptr  = val;
				break;
			/*  Count occurances */
			case 'n':
				*dptr = 1.0;
				break;
			/*  Subtract  */
			case '-':
				*dptr  = -val;
				break;
			/*  Zero */
			case 'z':
				if (val==0.0) {
					*dptr = 1.0;
				} else {
					*dptr = 0.0;
				}
				break;
			/*  Zero */
			case 'Z':
				if (val==0.0) {
					*dptr = 0.0;
				} else {
					*dptr = 1.0;
				}
				break;
			/*  Store variance (sum of squares)  */
			case 'd':
			case 'e':
				*dptr  = val*val;
				break;
		}
	}
}

/*  Apply action and input value to stored value  */
void convert_data_inc_next(action_t *inc, double *dout, char **fptr) {
	int i;
	double *dptr, val=0;
	for (i=0;i<inc->n;i++) {
		if (inc->action[i]!='n') {
			if (use_quick_m) {
				val  = atof_quick(fptr[inc->src[i]]);
			} else {
				val  = atof(fptr[inc->src[i]]);
			}
		}
		dptr = dout + inc->dst[i];
		switch (inc->action[i]) {
			/*  Store sum  */
			case 'a':
			case 's':
			case '+':
				*dptr += val;
				break;
			/*  Count numbe of occurances  */
			case 'n':
				*dptr += 1.0;
				break;
			/*  Subtract  */
			case '-':
				*dptr -= val;
				break;
			/*  Multiply */
			case '*':
				*dptr *= val;
				break;
			/*  Division */
			case '/':
				if (val==0.0)
					*dptr = 0.0;
				else 
					*dptr /= val;
				break;
			/*  Zero */
			case 'z':
				if (val==0.0) *dptr += 1;
				break;
			/*  Not Zero */
			case 'Z':
				if (val!=0.0) *dptr += 1;
				break;
			/*  Store variance (sum of squares)  */
			case 'd':
			case 'e':
				*dptr += val*val;
				break;
			/*  Store minimum value  */
			case 'm':
				if (val<*dptr) *dptr = val;
				break;
			/*  Store maximum value  */
			case 'x':
				if (val>*dptr) *dptr = val;
				break;
			/*  Store last occurance  */
			case 'l':
				*dptr = val;
				break;
		}
	}
}
#endif



/*  Apply one input line to the stored values of a record.
 *
 *  inc        input rules: rule i reads field src[i] of the line and
 *             updates stored slot dst[i] according to action[i]
 *  dout       stored values of the record
 *  fptr       fields of the current input line (see get_fields())
 *  firsttime  non-zero for the first line of a record: the slot is
 *             initialised from the line instead of being combined with
 *             its old content
 *
 *  Field text is converted with atof_quick() when use_quick_m is set,
 *  otherwise with atof().  Rules with an unrecognised action code leave
 *  their slot untouched.  */
void convert_data_inc(action_t *inc, double *dout, char **fptr, int firsttime) {
	int i;
	double *dptr, val=0;
	for (i=0;i<inc->n;i++) {
		/*  'n' only counts rows; it has no field to read  */
		if (inc->action[i]!='n') {
			if (use_quick_m) {
				val  = atof_quick(fptr[inc->src[i]]);
			} else {
				val  = atof(fptr[inc->src[i]]);
			}
		}
		dptr = dout + inc->dst[i];
		switch (inc->action[i]) {
			/*  Store sum  */
			case 'a':
			case 's':
			case '+':
				if (firsttime) {
					*dptr  = val;
				} else {
					*dptr += val;
				}
				break;
			/*  Subtract  */
			case '-':
				if (firsttime) {
					*dptr  = -val;
				} else {
					*dptr -= val;
				}
				break;
			/*  Multiply: the running product starts at the first value
			 *  (not its negation, which flipped the sign of every
			 *  product)  */
			case '*':
				if (firsttime) {
					*dptr  = val;
				} else {
					*dptr *= val;
				}
				break;
			/*  Division: start at the first value, then divide by each
			 *  later one; a zero divisor resets the result to zero  */
			case '/':
				if (firsttime) {
					*dptr  = val;
				} else {
					if (val==0.0)
						*dptr = 0.0;
					else 
						*dptr /= val;
				}
				break;
			/*  Count the rows whose value is non-zero  */
			case 'z':
				if (val!=0.0) val = 1.0;
				if (firsttime) {
					*dptr  = val;
				} else {
					*dptr += val;
				}
				break;
			/*  Store variance (sum of squares)  */
			case 'd':
			case 'e':
				if (firsttime) {
					*dptr  = val*val;
				} else {
					*dptr += val*val;
				}
				break;
			/*  Count rows  */
			case 'n':
				if (firsttime) {
					*dptr = 1.0;
				} else {
					*dptr += 1.0;
				}
				break;
			/*  Store minimum value  */
			case 'm':
				if (firsttime) {
					*dptr = val;
				} else {
					if (val<*dptr) *dptr = val;
				}
				break;
			/*  Store maximum value  */
			case 'x':
				if (firsttime) {
					*dptr = val;
				} else {
					if (val>*dptr) *dptr = val;
				}
				break;
			/*  Store first occurrence  */
			case 'f':
				if (firsttime) *dptr = val;
				break;
			/*  Store last occurrence  */
			case 'l':
				*dptr = val;
				break;
			/*  Unknown action: leave the slot alone  */
			default:
				break;
		}
	}
}


/*  Read configuration for output conversion to obtain data conversions
 *  on input.
 *
 *  out  on entry: the user's request as left by parse_column(), where
 *       src[i] is an input column.  On return: src[i] is the stored
 *       slot to read and dst[i] is the output column (i), and nout is
 *       set.
 *  inc  filled in from scratch: slot 0 always counts rows, followed by
 *       one accumulator per distinct (column, action) pair requested.
 *
 *  Exits with an error when more than MAX_COL accumulators are needed.
 *  NOTE(review): action codes with no case below (for instance 'd')
 *  get no accumulator and keep the src/dst values parse_column() gave
 *  them.  */
void get_inc_from_out (action_t *out, action_t *inc) {
	int i;
	int found;

	/*  By default always count number of occurrences for each key  */
	inc->n = 0;
	inc->src   [ inc->n   ] =   0;
	inc->dst   [ inc->n   ] =   0;
	inc->action[(inc->n)++] = 'n';

	for (i=0;i<out->n;i++) {

		/*  See if combination of out->src[i],out->action[i] is
		 *  already present in inc->src,inc->action.  If so, then we
		 *  don't need to store it again: point this output rule at the
		 *  existing accumulator.  (Skipping the rule without remapping
		 *  it left src/dst holding raw input column numbers, so a
		 *  repeated request read an unwritten slot and landed in the
		 *  wrong output column.)  */
		found = find_action_col(inc, out->src[i], out->action[i]);
		if (FOUND_FALSE != found) {
			out->src[i] = inc->dst[found];
			out->dst[i] = i;
			continue;
		}

		/*  Print error if too many out items */
		if (inc->n==MAX_COL)  {
			fprintf (stderr, 
				"ERROR:  To many out actions specified.  Max allowed is %d\n", MAX_COL);
			exit (1);
		}

		/*  Save out column action in inc  */
		switch (out->action[i]) {
			/*  Row count is always kept in slot 0  */
			case 'n':
				out->src[i] = 0;
				out->dst[i] = i;
				break;
			/*  'Simple' actions: one accumulator each  */
			case 'm':  /*  Minimum  */
			case 'x':  /*  Maximum  */
			case 'a':  /*  Average  */
			case 's':  /*  Sum      */
			case 'f':  /*  First    */
			case 'l':  /*  Last     */
			case 'z':  /*  Zero/One */
			case '+':  /*  Add      */
			case '-':  /*  Subtract */
			case '*':  /*  Multiply */
			case '/':  /*  Divide   */
				inc->action[inc->n] = out->action[i];
				inc->src   [inc->n] = out->src[i];
				inc->dst   [inc->n] = inc->n;
				out->src   [i     ] = inc->dst[inc->n];
				out->dst   [i     ] = i;
				inc->n++;
				break;
		}
	}

	/*  Number of columns to which output conversion writes  */
	out->nout = -1;
	for (i=0;i<out->n;i++) {
		if ( out->nout < out->dst[i] )  
			out->nout = out->dst[i];
	}
	out->nout++;

	/*  Number of slots to which input conversion writes  */
	inc->nout = -1;
	for (i=0;i<inc->n;i++) {
		if ( inc->nout < inc->dst[i] )  
			inc->nout = inc->dst[i];
	}
	inc->nout++;
}


/*  Return the larger of `max` and the greatest value held in the
 *  first a->n entries of a->src[]  */
int get_max_src(action_t *a, int max) {
	int best = max;
	int k = a->n;
	while (k-- > 0) {
		if (a->src[k] > best) {
			best = a->src[k];
		}
	}
	return best;
}


/*  Return the larger of `max` and the greatest value held in the
 *  first a->n entries of a->dst[]  */
int get_max_dst(action_t *a, int max) {
	int best = max;
	int k = a->n;
	while (k-- > 0) {
		if (a->dst[k] > best) {
			best = a->dst[k];
		}
	}
	return best;
}


/*  Build the binary lookup key for one input record.
 *
 *  The key fields listed in `key` are appended to keybuff in order:
 *    's'       the field text followed by a terminating '\0'
 *              (text is truncated if the buffer fills up)
 *    'i','I'   four bytes produced by ip2char() from the field text
 *
 *    keybuff   destination buffer
 *    nkeybuff  receives the number of bytes written to keybuff
 *    fptr      text fields of the current record, indexed by key->src[]
 *    key       list of key fields and their types
 *    nbuflen   size of keybuff in bytes
 *
 *  Prints an error and exits when the buffer cannot hold the key.
 */
void make_key(char *keybuff, int *nkeybuff, char **fptr, action_t *key, int nbuflen) {
	int ikey;
	char *bufend = keybuff + nbuflen;
	char *dst = keybuff;
	char *src;
	for (ikey=0;ikey<key->n;ikey++) {
		/*  Append string to key  */
		if (key->action[ikey]=='s') {
			/*  There must be room for at least the terminator.  Without
			 *  this check a buffer filled by earlier key fields caused
			 *  the '\0' below to be written past the end of keybuff.  */
			if (dst>=bufend) {
				printf ("ERROR:  Key length exceeds maximum (%d)\n", nbuflen);
				exit(1);
			}
			src = fptr[key->src[ikey]];
			while (*src && dst<bufend-1) *(dst++) = *(src++);
			*(dst++) = '\0';
		/*  Append encoded ip address to key  */
		} else if (key->action[ikey]=='i' || key->action[ikey]=='I') {
			/*  Same limit as the former `dst+4>=bufend`, written as a
			 *  difference so no out-of-range pointer is ever formed  */
			if (bufend-dst<=4) {
				printf ("ERROR:  Key length exceeds maximum (%d)\n", nbuflen);
				exit(1);
			}
			ip2char(fptr[key->src[ikey]], dst);
			dst += 4;
		}
	}
	/*  Get length of buffer by subtracting key start from key end  */
	*nkeybuff = (int) (dst - keybuff);
}
#if 0
/*  Disabled variant of make_key(): everything between #if 0 and
 *  #endif is skipped by the preprocessor and never compiled.  It
 *  differs from the active version only in caching the key type and
 *  the field string in the locals keytype and fieldstr.  */
void make_key(char *keybuff, int *nkeybuff, char **fptr, action_t *key, int nbuflen) {
	int ikey;
	int keytype;
	char *fieldstr;
	char *bufend = keybuff + nbuflen;
	char *dst = keybuff;
	char *src;
	for (ikey=0;ikey<key->n;ikey++) {
		/*  Handle different key types  */
		keytype  = key->action[ikey];
		fieldstr = fptr[key->src[ikey]];
		/*  Append string to key  */
		if (keytype=='s') {
			src = fieldstr;
			while (*src && dst<bufend-1) *(dst++) = *(src++);
			*(dst++) = '\0';
		/*  Append encoded ip address to key  */
		} else if (keytype=='i' || keytype=='I') {
			if (dst+4>=bufend) {
				printf ("ERROR:  Key length exceeds maximum (%d)\n", nbuflen);
				exit(1);
			}
			ip2char(fieldstr,dst);
			dst += 4;
		}
	}
	/*  Get length of buffer by subtracting key start from key end  */
	*nkeybuff = dst - keybuff;
}
#endif

/*  Print the key stored in hash element `t` as text on stdout.
 *
 *  The packed key is walked field by field, using the field types in
 *  key_action_m: 's' fields are NUL-terminated strings, 'i' and 'I'
 *  fields are four-byte addresses printed through char2ip() without or
 *  with zero padding.  Fields are separated by the character
 *  `field_sep`; fields of any other type print nothing.
 */
void print_key(helem_t *t, int field_sep) {
	char ipstr[16];
	char *cursor = (char *) t->key;
	int ifield;

	for (ifield=0; ifield<key_action_m.n; ifield++) {
		/*  Separator goes in front of every field but the first  */
		if (ifield>0) printf ("%c", field_sep);

		switch (key_action_m.action[ifield]) {
			/*  String: print it, then step over it and its terminator  */
			case 's':
				printf ("%s",cursor);
				cursor += strlen(cursor)+1;
				break;
			/*  IP address, plain ('i') or zero padded ('I')  */
			case 'i':
			case 'I':
				char2ip (cursor, ipstr, key_action_m.action[ifield]=='I');
				printf ("%s",ipstr);
				cursor += 4;
				break;
			/*  Unknown field types produce no output  */
			default:
				break;
		}
	}
}


/*  Debugging aid: dump an action list to stdout.
 *
 *  Prints the label `msg`, the address of `a` and its entry count,
 *  followed by one line per entry giving its src, dst and action.
 */
void debug_action(char *msg, action_t *a) {
	int i;
	/*  %p requires a void pointer argument, hence the cast  */
	printf ("actionname action_p len: (%s) (%p) (%d)\n", msg, (void *) a, a->n);
	for (i=0;i<a->n;i++)
		printf ("  src dst action %2d %2d %c\n", a->src[i], a->dst[i], a->action[i]);
}

/*  Print the usage text: every string of the NULL-terminated array
 *  Usage_m, one per line, each exactly once.
 *
 *  The first line used to be printed twice (once before the loop and
 *  again by the loop itself), and an empty array handed a NULL pointer
 *  to printf.
 */
void Usage(void) {
	char **ptr;
	for (ptr = Usage_m; *ptr; ptr++) {
		printf ("%s\n", *ptr);
	}
}

/*  Convert string ip address to four character buffer.  Break ip
 *  address on *any* non-digit character (not just '.'), and only read
 *  four octets.  This will not break if improper IP address is entered,
 *  it will just return its best guess.
 *
 *    p    NUL-terminated address text, e.g. "192.168.0.1"
 *    buf  receives exactly four bytes, one per octet
 *
 *  Every separator character starts the next octet; octets that are
 *  missing or empty come out as 0, and parsing stops at the fourth
 *  separator or at the end of the string, whichever comes first.
 *  Octet values above 255 wrap modulo 256.
 */
void ip2char (char *p, char *buf) {
	/*  Work on unsigned bytes so octets above 127 are never treated
	 *  as negative numbers during the arithmetic  */
	unsigned char *octet = (unsigned char *) buf;
	int ioctet = 0;

	/*  Clear all four octets first so a short address does not leave
	 *  stale bytes of an earlier key in the buffer  */
	octet[0] = octet[1] = octet[2] = octet[3] = 0;

	/*  Each character is examined exactly once, so a separator that
	 *  is followed by the terminator or by another separator can
	 *  neither be mistaken for a digit nor push p past the end  */
	for ( ; *p; p++) {
		if (*p>='0' && *p<='9') {
			octet[ioctet] = (unsigned char) (octet[ioctet]*10 + (*p-'0'));
		} else if (++ioctet==4) {
			break;
		}
	}
}

/*  Format the four octets in `buf` as a dotted-decimal address string.
 *
 *    buf  four bytes, one per octet
 *    str  destination; the longest result is 15 characters plus the
 *         terminating NUL
 *    pad  non-zero prints every octet as three digits with leading
 *         zeros, zero prints them without padding
 */
void char2ip (char *buf, char *str, int pad) {
	const unsigned char *octet = (const unsigned char *) buf;
	sprintf (str,
		pad ? "%03hhu.%03hhu.%03hhu.%03hhu" : "%hhu.%hhu.%hhu.%hhu",
		octet[0], octet[1], octet[2], octet[3]);
}
#if 0


/*  Open an input source and record in gp how it must be read.
 *
 *    "-"            read from stdin
 *    name ends .gz  open with gzopen() and allocate a read buffer
 *    anything else  open with fopen()
 *
 *  Prints an error and exits when the file cannot be opened or the
 *  read buffer cannot be allocated.
 */
void gopen (gfile_t *gp, char *filename, char *mode) {
	size_t namelen = strlen(filename);

	/*  Read from standard in  */
	if (!strcmp("-",filename)) {
		gp->filetype = G_STDIN;
		gp->fp      = stdin;

	/*  Read from gzipped file (with .gz suffix).  The length test keeps
	 *  the suffix pointer inside the string for names shorter than
	 *  three characters.  */
	} else if (namelen>=3 && !strcmp(".gz",filename+namelen-3)) {
		gp->filetype = G_GZIP;
		gp->fp  = (void *) gzopen (filename, mode);
		gp->buf = (char *) malloc(G_BUFLEN);
		if (NULL==gp->buf) {
			printf ("ERROR:  Cannot allocate buffer for input file <%s>\n", filename);
			exit(1);
		}
		/*  Initialize cursor, limit pointers to indicate empty buffer */
		gp->cursor = gp->limit = gp->buf;

	/*  Read from normal plain text file */
	} else {
		gp->filetype = G_PLAIN;
		gp->fp = (void *) fopen (filename, mode);
	}

	/*  Test for error  */
	if (NULL==gp->fp) {
			printf ("ERROR:  Cannot open input file <%s>\n", filename);
			exit(1);
	}
}

/*  Close an input source opened with gopen().
 *
 *  Returns the result of gzclose() for gzipped files, of fclose() for
 *  plain files, and 0 for any other file type (stdin is left open).
 *  For gzipped files the read buffer allocated by gopen() is released
 *  as well; the statements that were meant to do this sat behind the
 *  return and freed the file handle instead of the buffer.
 */
int gclose (gfile_t *gp) {
	int rc = 0;
	if (gp->filetype==G_GZIP) {
		rc = gzclose( (gzFile)gp->fp);
		gp->fp = NULL;
		free (gp->buf);
		gp->buf = NULL;
		gp->cursor = gp->limit = NULL;
	} else if (gp->filetype==G_PLAIN) {
		rc = fclose ( (FILE *)gp->fp);
	}
	return rc;
}

/*  Read one line from gp into str, in the manner of fgets().
 *
 *  Plain files and stdin are passed straight to fgets().  For gzipped
 *  files the line is assembled from gp->buf, which is refilled with
 *  gzread() whenever gp->cursor has reached gp->limit.  Unlike fgets()
 *  the gzip path does not store the '\n' in str.
 *
 *    str   destination buffer
 *    nstr  size of str; at most nstr-1 characters are stored
 *    gp    input source
 *
 *  Returns str, or NULL when the gzip input is exhausted before any
 *  character was stored (or whatever fgets() returns on the plain
 *  path).
 */
char *ggets (char *str, int nstr, gfile_t *gp) {
	if (gp->filetype==G_GZIP) {
		char *str_cursor = str;
		char *str_limit  = str + nstr - 1;
		while (1) {
			/*  Load input buffer if empty */
			/*  If gp->cursor equals gp->limit, then the last string from
			 *  the buffer ended at the last legal byte.  Otherwise the
			 *  last string overran the legal buffer  */
			if (gp->cursor>=gp->limit) {
				/*  NOTE(review): gzread() is documented to return -1 on
				 *  error; gp->limit would then point before gp->buf and
				 *  the sentinel below would be written out of bounds --
				 *  confirm how errors are meant to be handled  */
				gp->len    = gzread( (gzFile) gp->fp, gp->buf, G_BUFLEN-1);
				gp->cursor = gp->buf;
				gp->limit  = gp->buf + gp->len;
				/*  Add a '\n' beyond buffer end to indicate buffer overrun.
				 *  At most G_BUFLEN-1 bytes are requested above, leaving
				 *  one byte of room for this sentinel.  */
				*gp->limit = '\n';
			}
			/*  Is buffer empty?  */
			if (gp->cursor==gp->limit) {
				/*  No string stored so return end of file  */
				if (str_cursor==str) {
					return (char *) NULL;
				/*  String is stored so break this loop to return it */
				} else {
					break;
				}
			}
			/*  Read through buffer and copy to str until string fills up  */
			while (*gp->cursor!='\n') {
				/*  Skip this loop if we've filled up str  */
				if (str_cursor==str_limit) break;
				*(str_cursor++) = *(gp->cursor++);
			}
			/* Terminate str (whether we've reached the str or buffer end) */
			*str_cursor = '\0';
			/* Advance through buffer past next '\n'; when str filled up
			 * first, the remainder of the over-long line is discarded.
			 * NOTE(review): strchr() stops at a NUL byte, so a NUL in the
			 * decompressed data ahead of the next '\n' would make this
			 * NULL+1 -- presumably the input is assumed to be text  */
			gp->cursor = strchr(gp->cursor,'\n') + 1;
			/*  Don't read next buffer if not yet at this buffer's end  */
			if (gp->cursor<gp->limit) break;
		}
		/*  Return current un-gzip'ed string  */
		return str;
	} else {
		return fgets(str, nstr, (FILE *) gp->fp);
	}
}
#endif
