#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <time.h>
#include <imsl.h>

#define TEST 0
#define RECLEN 7925
#define FIELDLEN 10
#define FIELDLEN1 FIELDLEN+1
#define MAXRECLEN 8000
#define MAXINDATA 5000

/*     This job reads a file of area|year X occupation counts */
/*      and calculates adjusted dstats */

/*   This program depends on an imsl3 subroutine
    imsl_f_hypergeometric_cdf(temp1, npersons, ntot1, ntotp);
    to calculate the hyper-geometric probability of getting temp1 persons
    out of a total of npersons in this category(occupation), when there are
    a total of ntot1 of these persons in the population of ntotp persons.
    Currently it gets this from the cdf function and subtracts from the
    previous (because I didn't have a good imsl reference while writing the
    program).  This is a little indirect, but since we need the probability
    for each possibility of temp1 = 0 to npersons, nothing is lost.
    Because the program loops through each of these possibilities, it can
    take a long while if npersons gets very large.
*/

/*  At Maryland, to compile:

    tap imsl3
    cc $CFLAGS dstat.c $LINK_CNL_STATIC

    where the environment variables
    CFLAGS=-fsingle -w -DANSI -I/afs/glue.umd.edu/software/imsl3/include
     and
    LINK_CNL_STATIC= -Bstatic 
      -L/afs/glue.umd.edu/software/imsl3/lib/lib.solaris 
      -L/opt/SUNWspro/SC3.0/lib -limslcmath -limslcstat -lsunmath 
      -Bdynamic -lm -lsocket -lnsl -ldl
*/

/*   Input file is in fields of 10 characters each, one record per occupation:
   cols 1-10:  group id (e.g., 1991 for CPS 1991;  
               the program will calculate dstats when it gets to a new
               group id or to the end of the input file.
   cols 11-20: occupation id
   cols 21-30: number of type 0 (e.g. women)
   cols 31-40: number of type 1 (e.g. men)
   cols 41-50: (optional) number of type 2 (e.g. black women)
   cols 51-60: (optional) number of type 3 (e.g. black men)
   etc.
   The program will detect how many types there are from the first input line.
   It will calculate dstats for each possible pair (= Ntype(Ntype-1)/2 ).
*/
/* Output of dstats and some explanatory text will be to the screen 
   (or in unix this can be re-directed to a .lst file),
   and to an output file named dstats.dat (which has no explanatory text) 

   Besides some descriptive statistics (e.g., % of type 0), the program
   produces 3 dstats:  
     1.  unadjusted (the usual calculation)
     2.  adjusted in an unrealistic way that allows for fractional people
         to move between occupations.
     3.  adjusted in a more realistic way to estimate what proportion of
         people would have to move to reach a distribution where men's and
         women's counts differ no more than what would be expected by
         chance.  This assumes fractional people can't change occupations,
         so if by chance we would expect up to 4.6 women in the occupation,
         only the number of women over 5 are counted as having to move.
*/

FILE *openfile (char *pfilename)
	{
	FILE *poutfile;
    if ((poutfile = fopen (pfilename, "a")) == NULL)
			{
			fprintf (stdout, "Cannot open %s.\n", pfilename);
			return NULL;
			}
		else
			{
			fprintf (stdout, "\nOpening output file %s.\n", pfilename);
			return poutfile;
			}
	}


float calc_dstat (char *group, int Nocc, int Nfield, int *Indata, FILE *poutfile)
	{
	int iocc, nocc0;
	int ifield1, ifield2;
	int *pIndata1, *pIndata2;
	int ntot1, ntot2, ntotp, ntype1, npersons;
	int diffint;
	float pct1, diff, tempdiff, difffloat, mu;
	float temp1, tempp;
	float prob, lastprob, cumprob, expect;
	float dstat_unadjusted, dstat_adjfloat, dstat_adjint;
	float b;

	if (TEST) fprintf (stdout, "\ncalculate dstat here\n\n");
	for (ifield1=0; ifield1 < (Nfield-1); ++ifield1)
		{
		for (ifield2= ifield1+1; ifield2<Nfield; ++ifield2)
			{
			/* get totals */
			pIndata1= Indata+ifield1;
			pIndata2= Indata+ifield2;
			ntot1=0;
			ntot2=0;
			for (iocc=0; iocc<Nocc; ++iocc)
				{
				ntot1= ntot1+*pIndata1;
				ntot2= ntot2+*pIndata2;
				pIndata1=pIndata1+Nfield;
				pIndata2=pIndata2+Nfield;
				}
			ntotp= ntot1+ntot2;
			pct1= (1.0*ntot1) / (1.0*ntotp);


			pIndata1= Indata+ifield1;
			pIndata2= Indata+ifield2;
			dstat_unadjusted= 0.0;
			dstat_adjfloat= 0.0;
			dstat_adjint= 0.0;
			nocc0=0;
			for (iocc=0; iocc<Nocc; ++iocc)
				{
				npersons= *pIndata1 + *pIndata2;
				if (npersons > 0)
					{
					++nocc0;
					ntype1= *pIndata1;
					mu= pct1 * npersons;
					diff= ntype1 - mu;
					if (diff<0) diff= diff * -1.0;
					dstat_unadjusted= dstat_unadjusted+diff;
				
					/* calculate expected difference from equality */
					expect= 0. ;
					lastprob= 0.; 
					for (temp1=0; temp1<=npersons; ++temp1)
						{
						if (temp1 < ntot1) 
							{
							cumprob= imsl_f_hypergeometric_cdf(temp1, npersons, ntot1, ntotp);
							prob= cumprob - lastprob ; 
							lastprob= cumprob;
							}
						else
							prob= 0.0;
						tempdiff= 1.0*temp1 - mu;
						if (tempdiff<0) tempdiff= tempdiff * -1.0;
						expect= expect + tempdiff*prob;
						}

					if (diff > expect)
						{
						difffloat= diff - expect;
						diffint  = difffloat + 1.0;
						}
					else
						{
						difffloat= 0.0;
						diffint=  0;
						}
					if (TEST>1) fprintf (stdout, "%4d %8d %8d %f %f %f %f %d\n", iocc, ntype1, npersons, diff, cumprob, expect, difffloat, diffint);
					
					dstat_adjfloat= dstat_adjfloat + difffloat;
					dstat_adjint=   dstat_adjint + diffint;
					}
				pIndata1=pIndata1+Nfield;
				pIndata2=pIndata2+Nfield;
				}
			b= 2.0 * pct1 * (1.0-pct1) * (0.+ntot1+ntot2);
			dstat_unadjusted= dstat_unadjusted / b;
			dstat_adjfloat= dstat_adjfloat / b;
			dstat_adjint= dstat_adjint / b;
			fprintf (stdout,   "%s %2d %2d nocc=%4d n1=%7d n2=%7d; pct1=%5.3f; dstat= %5.3f %5.3f %5.3f\n", group, ifield1, ifield2, nocc0, ntot1, ntot2, pct1, dstat_unadjusted, dstat_adjfloat, dstat_adjint);
			fprintf (poutfile, " %s %2d %2d %4d %8d %8d %7.4f %7.4f %7.4f %7.4f\n", group, ifield1, ifield2, nocc0, ntot1, ntot2, pct1, dstat_unadjusted, dstat_adjfloat, dstat_adjint);
			}

		}

	return;
	}

main()
	{
	int	Nrec = 0;
	int Nocc = 0;
	int Nfield;
	int Indata [MAXINDATA];
	int iocc;
	int icol, icol2 ;
	int linelen, ifield;
	int intest;
	int	nwrite = 0;
	int *pIndata;
	char	record[MAXRECLEN] ;
	char	group[FIELDLEN1] ;
	char	lastgroup[FIELDLEN1] ;
	char	occ[FIELDLEN1] ;
	char	field[FIELDLEN1] ;
	char    *precord, *pocc, *pgroup, *pfield;
	
	char	*poutname = "dstats.dat";
	FILE	*poutfile ;
	poutfile = openfile (poutname);
	if (poutfile == NULL) return;


	while (fgets (record, MAXRECLEN, stdin) != NULL)
		{
		++Nrec;
		if (TEST)
			{
			fprintf (stdout, "%4d                ", Nrec);
			fputs (record, stdout);
			}

		linelen= strlen(record) - 21;
		if (linelen - 10*(linelen/10)) 
			{
			fprintf (stdout, " Error in input line %d; linelen = %d\n", Nrec, linelen) ;
			}

		/* copy first two fields into group and occ IDs */
		precord= record; 
		strncpy(group,precord,FIELDLEN);
		precord= record+FIELDLEN;
		strncpy(occ,precord,FIELDLEN);

		/* if new group */
		if (strncmp(group, lastgroup, FIELDLEN1) != 0)
			{
			if (Nrec > 1) 
				{
				fprintf (stdout, "Group= %s; Nocc= %d\n", lastgroup, Nocc);
				calc_dstat (lastgroup, Nocc, Nfield, Indata, poutfile);
				}
			/* initialize for next group */
			Nfield= linelen / 10;
			Nocc=0;
			pIndata= Indata; /* init matrix of data */
			strncpy (lastgroup, group, FIELDLEN);
			}

		++Nocc;
		if (TEST)
			fprintf (stdout, "%4d %4d %4d %4d %s%s", Nrec,Nocc,linelen,Nfield,group,occ);
		/* read next field into Indata */
		precord= record + FIELDLEN;
		for (ifield=0; ifield < Nfield; ++ifield) 
			{
			precord= precord + FIELDLEN;
			intest= sscanf (precord, "%d", pIndata++);
			if (TEST) fprintf (stdout, "%10d", *(pIndata-1));
			}
		if (TEST) fprintf (stdout, "\n");
		}
		pIndata=Indata;
		if (TEST)
			{
			for (iocc=0; iocc<Nocc; ++iocc)
				{
				fprintf (stdout, "\n");
				for (ifield=0; ifield<Nfield; ++ifield)
					{
					fprintf (stdout, "%10d", *pIndata);
					++pIndata;
					}
			}
		}
		fprintf (stdout, "Group= %s; Nocc= %d\n", lastgroup, Nocc);
		if (Nrec > 1) calc_dstat (lastgroup, Nocc, Nfield, Indata, poutfile);
	}