#include #include #include #include #include #include #define TEST 0 #define RECLEN 7925 #define FIELDLEN 10 #define FIELDLEN1 FIELDLEN+1 #define MAXRECLEN 8000 #define MAXINDATA 5000 /* This job reads a file of area|year X occupation counts */ /* and calculates adjusted dstats */ /* This program depends on an imsl3 subroutine imsl_f_hypergeometric_cdf(temp1, npersons, ntot1, ntotp); to calculate the hyper-geometric probability of getting temp1 persons out of a total of npersons in this category(occupation), when there are a total of ntot1 of these persons in the population of ntotp persons. Currently it gets this from the cdf function and subtracts from the previous (because I didn't have a good imsl reference while writing the program). This is a little indirect, but since we need the probability for each possibility of temp1 = 0 to npersons, nothing is lost. Because the program loops through each of these possibilities, it can take a long while if npersons gets very large. */ /* At Maryland, to compile: tap imsl3 cc $CFLAGS dstat.c $LINK_CNL_STATIC where the environment variables CFLAGS=-fsingle -w -DANSI -I/afs/glue.umd.edu/software/imsl3/include and LINK_CNL_STATIC= -Bstatic -L/afs/glue.umd.edu/software/imsl3/lib/lib.solaris -L/opt/SUNWspro/SC3.0/lib -limslcmath -limslcstat -lsunmath -Bdynamic -lm -lsocket -lnsl -ldl */ /* Input file is in fields of 10 characters each, one record per occupation: cols 1-10: group id (e.g., 1991 for CPS 1991; the program will calculate dstats when it gets to a new group id or to the end of the input file. cols 11-20: occupation id cols 21-30: number of type 0 (e.g. women) cols 31-40: number of type 1 (e.g. men) cols 41-50: (optional) number of type 2 (e.g. black women) cols 51-60: (optional) number of type 3 (e.g. black men) etc. The program will detect how many types there are from the first input line. It will calculate dstats for each possible pair (= Ntype(Ntype-1)/2 ). */ /* Output of dstats and some explanatory text will be to the screen (or in unix this can be re-directed to a .lst file), and to an output file named dstats.dat (which has no explanatory text) Besides some descriptive statistics (e.g., % of type 0), the program produces 3 dstats: 1. unadjusted (the usual calculation) 2. adjusted in an unrealistic way that allows for fractional people to move between occupations. 3. adjusted in a more realistic way to estimate what proportion of people would have to move to reach a distribution where men's and women's counts differ no more than what would be expected by chance. This assumes fractional people can't change occupations, so if by chance we would expect up to 4.6 women in the occupation, only the number of women over 5 are counted as having to move. */ FILE *openfile (char *pfilename) { FILE *poutfile; if ((poutfile = fopen (pfilename, "a")) == NULL) { fprintf (stdout, "Cannot open %s.\n", pfilename); return NULL; } else { fprintf (stdout, "\nOpening output file %s.\n", pfilename); return poutfile; } } float calc_dstat (char *group, int Nocc, int Nfield, int *Indata, FILE *poutfile) { int iocc, nocc0; int ifield1, ifield2; int *pIndata1, *pIndata2; int ntot1, ntot2, ntotp, ntype1, npersons; int diffint; float pct1, diff, tempdiff, difffloat, mu; float temp1, tempp; float prob, lastprob, cumprob, expect; float dstat_unadjusted, dstat_adjfloat, dstat_adjint; float b; if (TEST) fprintf (stdout, "\ncalculate dstat here\n\n"); for (ifield1=0; ifield1 < (Nfield-1); ++ifield1) { for (ifield2= ifield1+1; ifield2 0) { ++nocc0; ntype1= *pIndata1; mu= pct1 * npersons; diff= ntype1 - mu; if (diff<0) diff= diff * -1.0; dstat_unadjusted= dstat_unadjusted+diff; /* calculate expected difference from equality */ expect= 0. ; lastprob= 0.; for (temp1=0; temp1<=npersons; ++temp1) { if (temp1 < ntot1) { cumprob= imsl_f_hypergeometric_cdf(temp1, npersons, ntot1, ntotp); prob= cumprob - lastprob ; lastprob= cumprob; } else prob= 0.0; tempdiff= 1.0*temp1 - mu; if (tempdiff<0) tempdiff= tempdiff * -1.0; expect= expect + tempdiff*prob; } if (diff > expect) { difffloat= diff - expect; diffint = difffloat + 1.0; } else { difffloat= 0.0; diffint= 0; } if (TEST>1) fprintf (stdout, "%4d %8d %8d %f %f %f %f %d\n", iocc, ntype1, npersons, diff, cumprob, expect, difffloat, diffint); dstat_adjfloat= dstat_adjfloat + difffloat; dstat_adjint= dstat_adjint + diffint; } pIndata1=pIndata1+Nfield; pIndata2=pIndata2+Nfield; } b= 2.0 * pct1 * (1.0-pct1) * (0.+ntot1+ntot2); dstat_unadjusted= dstat_unadjusted / b; dstat_adjfloat= dstat_adjfloat / b; dstat_adjint= dstat_adjint / b; fprintf (stdout, "%s %2d %2d nocc=%4d n1=%7d n2=%7d; pct1=%5.3f; dstat= %5.3f %5.3f %5.3f\n", group, ifield1, ifield2, nocc0, ntot1, ntot2, pct1, dstat_unadjusted, dstat_adjfloat, dstat_adjint); fprintf (poutfile, " %s %2d %2d %4d %8d %8d %7.4f %7.4f %7.4f %7.4f\n", group, ifield1, ifield2, nocc0, ntot1, ntot2, pct1, dstat_unadjusted, dstat_adjfloat, dstat_adjint); } } return; } main() { int Nrec = 0; int Nocc = 0; int Nfield; int Indata [MAXINDATA]; int iocc; int icol, icol2 ; int linelen, ifield; int intest; int nwrite = 0; int *pIndata; char record[MAXRECLEN] ; char group[FIELDLEN1] ; char lastgroup[FIELDLEN1] ; char occ[FIELDLEN1] ; char field[FIELDLEN1] ; char *precord, *pocc, *pgroup, *pfield; char *poutname = "dstats.dat"; FILE *poutfile ; poutfile = openfile (poutname); if (poutfile == NULL) return; while (fgets (record, MAXRECLEN, stdin) != NULL) { ++Nrec; if (TEST) { fprintf (stdout, "%4d ", Nrec); fputs (record, stdout); } linelen= strlen(record) - 21; if (linelen - 10*(linelen/10)) { fprintf (stdout, " Error in input line %d; linelen = %d\n", Nrec, linelen) ; } /* copy first two fields into group and occ IDs */ precord= record; strncpy(group,precord,FIELDLEN); precord= record+FIELDLEN; strncpy(occ,precord,FIELDLEN); /* if new group */ if (strncmp(group, lastgroup, FIELDLEN1) != 0) { if (Nrec > 1) { fprintf (stdout, "Group= %s; Nocc= %d\n", lastgroup, Nocc); calc_dstat (lastgroup, Nocc, Nfield, Indata, poutfile); } /* initialize for next group */ Nfield= linelen / 10; Nocc=0; pIndata= Indata; /* init matrix of data */ strncpy (lastgroup, group, FIELDLEN); } ++Nocc; if (TEST) fprintf (stdout, "%4d %4d %4d %4d %s%s", Nrec,Nocc,linelen,Nfield,group,occ); /* read next field into Indata */ precord= record + FIELDLEN; for (ifield=0; ifield < Nfield; ++ifield) { precord= precord + FIELDLEN; intest= sscanf (precord, "%d", pIndata++); if (TEST) fprintf (stdout, "%10d", *(pIndata-1)); } if (TEST) fprintf (stdout, "\n"); } pIndata=Indata; if (TEST) { for (iocc=0; iocc 1) calc_dstat (lastgroup, Nocc, Nfield, Indata, poutfile); }