######################################################################## ## Fri Sep 16 13:51:23 PDT 2011 ## Socsim is a venerable old micro simulation program developed in ## this department in the late 20th Century. The "Micro" in ## micro-simulation means that Socsim invents individuals; and makes ## all kinds of things happen to them such as birth marriage and death. In ## contrast to the Leslie Matrix simulation that we played with last ## week, Socsim produces HUGE amounts of "data". ##The Marriage Squeeze: In a population where (1) the ##typical marriage is between a slightly older male and a slightly ##smarter female and (2) where fertility is low (high), the marriage ##market will become catawampus. Low (high) fertility implies that ##cohort size will decline (increase) over time and if marriage ##partners generally have an age difference, the older partner will be ##in greater (lesser) supply .. ## Slovakia in the second half of the 20th Century met most of the ## criteria defined above for a marriage squeeze (by which I mean, it ## had a fertility decline). The data that you are going to work with ## today is based on a quick and dirty investigation of the 20th ## Century Slovakian Marriage Squeeze. ######################################################################## ## Read output population (opop) file and assign names to columns ######################################################################## ## It is important that we not store moderate sized data files in our ## home directories. To that end, the data for this exercise is in: ## /data/commons/carlm/MarriageSqueeze/SimResults. NOTE that ## /data/commons/... does NOT begin with "~" ## In order to avoid typing this long pathname more than once, we ## shall avail ourselves yet again of one the 12 most important Unix ## commands. To keep things tidy, we will use R's system() function ## to pass this Unix command to the shell for execution. ## make a symbolic link to the MarriageSqueeze directory. The "." in ## this command means that the link will be in whatever directory R ## was launched from. (Note that the second time this command runs, ## you'll get an error message telling you that the "file" (which is ## really a symbolic link) cannot be created because it already ## exists. You can ignore this cowardly message. system("ln -s /data/commons/carlm/MarriageSqueeze .") ## Let's verify that there are some file in the SimResults subdirectory system("ls MarriageSqueeze/SimResults") ## HEADS UP -- if you rsync this to your own machine, you'll need to ## separately rsync the MarriageSqueeze directory (but only once since ## it is not going to change) AND on MAC you will need to fiddle the ## ln command to point to the right place on your mac. OR on a WINDOWS ## machine, you will need to position the MarriageSqueeze directory so ## as to avoid the need for symbolic links entirely since windows does ## not do them. ## read.table() reads an array of data into a dataframe opop<-read.table(file="MarriageSqueeze/SimResults/squeeze.opop", header=F,as.is=T) dim(opop) ## rows and columns? ## assign names to columns names(opop)<-c("pid","fem","group", "nev","dob","mom","pop","nesibm","nesibp", "lborn","marid","mstat","dod","fmult") ## Data Dictionary ## pid - person id ## fem - 1 if female ## group - group identifier always 1 in this simulation ## nev - next scheduled event ## dob - date of birth integer month number ## mom - pid of mother ## pop - pid of father ## nesibm - pid of next eldest sibling through mother ## nesibp - pid of next eldest sibling through father ## lborn - pid of last born child ## marid - id of marriage in .omar file ## mstat - marital status at end of simulation integer 1=single;2=divorced; ## 3=widowed; 4=married ## dod - date of death or 0 if alive at end of simulation ## fmult - fertility multiplier ## handy for selecting by pid and safer than relying on pid==row, a ## condition that is generally true but not guaranteed to always obtain. rownames(opop)<-opop$pid ############################################# ## Let's see what's in opop ############################################# is.data.frame(opop) ## data frame? names(opop) ## what are the column names length(unique(opop$pid)) ; nrow(opop) ## one unique pid per record ## (row) mean(opop$fem) ## percent female range(opop$dob) ## range of dates of birth -- reported in socsim "months" ## use cut() and table() to plot birth cohort size by year of birth ## digression on cut(): ## cut converts a continuous variable into a discrete (categorical) ## one (a "factor" in R speak) by assigning it an integer number and a ## coresponding label. The breaks arguement determines how the ## intervals are to be constructed. At it's simplest, breaks=number, ## cut will use evenly spaced intervals cjunk<-cut (rnorm(10),breaks=2) cjunk cbind(as.numeric(cjunk),as.character(cjunk)) ## end of digression ## ## cut() and table() work beautifully together plot(table(cut(opop$dob,breaks= (b<-(max(opop$dob) -min(opop$dob))/12))), col=topo.colors(b)) ######################################################################## ## Converting socsim months to earth time ######################################################################## ## Working with Socsim months is clunky. with some constants and ## functions we can simplify this considerably FinalSimYear<-2028 ## simulation use 2008 rates for additional 20years EndMo<- max(c(opop$dob,opop$dod))## 3542 last month of simulation simDuration<- (57+20)*12 ## period of fertility decline. The ## simulation begins much earlier than this ## so that a population with kinship and ## stable age structure is established BEFORE ## we let fertility decline. Jan51<- EndMo - simDuration +1 ## first month of fertility decline ######################################################################## ## A function that will convert a socsim months ######################################################################## asYr<-function(x,Emo=EndMo,FSYear=FinalSimYear){ ## convert sim month to a real calendar year ## especially handy for graphing but also for calculations ## if(is.na(Emo)){stop("Emo/EndMo missing")} if(is.na(FSYear)){stop("FSYear/FinalSimYear missing")} yr<-ifelse(x==Emo,FSYear, FSYear - trunc((Emo -x )/12)) return(yr) } asYr(1234) ## Using asYr() in place of cut() we can have a much more informative plot plot(table(byr<-asYr(opop$dob)), col=terrain.colors(length(unique(byr)))) title("Size of birth cohorts by year of birth") ######################################################################## ## EXAMPLE create a 3 column dataframe wherein the first column holds ## a calendar year, and the second and third columns hold the number ## of females and males who are alive in that year of the ## simulation. A person is alive if their year of birth is equal to or ## BEFORE the given year, AND their year of death is equal to or AFTER ## -- OR if their month of death is zero -- which is that case for ## those who are alive at the end of the simulation. ## Using a loop res<-NULL for(yr in sort(unique(asYr(opop$dob)))){ ## alive<-ifelse( (asYr(opop$dob) <= yr) & ( (asYr(opop$dod) >= yr) | opop$dod==0), TRUE, FALSE) res<-rbind(res, c(yr,sum(alive & opop$fem),sum(alive & (! opop$fem)))) } colnames(res) <- c("yr","liveFem","liveMal") slovak0<-data.frame(res) ## add a column of total population slovak0$livePop<-slovak0$liveFem+slovak0$liveMal names(slovak0) matplot(x=slovak0$yr,y=slovak0[,c(2,4)],type='l',lty=2,lwd=5) ## or ... matplot(x=slovak0$yr,y=slovak0[,c(2,4)],type='h',lty=c(2,4),lwd=c(1,5),axes=F) ### QUESTION 1 -- Use the same approach as in the example above to create ### a dataframe that holds the numbers of living male and female ### simulated Slovaks that are both alive AND within their ### marriage-prone years between say 18 and 30 ## Question 2 -- Use the same approach as we did in the example above ## to find the average length of life by single year of birth. -- ## HeadsUp -- if you don't exclude those who were still alive at the ## end of the simulation, (opop$dod == 0) you will find observations ## with negative lifetimes. ## QUESTION 3 Did you find that e0 is declining in Slovakia since ## about 1940 ? if not, check you work, if so, WTF? ## EXAMPLE -- Use tapply to find the same -- average lenght of ## lifetime as you did using the loopy approach sel<-opop$dod !=0 e0<-tapply(opop$dod[sel] - opop$dob[sel], asYr(opop$dob[sel]), mean)/12 plot(e0) ##or plot(e0~as.numeric(names(e0)),type='o',lwd=5:30,cex=1:30,col=rainbow(9), axes='n') ## QUESTION 4-- Use tapply() as in the example above to find the ## probability of death by age 20 for each birth cohort -- Don't forget ## to remove cohorts born 2008 and beyond from the calculation ######################################################################## ## The output marriage file ######################################################################## ## Marriage records are stored in a separate file with id numbers that ## point back to the opop file omar<-read.table(file="MarriageSqueeze/SimResults/squeeze.omar", header=F,as.is=T) names(omar)<-c("mid","wpid","hpid","dstart","dend", "rend","wprior","hprior") ############################################# ## Data dictionary ############################################# ## mid = unique marriage id ## wpid = pid or wife (in output population file ) ## hpid = pid of husband ## dstart = month marriage begins ## dend = month marriage ends ## rend = reason marriage ends ## wprior = mid of wife's prior marriage if any ## hprior = mid of husband's prior marrage if any. rownames(omar)<-omar$mid ## in case the order of the rows changes is.data.frame(omar) names(omar) ## Example: ## Create a dataframe wherein column 1 holds a year of the simulation ## and column 2 holds the number of marriages executed in that year. res<-NULL for(myr in sort(unique(asYr(omar$dstart)))){ mcount<-sum( asYr(omar$dstart) == myr) res<-rbind(res,c(myr,mcount)) } mdat<-data.frame(res) names(mdat)<-c("yr","mcount") plot(mdat$mcount,type='h') ## QUESTION 5 draw a plot equivalent to the one above without using ## a loop ## EXAMPLE ## Create a dataframe that holds all the average age at marriage ## of males and females for each year of the simulation between 1951 ## and 2028 res<-NULL for (myr in 1951:2028){ sel<-asYr(omar$dstart) == myr xyM<- mean(omar$dstart[sel] - opop[omar$hpid[sel],"dob"]) xxM<- mean(omar$dstart[sel] - opop[omar$wpid[sel],"dob"]) res<-rbind(res,c(myr,xyM/12,xxM/12)) } Mage<-data.frame(res) names(Mage)<-c("yr","maleAge","femaleAge") plot(Mage$maleAge~Mage$femaleAge,type='n') text(y=Mage$maleAge,x=Mage$femaleAge,label=Mage$yr,cex=.8) ## Question 6 -- Using the same approach as in the example, create a ## datafarem that hold the average age at marriage of males and ## females by YEAR of BIRTH that is by birth cohort rather than year ## of marraige. use years 1940:1990 rather ### END ## ######################################################################## ## ## Frustration Management ## ######################################################################## ## ### QUESTION 1### ## res<-NULL ## for(yr in sort(unique(asYr(opop$dob)))){ ## ## ## alive<-ifelse( (asYr(opop$dob) <= yr) & ## ( (asYr(opop$dod) >= yr) | opop$dod==0), ## TRUE, ## FALSE) ## ## a logical vector whether or not marriage prone. ## mgable<- (yr - asYr(opop$dob)) %in% 18:30 ## res<-rbind(res, ## c(yr,sum(alive & opop$fem & mgable), ## sum(alive & (! opop$fem) &mgable))) ## } ## colnames(res) <- c("yr","MgblFem","MgblMal") ## slovak1<-data.frame(res) ## names(slovak1) ## matplot(x=slovak1$yr,y=slovak1[,2:3],type='l',lty=2,lwd=2) ## legend(x="topleft",fill=1:2,legend=c("Fem","Male")) ## ######################################################################## ## ## Frustration management ## ######################################################################## ## Question 2 ## res<-NULL ## for( byear in sort(unique(asYr(opop$dob)))){ ## ltimes<-ifelse( (asYr(opop$dob)==byear) & opop$dod !=0, ## opop$dod - opop$dob, NA) ## res<-rbind(res, ## c(byear,mean(ltimes,na.rm=TRUE))) ## } ## head(res) ## LtimesC<-data.frame(res) ## names(LtimesC) ## names(LtimesC)<-c("yr","e0") ## plot(e0 ~ yr,data=LtimesC) ## text(x=1900,y=400,label="?",cex=100) ## ######################################################################## ## ## Frustration Management ## ## Question 4 ## sel<-asYr(opop$dob) < 2008 ## p20<-tapply(opop$dod[sel] - opop$dob[sel] > (20*12), ## asYr(opop$dob[sel]), ## mean)/12 ## plot(p20~ as.numeric(names(p20)),type='s') ## ######################################################################## ## ### Frustration Management ## ##Question 5 ## plot(table(asYr(omar$dstart)),type='h') ## ######################################################################## ## ## Frustration Management ## ## ## ## Question 6 ## res<-NULL ## for (myr in 1940:1990){ ## selM<- asYr(opop[omar$hpid,"dob"]) == myr ## selF<- asYr(opop[omar$wpid,"dob"]) == myr ## xyM<- mean(omar$dstart[selM] - opop[omar$hpid[selM],"dob"]) ## xxM<- mean(omar$dstart[selF] - opop[omar$wpid[selF],"dob"]) ## res<-rbind(res,c(myr,xyM/12,xxM/12)) ## } ## MageC<-data.frame(res) ## names(MageC)<-c("yr","maleAge","femaleAge") ## plot(MageC$maleAge~MageC$femaleAge,type='n') ## text(y=MageC$maleAge,x=MageC$femaleAge,label=MageC$yr,cex=.8) # LocalWords: dataframe opop MarriageSqueeze nev nesibp lborn mstat fmult