## Emi Masaki and Dan Williams
## Demography 211--Homework 10

## NOTE: this script was first written with the S-PLUS underscore
## assignment (x_value).  R no longer accepts `_` as an assignment operator
## and treats it as part of a name, so assignments are written with `<-`.
## Pasted console output is kept, but as `##` comments so the file parses.

## Question 1
## Read the survey file; its first row holds the column names.
pcdata <- read.table("~/211/HW10/computer.txt", header = TRUE)
pcdata
## attach() is kept on purpose: the later questions refer to the columns
## (STRATUM, COMPUTER, INCOME, GRINCOME, MEN, SIZE) by their bare names.
attach(pcdata)
dim(pcdata)

## A. Percent owning a computer:
## Column 8 appears to be the 0/1 COMPUTER indicator (TODO confirm), so its
## sum counts the owners.  The denominator is the number of sampled
## households rather than a hard-coded 400.
percentown <- sum(pcdata[, 8]) / nrow(pcdata)
percentown
## [1] 0.605

## B. Percent of majority vs minority computer ownership:
## Columns 3 and 8 appear to be STRATUM (1 = majority, 2 = minority) and
## COMPUTER; the table() cross-check further down gives the same
## proportions.
majtable <- cbind(pcdata[, 3], pcdata[, 8])
in.majority <- majtable[, 1] == 1
majsampled <- sum(in.majority)
majwithcomp <- sum(in.majority & majtable[, 2] == 1)
majwithcomp / majsampled
## [1] 0.7012195
## vs.
mintable <- majtable  # same two columns; kept under its original name
in.minority <- mintable[, 1] == 2
minsampled <- sum(in.minority)
minwithcomp <- sum(in.minority & mintable[, 2] == 1)
minwithcomp / minsampled
## [1] 0.1666667

## OR AN EASIER WAY WE FIGURED OUT TOO LATE IS:
table(STRATUM, COMPUTER)
##        COMPUTER
## STRATUM   0   1
##       1  98 230
##       2  60  12
230 / (230 + 98)
## [1] 0.7012195
12 / 72
## [1] 0.1666667

## C. Proportion of computer owners within the five income classes:
## First create the classes.
## NOTE(review): cut() uses right-closed intervals (a, b] by default, so an
## income of exactly 10000 is labelled "$0-9999" and an income of 0 becomes
## NA.  Pass right = FALSE if the labels are meant literally; the default is
## kept here so that the recorded output below still applies.
newincome <- cut(
  pcdata$INCOME,
  c(0, 10000, 20000, 30000, 50000, 200000),
  labels = c("$0-9999", "$10000-19999", "$20000-29999", "$30000-49999",
             "$50000+")
)
pcdata.cut <- cbind(newincome, pcdata)
pcdata.cut
## Now find the percentage (which we can do since the variable is
## either one or zero):
totalvec <- tapply(COMPUTER, newincome, mean)
## > tapply(COMPUTER,newincome,mean)
##      $0-9999 $10000-19999 $20000-29999 $30000-49999      $50000+
##    0.1111111    0.2807018    0.5949367    0.7159763    0.9152542

## D. Proportion of owners by income class (Group.1) within each stratum
## (Group.2):
table1 <- aggregate(
  pcdata.cut$COMPUTER,
  list(pcdata.cut$newincome, pcdata.cut$STRATUM),
  mean
)
##         Group.1 Group.2          x
## 1       $0-9999       1 0.40000000
## 2  $10000-19999       1 0.36363636
## 3  $20000-29999       1 0.61333333
## 4  $30000-49999       1 0.74213836
## 5       $50000+       1 0.92857143
## 6       $0-9999       2 0.06451613
## 7  $10000-19999       2 0.16666667
## 8  $20000-29999       2 0.25000000
## 9  $30000-49999       2 0.30000000
## 10      $50000+       2 0.66666667

## Question 2
## (John, we originally did this the long way and are now
## just going to use the calculations instead of writing code, just to
## save time...)
## A.
## (Question 2, part A, continued.)  Odds = p / (1 - p), using the counts
## from the STRATUM by COMPUTER table.
## For majority households, the odds are
(230 / (230 + 98)) / (1 - (230 / (230 + 98)))
## [1] 2.346939 to one of owning a pc
## For minority households, the odds are
(12 / 72) / (1 - (12 / 72))
## [1] 0.2 to one of owning a pc

## B.
## The relative odds of computer ownership for minority vs majority
## households:
.2 / 2.346939
## [1] 0.08521738

## C.
## The odds of pc ownership for households in the lowest and highest
## income classes:
table(GRINCOME, COMPUTER)
##         COMPUTER
## GRINCOME   0   1
##        1  32   4
##        2  41  16
##        3  32  47
##        4  48 121
##        5   5  54
## Lowest:
(4 / 36) / (1 - (4 / 36))
## [1] 0.125
## Highest:
(54 / 59) / (1 - (54 / 59))
## [1] 10.8

## D. Relative odds of pc ownership in highest vs lowest:
10.8 / .125
## [1] 86.4

## Question 3.
## A.
## Build the 3 x 6 table of ownership proportions (rows: Total, Majority,
## Minority; columns: Total plus the five income classes), then decompose
## the log-odds of the 2 x 5 interior as
##   log-odds[i, j] = mu + alpha[i] + beta[j] + residual[i, j].
table1
## aggregate() lists the five income classes of stratum 1 first and then
## those of stratum 2, so filling by row gives one row per stratum.
table3 <- matrix(table1$x, nrow = 2, byrow = TRUE)
rowtotal <- rbind(totalvec, table3)
rowtotal
## Marginal proportions in the same order as the rows: overall, majority,
## minority.  (They were previously entered as majority, minority, overall,
## which put each value against the wrong row label.)
Total <- c(.6050000, .7012195, .1666667)
coltotal <- cbind(Total, rowtotal)
table3a <- coltotal
table3a
rowname <- c("Total", "Majority", "Minority")
colname <- c("Total", "$0-9999", "$10000-19999", "$20000-29999",
             "$30000-49999", "$50000+")
dimnames(table3a) <- list(rowname, colname)
table3a
##              Total    $0-9999 $10000-19999 $20000-29999 $30000-49999   $50000+
## Total    0.6050000 0.11111111    0.2807018    0.5949367    0.7159763 0.9152542
## Majority 0.7012195 0.40000000    0.3636364    0.6133333    0.7421384 0.9285714
## Minority 0.1666667 0.06451613    0.1666667    0.2500000    0.3000000 0.6666667

lnodds <- log(table3a / (1 - table3a))
lnodds
## Keep only the stratum-by-income interior (drop the Total row and column).
tabletwo <- table3a[2:3, 2:6]
tabletwo
lnoddstwo <- log(tabletwo / (1 - tabletwo))
lnoddstwo
mu3a <- mean(lnoddstwo)
mu3a
## Row and column means of the log-odds; unname() keeps them as plain
## unnamed vectors, as the original element-by-element loops produced.
rowmean3a <- unname(rowMeans(lnoddstwo))
colmean3a <- unname(colMeans(lnoddstwo))
lnoddstwo
rowmean3a
colmean3a
alpha3a <- rowmean3a - mu3a
alpha3a
beta3a <- colmean3a - mu3a
beta3a
## Expand the effects to 2 x 5 so they can be subtracted cell by cell:
## alpha is constant along each row, beta along each column.
alpha.matrix <- matrix(alpha3a, nrow = 2, ncol = 5, byrow = FALSE)
alpha.matrix
beta.matrix <- matrix(beta3a, nrow = 2, ncol = 5, byrow = TRUE)
beta.matrix
resid.matrix3a <- lnoddstwo - mu3a - alpha.matrix - beta.matrix
resid.matrix3a
##             $0-9999 $10000-19999 $20000-29999 $30000-49999     $50000+
## Majority  0.2688742   -0.3405565  -0.08548868    0.0867376  0.07043348
## Minority -0.2688742    0.3405565   0.08548868   -0.0867376 -0.07043348

## B.
## Re-express the effects relative to the first level of each factor by
## subtracting the first element (first stratum row, first income class).
alpha3b.adj <- alpha3a - alpha3a[1]
beta3b.adj <- beta3a - beta3a[1]
alpha3b.adj
beta3b.adj
## The adjusted coefficients:
## > alpha3b.adj
## [1]  0.000000 -1.730935
## > beta3b.adj
## [1] 0.000000 0.455280 1.221174 1.644714 3.168855
## The coefficient for the 'minority' variable is -1.73 on the log-odds
## scale.  Holding income level constant, the odds of owning a computer
## for minority households are therefore exp(-1.73), about 0.18 times,
## the odds for majority households.

## C.
## Income effects against the approximate midpoint of each income class.
plot(beta3b.adj, c(5000, 15000, 25000, 40000, 75000))
dev.print()
## The attached plot is very close to linear in the semilog scale,
## indicating that the income effect is linear.

## Question 4
## Fit the same additive model by least squares.  c() unrolls the 2 x 5
## log-odds matrix column by column, so the rows of zij alternate between
## the two strata within each income class.
zij <- c(lnoddstwo)
zij
## 10 x 5 dummy matrix: column 1 marks the second (minority) stratum,
## columns 2-5 mark income classes 2-5.  The first stratum and the first
## income class are the reference levels.
design.matrix <- cbind(
  rep(c(0, 1), times = 5),
  diag(5)[rep(1:5, each = 2), -1]
)
design.matrix
lsfit(design.matrix, zij)
## $coefficients
##  Intercept         X1         X2         X3         X4         X5
## -0.6743393 -1.7309352  0.4552800  1.2211735  1.6447142  3.1688551
## compared to the coefficients in Question 3:
beta3b.adj
## [1] 0.000000 0.455280 1.221174 1.644714 3.168855
alpha3b.adj
## [1]  0.000000 -1.730935
## The coefficients match exactly.

## Question 5
## A.
## 0/1 dummy: STRATUM 1 -> 0, STRATUM 2 -> 1.
stratum.dum <- STRATUM - 1
logitfit <- glm(COMPUTER ~ INCOME + stratum.dum,
                family = binomial(link = logit))
summary(logitfit)
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.093e+00  3.212e-01  -3.402 0.000669 ***
## INCOME       5.630e-05  9.077e-06   6.202 5.57e-10 ***
## stratum.dum -1.602e+00  3.742e-01  -4.282 1.85e-05 ***
## ---
## Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
## Both variables are significant at alpha = 0.001.
## A one dollar increase in income, holding stratum constant, multiplies
## the predicted odds of computer ownership by exp(5.630e-05).  Holding
## income constant, the predicted odds for minority households are
## exp(-1.602e+00), about 0.20 times, the odds for majority households.

## B.
## (Question 5, part B.)  Extra covariates: income squared, the ratio
## MEN / SIZE (called 'Percent male' below), and SIZE itself.
incometwo <- INCOME^2
malevec <- MEN / SIZE
logitfitb <- glm(COMPUTER ~ INCOME + (stratum.dum) + incometwo + malevec + SIZE,
                 family = binomial(link = logit))
summary(logitfitb)
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.106e+00  7.093e-01  -1.560    0.119
## INCOME       3.463e-05  3.260e-05   1.062    0.288
## stratum.dum -1.599e+00  3.964e-01  -4.035 5.45e-05 ***
## incometwo    2.258e-10  4.694e-10   0.481    0.630
## malevec      1.340e+00  1.019e+00   1.315    0.188
## SIZE        -4.495e-02  6.290e-02  -0.715    0.475
## ---
## Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1

## Second model: keep only 'Percent male' from the added variables.
logitfitc <- glm(COMPUTER ~ INCOME + (stratum.dum) + malevec,
                 family = binomial(link = logit))
summary(logitfitc)
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.500e+00  4.604e-01  -3.257  0.00113 **
## INCOME       5.103e-05  9.885e-06   5.162 2.44e-07 ***
## stratum.dum -1.570e+00  3.722e-01  -4.219 2.45e-05 ***
## malevec      1.228e+00  9.871e-01   1.244  0.21336
## ---
## Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1

## Interpretation of 5b: In the first model only stratum is still a
## statistically significant predictor of computer ownership (possibly
## due to an interaction between Income and one of the added variables).
## The additional variables 'Income squared' and 'Size' do not add
## explanatory power, as their p-values are not significant and their
## z-values are close to zero.  'Percent male', with a p-value of .188
## (z = 1.315), is included in a second model, but that glm gives it a
## p-value of .213, which is not statistically significant.  It appears
## that Income and Stratum are the best independent variables to include
## in the model.

## Question 6
## Linear probability model: OLS on the 0/1 outcome with the same two
## covariates as the logistic model in 5a.
stratum.dum <- STRATUM - 1
ols6a <- lm(COMPUTER ~ INCOME + (stratum.dum))
ols6a
summary(ols6a)
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  3.658e-01  5.456e-02   6.705 6.95e-11 ***
## INCOME       9.003e-06  1.325e-06   6.797 3.93e-11 ***
## stratum.dum -3.361e-01  6.213e-02  -5.410 1.09e-07 ***
## ---
## Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1

## A.
## (Question 6, part A.)
## The income and stratum coefficients are statistically significant
## at alpha=.001.  An increase in income of one dollar (holding stratum
## constant) will increase the predicted probability of computer
## ownership by 9.003e-06.  Holding income constant, the predicted
## probability of ownership for minority households is 0.3361 lower than
## for majority households (the coefficient -3.361e-01 is an additive
## difference in probability, not a multiplicative factor).

## Values at which both fitted models are evaluated: five income levels
## (5000 ... 75000) crossed with the stratum dummy (0 and 1).
income.mid <- c(5000, 15000, 25000, 40000, 75000)
stratum.ind <- c(0, 1)

## B.
## OLS predictions from the rounded coefficients reported above.
## outer() evaluates the fitted line at every stratum/income pair:
## rows follow the stratum dummy, columns follow income.
matrix.yhat <- outer(
  stratum.ind, income.mid,
  function(minority, income) .3658 + 9.003e-06 * income - 3.361e-01 * minority
)
matrix.yhat
## [1,] 0.410815 0.500845 0.590875 0.72592 1.041025
## [2,] 0.074715 0.164745 0.254775 0.38982 0.704925

## C.
## Logistic predictions from the coefficients of the 5a fit:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.093e+00  3.212e-01  -3.402 0.000669 ***
## INCOME       5.630e-05  9.077e-06   6.202 5.57e-10 ***
## stratum.dum -1.602e+00  3.742e-01  -4.282 1.85e-05 ***
## The linear predictor is unrolled column by column, so zvector runs
## (dummy 0, dummy 1) within each income level in turn.
zvector <- c(outer(
  stratum.ind, income.mid,
  function(minority, income) -1.093 + 5.63e-05 * income - 1.602 * minority
))
zvector[1]
zvector
## plogis() is the inverse logit, exp(z) / (1 + exp(z)).
pvector <- plogis(zvector)
pvector
matrix.phat <- matrix(pvector, nrow = 2, ncol = 5, byrow = FALSE)
(matrix.phat)
## > (matrix.phat)
## Income Class:   [,1]      [,2]      [,3]      [,4]      [,5]
## Majority  0.30757095 0.4381927 0.5779833 0.7611510 0.9580933
## Minority  0.08214903 0.1358142 0.2162763 0.3910264 0.8216402

## D.
## (Question 6, part D.)
## Using the OLS regression model from 6a, some of the predicted
## probabilities are greater than 1 (a linear model's predictions are
## unbounded), which theoretically and practically cannot be.  The
## logistic model should be used to predict probabilities, because the
## predicted values must lie between zero and one.

## Question 7
## A.
## The relative odds of computer ownership for minority vs majority
## households:
## .2/2.346939
## [1] 0.08521738

## B.
## The relative odds based on the 2-way table model:
relodds.twoway <- exp(-1.730935)
relodds.twoway
## [1] 0.1771187

## C.
## The relative odds based on the logistic regression model with 2
## covariates:
relodds.logit <- exp(-1.602)
relodds.logit
## [1] 0.2014931

## The estimated odds ratio from (a) does not adjust for the confounding
## effect of income, so it is not a precise estimate.  The estimated odds
## ratio from the two-way table (b) is a better estimate than (a) because
## it includes the income effect.  However, in this model income is
## measured as a categorical variable, so there is a potential loss of
## information.  The estimated odds ratio from the logistic model (c) is
## the best estimate of all, since it takes the income effect into
## account.  Since there is evidence that the income effect on the log
## odds is linear (from question 3-c), it is better to treat income as a
## numerical variable.