Analyse bootstrap du jeu de données essence

# Rééchantillonnage des résidus

> essence.reg_lm(fuel~.,data=essence)
> essence.reg.resid_resid(essence.reg)-mean(resid(essence.reg))
> essence.reg.pred_predict(essence.reg)

# La première façon ne fonctionnera pas!!!

> essence.resid.boot_bootstrap(essence.reg.resid, 
+ lm(essence.reg.resid+essence.reg.pred~tax+dlic+inc+road,data=essence)$coef,
+ B=1000)
Forming replications  1  to  100 
Forming replications  101  to  200 
Forming replications  201  to  300 
Forming replications  301  to  400 
Forming replications  401  to  500 
Forming replications  501  to  600 
Forming replications  601  to  700 
Forming replications  701  to  800 
Forming replications  801  to  900 
Forming replications  901  to  1000 

Warning messages:
  Bootstrap replicates are identical; does your statistic depend on the
	resampled data?  In some cases bootstrap(..., assign.frame1=T) helps. 
	in: bootstats(replicates = reps, observed = observed, n = n, call =
	func.call, ....

# La raison que ça ne fonctionne pas, c'est qu'en spécifiant le data.frame
# essence, tous les noms de variables sont interprétés à l'intérieur de ce 
# data.frame et donc essence.reg.resid du lm est l'original et non pas celui
# du bootstrap.

# Voici une première façon qui fonctionne, mais qui est longue à écrire...

> essence.resid.boot_bootstrap(essence.reg.resid, 
+ lm(essence.reg.resid+essence.reg.pred~
+ essence$tax+essence$dlic+essence$inc+essence$road)$coef,B=1000)
Forming replications  1  to  100 
Forming replications  101  to  200 
Forming replications  201  to  300 
Forming replications  301  to  400 
Forming replications  401  to  500 
Forming replications  501  to  600 
Forming replications  601  to  700 
Forming replications  701  to  800 
Forming replications  801  to  900 
Forming replications  901  to  1000 

> essence.resid.boot
Call:
bootstrap(data = essence.reg.resid, statistic = lm(essence.reg.resid + 
	essence.reg.pred ~ essence$tax + essence$dlic + essence$inc + essence$
	road)$coef, B = 1000)

Number of Replications: 1000 

Summary Statistics:
             Observed       Bias     Mean      SE 
 (Intercept)  235.535 -6.1034636  229.431 209.890
 essence$tax   -8.171  0.3557803   -7.815  13.579
essence$dlic   11.886  0.0208963   11.906   2.183
 essence$inc  -68.788  0.4276140  -68.361  20.476
essence$road    2.868  0.0006632    2.869   3.564

# Une autre possibilité consiste à d'abord "attacher" le data frame comme
# suit:

> attach(essence)            
> essence.resid.boot2_bootstrap(essence.reg.resid, 
+ lm(essence.reg.resid+essence.reg.pred~tax+dlic+inc+road)$coef,B=1000)
Forming replications  1  to  100 
Forming replications  101  to  200 
Forming replications  201  to  300 
Forming replications  301  to  400 
Forming replications  401  to  500 
Forming replications  501  to  600 
Forming replications  601  to  700 
Forming replications  701  to  800 
Forming replications  801  to  900 
Forming replications  901  to  1000 

> essence.resid.boot2
Call:
bootstrap(data = essence.reg.resid, statistic = lm(essence.reg.resid + 
	essence.reg.pred ~ tax + dlic + inc + road)$coef, B = 1000)

Number of Replications: 1000 

Summary Statistics:
            Observed       Bias     Mean      SE 
(Intercept)  235.535 -4.2484125  231.286 215.504
        tax   -8.171  0.4138404   -7.757  13.941
       dlic   11.886  0.0619632   11.947   2.269
        inc  -68.788 -0.5449968  -69.333  20.090
       road    2.868  0.0003108    2.868   3.871

# Les intervalles de confiance de type percentile s'obtiennent automatiquement
# à partir le la fonction limits.emp:

> limits.emp(essence.resid.boot)
                    2.5%          5%         95%       97.5% 
 (Intercept) -204.541737 -126.390466  580.740770  659.995154
 essence$tax  -33.541673  -29.314765   15.074380   17.916127
essence$dlic    7.785748    8.219323   15.628862   16.176036
 essence$inc -106.901136 -102.237438  -34.240965  -29.096112
essence$road   -4.093765   -2.916489    8.633085    9.780476

# Pour les intervalles de base, on peut utiliser la formule suivante:
# [2\hat\theta - quantile percentile du haut,2\hat\theta - quantile percentile
#                                            du bas]

> 2*essence.resid.boot$observed-limits.emp(essence.resid.boot)
                   2.5%          5%         95%       97.5% 
 (Intercept)  675.61088  597.459606 -109.671631 -188.926015
 essence$tax   17.20047   12.973565  -31.415581  -34.257327
essence$dlic   15.98532   15.551750    8.142211    7.595037
 essence$inc  -30.67517  -35.338871 -103.335345 -108.480198
essence$road    9.82959    8.652314   -2.897260   -4.044650

# Ce n'est toutefois pas très joli puisque les colonnes sont "inversées" et
# les titres des colonnes sont incorrects.  On va plutôt s'écrire une fonction
# en s'inspirant de la fonction limits.emp

> limits.emp
function(x, probs = c(25, 50, 950, 975)/1000)
{
	# Calculate empirical percentiles for replicates in "resamp" object.
	resamp.percentiles <- as.matrix(t(apply(x$replicates, 2, quantile,
		probs = probs, na.rm = T)))
	dimnames(resamp.percentiles) <- list(dimnames(x$replicates)[[2]], paste(		100 * probs, "%", sep = ""))
	resamp.percentiles
}

# Voici donc la fonction limits.basic QUE JE ME SUIS CRÉÉ:

> limits.basic
function(x, probs = c(25, 50, 950, 975)/1000)
{
        # Put the probabilities in decreasing order.
	probs <- probs[order( - probs)]
	# Calculate empirical percentiles for replicates in "resamp" object.
	resamp.percentiles <- 2 * x$observed - as.matrix(t(apply(x$replicates,
		2, quantile, probs = probs, na.rm = T)))
	dimnames(resamp.percentiles) <- list(dimnames(x$replicates)[[2]], 
                paste(	100 * (1 - probs), "%", sep = ""))
	resamp.percentiles
}

# Rééchantillonnage par paires.
# Notez que cette fois-ci, bien qu'on spécifie que le data frame est essence,
# il n'y aura pas de problème puisque ce sont les lignes du data frame qui 
# seront rééchantillonnées.

> essence.paire.boot_bootstrap(essence,coef(lm(fuel~.,data=essence)))
Forming replications  1  to  100 
Forming replications  101  to  200 
Forming replications  201  to  300 
Forming replications  301  to  400 
Forming replications  401  to  500 
Forming replications  501  to  600 
Forming replications  601  to  700 
Forming replications  701  to  800 
Forming replications  801  to  900 
Forming replications  901  to  1000 

> essence.paire.boot
Call:
bootstrap(data = essence, statistic = coef(lm(fuel ~ ., data = essence)))

Number of Replications: 1000 

Summary Statistics:
            Observed      Bias    Mean      SE 
(Intercept)  235.535  37.83672  273.37 276.514
        tax   -8.171  -4.38807  -12.56  24.513
       dlic   11.886  -0.02258   11.86   3.154
        inc  -68.788   0.40864  -68.38  21.684
       road    2.868  -0.75794    2.11   5.890

> limits.emp(essence.paire.boot)
                   2.5%          5%        95%      97.5% 
(Intercept) -315.740462 -216.998986  698.16025  771.78914
        tax  -57.533514  -52.392822   23.18105   30.25121
       dlic    5.845627    7.078172   17.14405   18.45056
        inc -115.102555 -106.950439  -36.01459  -29.69933
       road   -9.682355   -7.544039   10.71383   12.48519

> limits.basic(essence.paire.boot)
                   2.5%          5%        95%      97.5% 
(Intercept) -300.719998 -227.091112  688.06813  786.80960
        tax  -46.592408  -39.522247   36.05162   41.19231
       dlic    5.320516    6.627022   16.69290   17.92545
        inc -107.876977 -101.561724  -30.62587  -22.47375
       road   -6.749365   -4.978005   13.27986   15.41818

# Notez la différence avec les intervalles lorsqu'on rééchantillonne les
# résidus, en particulier l'effet (asymétrique) sur tax.

> limits.basic(essence.resid.boot)
                    2.5%          5%         95%      97.5% 
 (Intercept) -188.926015 -109.671631  597.459606  675.61088
 essence$tax  -34.257327  -31.415581   12.973565   17.20047
essence$dlic    7.595037    8.142211   15.551750   15.98532
 essence$inc -108.480198 -103.335345  -35.338871  -30.67517
essence$road   -4.044650   -2.897260    8.652314    9.82959

# Rappelez-vous l'influence du cas Hawai sur tax.  Lorsqu'on rééchantillonne
# les paires, Hawai ne sera pas là avec probabilité (1 - 1/50)^50 = 0.3641697
# approx exp(-1).  Par contre, dans certains jeux de données bootstrap, Hawai
# sera là deux fois ou plus.

******************************************

# Régression robuste.

> essence.rreg_rreg(as.matrix(essence[,-5]),as.vector(essence[,"fuel"]))
> essence.rreg.resid_resid(essence.rreg)-mean(resid(essence.rreg))
> essence.rreg.pred_fitted(essence.rreg)
> essence.rreg.res.b_bootstrap(essence.rreg.resid,
+ rreg(as.matrix(essence[,-5]),essence.rreg.pred+essence.rreg.resid)$coef)
Forming replications  1  to  100 
Forming replications  101  to  200 
Forming replications  201  to  300 
Forming replications  301  to  400 
Forming replications  401  to  500 
Forming replications  501  to  600 
Forming replications  601  to  700 
Forming replications  701  to  800 
Forming replications  801  to  900 
Forming replications  901  to  1000 

> essence.rreg.paire.b_bootstrap(essence,rreg(as.matrix(essence[,-5]),
+ essence[,"fuel"])$coef) 
Forming replications  1  to  100 
Forming replications  101  to  200 
Forming replications  201  to  300 
Forming replications  301  to  400 
Forming replications  401  to  500 
Forming replications  501  to  600 
Forming replications  601  to  700 
Forming replications  701  to  800 
Forming replications  801  to  900 
Forming replications  901  to  1000 

Warning messages:
1: failed to converge in 20 steps in: rreg(as.matrix(essence[, -5]), essence[,
	"fuel"])
2: failed to converge in 20 steps in: rreg(as.matrix(essence[, -5]), essence[,
	"fuel"])
> essence.rreg.res.b
Call:
bootstrap(data = essence.rreg.resid, statistic = rreg(as.matrix(essence[, -5
	]), essence.rreg.pred + essence.rreg.resid)$coef)

Number of Replications: 1000 

Summary Statistics:
             Observed     Bias      Mean      SE 
(Intercept)  494.9474  2.49010  497.4375 150.089
        tax  -24.7909 -0.20626  -24.9971   9.840
       dlic    9.6088 -0.03565    9.5732   1.499
        inc  -69.1739  0.56615  -68.6078  13.579
       road   -0.4674  0.08651   -0.3808   2.587
> essence.rreg.paire.b
Call:
bootstrap(data = essence, statistic = rreg(as.matrix(essence[, -5]), essence[
	, "fuel"])$coef)

Number of Replications: 1000 

Summary Statistics:
             Observed     Bias       Mean      SE 
(Intercept)  503.2714 -39.1742  464.09721 236.086
        tax  -24.7909   1.5778  -23.21302  20.700
       dlic    9.6088   0.3276    9.93644   2.788
        inc  -69.1739   1.8593  -67.31462  19.903
       road   -0.4674   0.3981   -0.06923   5.368

# Observez bien la colonne "Observed" dans les deux cas.  Elles diffèrent.
# Pourquoi?

> rreg(as.matrix(essence[,-5]),essence[,"fuel"])$coef
 (Intercept)       tax     dlic       inc       road 
    503.2714 -24.79086 9.608846 -69.17393 -0.4673632
> rreg(as.matrix(essence[,-5]),essence.rreg.pred+essence.rreg.resid)$coef
 (Intercept)       tax     dlic       inc       road 
    494.9474 -24.79086 9.608846 -69.17393 -0.4673632

# Qu'est-il arrivé?  Les résidus essence.rreg.resid sont centrés, donc 
# valeurs prédites + résidus n'est pas égal au vecteur y!  Donc la colonne
# biais n'est pas correcte pour le bootstrap par les résidus.

# Voyons maintenant les intervalles de confiance bootstrap de base.

> limits.basic(essence.rreg.res.b)
                   2.5%          5%         95%       97.5% 
(Intercept)  204.242234  244.221601  737.037082  767.532580
        tax  -43.508999  -40.785735   -9.398401   -5.390540
       dlic    6.570962    7.147088   12.036276   12.489872
        inc  -97.330952  -91.922180  -48.110221  -42.313863
       road   -5.587874   -4.635123    3.803832    4.593914
> limits.basic(essence.rreg.paire.b)
                   2.5%          5%         95%        97.5% 
(Intercept)  106.529175  185.934863  926.835957  1043.385017
        tax  -78.948889  -66.652668    2.160762     7.047275
       dlic    2.721075    4.061848   13.462261    14.573277
        inc -107.524721 -102.563220  -38.527092   -30.126741
       road  -13.734533  -10.755674    7.350529     8.563121

# OUPS! Ceux pour la méthode du rééchantillonnage des résidus ne sont pas
# corrects puisque la colonne "Observed" n'est pas bonne.

> temp_2*rreg(as.matrix(essence[,-5]),essence[,"fuel"])$coef -
+ limits.emp(essence.rreg.res.b)
> dimnames(temp)[[2]]_dimnames(temp)[[2]][4:1]
> temp_temp[,4:1]
> temp
                   2.5%          5%         95%       97.5% 
(Intercept)  220.890192  260.869559  753.685040  784.180538
        tax  -43.508999  -40.785735   -9.398401   -5.390540
       dlic    6.570962    7.147088   12.036276   12.489872
        inc  -97.330952  -91.922180  -48.110221  -42.313863
       road   -5.587874   -4.635123    3.803832    4.593914

# Si on regarde le coefficient tax, avec les moindres carrés, avec la 
# méthode paire, on ajoutait 8 unités à gauche et 24 à droite lorsqu'on
# passait de résidus à paires.  Avec la régression robuste, on ajoute 26
# unités à gauche et 11 à droite, soit une situation essentiellement inversée.

# Quelques graphiques maintenant.


> postscript("essence.ps",setfont=ps.setfont.latin1)
> par(oma=c(0,0,4,0))
> plot(essence.resid.boot)
> mtext("Bootstrap des MC par résidus",outer=T,side=3,line=2,cex=1.5)
> plot(essence.paire.boot)
> mtext("Bootstrap des MC par paires",outer=T,side=3,line=2,cex=1.5)
> plot(essence.rreg.res.b)
> mtext("Bootstrap de rreg par résidus",outer=T,side=3,line=2,cex=1.5)
> plot(essence.rreg.paire.b)
> mtext("Bootstrap de rreg par paires",outer=T,side=3,line=2,cex=1.5)
> qqnorm(essence.resid.boot)
> mtext("Bootstrap des MC par résidus",outer=T,side=3,line=2,cex=1.5)
> qqnorm(essence.paire.boot)
> mtext("Bootstrap des MC par paires",outer=T,side=3,line=2,cex=1.5)
> qqnorm(essence.rreg.res.b)
> mtext("Bootstrap de rreg par résidus",outer=T,side=3,line=2,cex=1.5)
> qqnorm(essence.rreg.paire.b)
> mtext("Bootstrap de rreg par paires",outer=T,side=3,line=2,cex=1.5)
> dev.off()
Generated postscript file "essence.ps".
 motif 
     2
Dernière modification: Mon Feb 24 17:50:39 2003