Stratified sampling with equal/unequal probabilities.

Strata(x, stratanames = NULL, size,
       method = c("srswor", "srswr", "poisson", "systematic"),
       pik, description = FALSE)

Arguments

x

a data frame or a matrix; its number of rows is n, the population size.

stratanames

vector of stratification variables.

size

vector of stratum sample sizes (in the order in which the strata are given in the input data set).

method

method to select units; implemented are: a) simple random sampling without replacement ("srswor"), b) simple random sampling with replacement ("srswr"), c) Poisson sampling ("poisson"), d) systematic sampling ("systematic") (default is "srswor").

pik

vector of inclusion probabilities or auxiliary information used to compute them; this argument is only used for unequal probability sampling (Poisson and systematic). If auxiliary information is provided, the function uses the inclusionprobabilities function to compute these probabilities. If the method is "srswr" and the sample size is larger than the population size, this vector is normalized to one.

description

a message is printed if its value is TRUE; the message gives the number of selected units and the number of units in the population. By default, the value is FALSE.

Value

The function produces an object, which contains the following information:

id

the identifier of the selected units.

stratum

the unit stratum.

prob

the final unit inclusion probability.

Author

Andri Signorell <andri@signorell.net>
rewritten based on the ideas of Yves Tille <yves.tille@unine.ch> and Alina Matei <alina.matei@unine.ch>

See also

Examples

# Example from An and Watts (New SAS procedures for Analysis of Sample Survey Data)
# generates artificial data (a 235 x 3 data frame with 3 columns: state, region, income).
# the variable "state" has 2 categories ('nc' and 'sc').
# the variable "region" has 3 categories (1, 2 and 3).
# the sampling frame is stratified by region within state.
# the income variable is randomly generated

m <- rbind(matrix(rep("nc",165), 165, 1, byrow=TRUE),
           matrix(rep("sc", 70), 70, 1, byrow=TRUE))
m <- cbind.data.frame(m, c(rep(1, 100), rep(2,50), rep(3,15),
                      rep(1, 30), rep(2, 40)), 1000 * runif(235))
names(m) <- c("state", "region", "income")

# computes the population stratum sizes
table(m$region, m$state)
#>    
#>      nc  sc
#>   1 100  30
#>   2  50  40
#>   3  15   0

# not run
#     nc  sc
#  1 100  30
#  2  50  40
#  3  15   0
# there are 5 cells with non-zero values
# one draws 5 samples (1 sample in each stratum)
# the sample stratum sizes are 10,5,10,4,6, respectively
# the method is 'srswor' (equal probability, without replacement)

s <- Strata(m, c("region", "state"), size=c(10, 5, 10, 4, 6), method="srswor")

# extracts the observed data
data.frame(income=m[s$id, "income"], s)
#>          income state region  income.1 stratum size  id
#> 1.65  866.10058    nc      1 866.10058       1   10  65
#> 1.93  592.34629    nc      1 592.34629       1   10  93
#> 1.29  286.04175    nc      1 286.04175       1   10  29
#> 1.64  547.19689    nc      1 547.19689       1   10  64
#> 1.34  200.38347    nc      1 200.38347       1   10  34
#> 1.45  482.35179    nc      1 482.35179       1   10  45
#> 1.26   52.49346    nc      1  52.49346       1   10  26
#> 1.8   598.38073    nc      1 598.38073       1   10   8
#> 1.52  329.42273    nc      1 329.42273       1   10  52
#> 1.62  895.92032    nc      1 895.92032       1   10  62
#> 2.120 507.15857    nc      2 507.15857       2    5 120
#> 2.143 232.86548    nc      2 232.86548       2    5 143
#> 2.147 688.35110    nc      2 688.35110       2    5 147
#> 2.109 451.99868    nc      2 451.99868       2    5 109
#> 2.130 571.54761    nc      2 571.54761       2    5 130
#> 3.165 794.92344    nc      3 794.92344       3   10 165
#> 3.151 628.19458    nc      3 628.19458       3   10 151
#> 3.156 755.49937    nc      3 755.49937       3   10 156
#> 3.160 426.00580    nc      3 426.00580       3   10 160
#> 3.157 578.61374    nc      3 578.61374       3   10 157
#> 3.164 616.97988    nc      3 616.97988       3   10 164
#> 3.152 780.21599    nc      3 780.21599       3   10 152
#> 3.162 507.13136    nc      3 507.13136       3   10 162
#> 3.153 178.19258    nc      3 178.19258       3   10 153
#> 3.159 948.04473    nc      3 948.04473       3   10 159
#> 4.167 427.83172    sc      1 427.83172       4    4 167
#> 4.181 381.52305    sc      1 381.52305       4    4 181
#> 4.170 391.48706    sc      1 391.48706       4    4 170
#> 4.169  27.68708    sc      1  27.68708       4    4 169
#> 5.202 773.62990    sc      2 773.62990       5    6 202
#> 5.231 469.94421    sc      2 469.94421       5    6 231
#> 5.233 196.68003    sc      2 196.68003       5    6 233
#> 5.217 822.34924    sc      2 822.34924       5    6 217
#> 5.229 369.66882    sc      2 369.66882       5    6 229
#> 5.208 152.05055    sc      2 152.05055       5    6 208

# see the result using a contingency table
table(s$region, s$state)
#>    
#>     nc sc
#>   1 10  4
#>   2  5  6
#>   3 10  0


# The same data as in the previous example
# the method is 'systematic' (unequal probability, without replacement)
# the selection probabilities are computed using the variable 'income'
s <- Strata(m,c("region", "state"), size=c(10, 5, 10, 4, 6),
            method="systematic", pik=m$income)

# extracts the observed data
data.frame(income=m[s$id, "income"], s)
#>          income state region  income.1 stratum size  id
#> 1.9   322.88584    nc      1 322.88584       1   10   9
#> 1.29  286.04175    nc      1 286.04175       1   10  29
#> 1.86  636.40332    nc      1 636.40332       1   10  86
#> 1.16  104.97314    nc      1 104.97314       1   10  16
#> 1.8   598.38073    nc      1 598.38073       1   10   8
#> 1.36  793.01949    nc      1 793.01949       1   10  36
#> 1.92  448.64375    nc      1 448.64375       1   10  92
#> 1.100 380.39065    nc      1 380.39065       1   10 100
#> 1.66  655.96816    nc      1 655.96816       1   10  66
#> 1.18  408.09932    nc      1 408.09932       1   10  18
#> 2.145 737.76557    nc      2 737.76557       2    5 145
#> 2.135  80.87036    nc      2  80.87036       2    5 135
#> 2.112 919.35027    nc      2 919.35027       2    5 112
#> 2.150 749.40846    nc      2 749.40846       2    5 150
#> 2.103 552.14334    nc      2 552.14334       2    5 103
#> 3.157 578.61374    nc      3 578.61374       3   10 157
#> 3.160 426.00580    nc      3 426.00580       3   10 160
#> 3.152 780.21599    nc      3 780.21599       3   10 152
#> 3.156 755.49937    nc      3 755.49937       3   10 156
#> 3.158 177.50997    nc      3 177.50997       3   10 158
#> 3.164 616.97988    nc      3 616.97988       3   10 164
#> 3.151 628.19458    nc      3 628.19458       3   10 151
#> 3.163 385.42599    nc      3 385.42599       3   10 163
#> 3.153 178.19258    nc      3 178.19258       3   10 153
#> 3.154 712.35448    nc      3 712.35448       3   10 154
#> 4.183 195.25163    sc      1 195.25163       4    4 183
#> 4.170 391.48706    sc      1 391.48706       4    4 170
#> 4.186 131.10277    sc      1 131.10277       4    4 186
#> 4.184 982.18831    sc      1 982.18831       4    4 184
#> 5.200 388.30586    sc      2 388.30586       5    6 200
#> 5.214 677.27586    sc      2 677.27586       5    6 214
#> 5.229 369.66882    sc      2 369.66882       5    6 229
#> 5.198 483.59321    sc      2 483.59321       5    6 198
#> 5.224 771.66254    sc      2 771.66254       5    6 224
#> 5.205 192.33168    sc      2 192.33168       5    6 205

# see the result using a contingency table
table(s$region, s$state)
#>    
#>     nc sc
#>   1 10  4
#>   2  5  6
#>   3 10  0