How I used R for YouGov and Xbox in Election 2012

Mike Malecki, YouGov

1: MRP

2: Xbox Live live-polling

logo xboxtab

3: Xbox Live panel techniques

cumulativechange

cumulativechange

WTF/m

wtfm

wtfm

Multilevel Regression and Poststratification – Estimating Population Party & Ideology

head(df)
##   caseid weight region fips gender age race4 educyrs marstat child18 pid3
## 1      1   0.46      1   42      1  54     1      18       5       2    1
## 2      2   2.09      1   42      1  77     1      10       1       2    2
## 3      3   1.54      3   24      1  27     1      12       6       2    1
## 4      4   1.08      3   51      2  28     1      14       1       1    2
## 5      5   1.39      3   54      1  21     1      14       5       2    2
## 6      6   0.27      1   42      2  79     1      16       4       2    1
##   pid5 ideo5     date source reg income ordered.party y female race.female
## 1    1     1 20120105 gallup   1    600             1 1      0           1
## 2    5     4 20120105 gallup   1    600             5 5      0           1
## 3    1     1 20120105 gallup   1    600             1 1      0           1
## 4    5     4 20120105 gallup   1    600             5 5      1           5
## 5    5     4 20120105 gallup   1    600             5 5      0           1
## 6    1     2 20120105 gallup   1    600             1 1      1           5
##   state edu.cat age.cat age.edu survey
## 1    PA       4       3      15      4
## 2    PA       1       4       4      4
## 3    MD       1       1       1      4
## 4    VA       2       1       5      4
## 5    WV       2       1       5      4
## 6    PA       3       4      12      4

Response model

More response model


ηblock ∼ N(0, σblock)

ηstate ∼ N(αregionregion[state] + Xβstatepred[state], σstate)

mrp/lmer/blmer

m.Dem <- mrp( party==1 ~ race.female + age + edu + state, 
              data=df, add=list(
                  df.statepreds,
                  expression(race.edu <- 
                                 interaction(race.female, education))),
              ml.formula=.~.+(1|race.edu) + statepred1 + statepred2 )

The JAGS model (should use Stan!)

model{
  for(i in 1:N){  ## loop over observations
          ## form the linear predictor (no intercept)
          mu[i] <-
        eta.race.female[race.female[i]]
        + eta.age[age.cat[i]]
        + eta.edu[edu.cat[i]]
        + eta.age.edu[age.edu[i]]
        + eta.poll[survey[i]]
        + eta.state[state[i]]


## for (j in group) {
##   g.raw[j] ~ dnorm ( mu.g.raw, tau.g.raw )
##   g[k] <- xi.g*(g.raw[j] - mean(g.raw[]) )
## }
## xi.g ~ dunif(0,100)
## mu.g.raw ~ dnorm
## tau.g.raw <- pow(sig.g.raw, -2)
## sigma.g.raw <- dunif ()
## sigma.g <- xi.g* sigma.g.raw

...

Do that a few thousand times…

And you get back distributions like this. – houseeffects

Samples– now what?

What about MCMC from JAGS/STAN?

head(xf.all)
##    edu   age race.female state
## 1  <HS 18-29     White M    AK
## 2   HS 18-29     White M    AK
## 3 Some 18-29     White M    AK
## 4 Grad 18-29     White M    AK
## 5  <HS 30-44     White M    AK
## 6   HS 30-44     White M    AK
head(xm.all)
##   edu<HS eduHS eduSome eduGrad age18-29 age30-44 age45-64 age≥65
## 1      1     0       0       0        1        0        0      0
## 2      0     1       0       0        1        0        0      0
## 3      0     0       1       0        1        0        0      0
## 4      0     0       0       1        1        0        0      0
## 5      1     0       0       0        0        1        0      0
## 6      0     1       0       0        0        1        0      0
##   race.femaleWhite M race.femaleBlack M race.femaleHispanic M
## 1                  1                  0                     0
## 2                  1                  0                     0
## 3                  1                  0                     0
## 4                  1                  0                     0
## 5                  1                  0                     0
## 6                  1                  0                     0
##   race.femaleOther M race.femaleWhite F race.femaleBlack F
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   race.femaleHispanic F race.femaleOther F stateAK stateAL stateAR stateAZ
## 1                     0                  0       1       0       0       0
## 2                     0                  0       1       0       0       0
## 3                     0                  0       1       0       0       0
## 4                     0                  0       1       0       0       0
## 5                     0                  0       1       0       0       0
## 6                     0                  0       1       0       0       0
##   stateCA stateCO stateCT stateDC stateDE stateFL stateGA stateHI stateIA
## 1       0       0       0       0       0       0       0       0       0
## 2       0       0       0       0       0       0       0       0       0
## 3       0       0       0       0       0       0       0       0       0
## 4       0       0       0       0       0       0       0       0       0
## 5       0       0       0       0       0       0       0       0       0
## 6       0       0       0       0       0       0       0       0       0
##   stateID stateIL stateIN stateKS stateKY stateLA stateMA stateMD stateME
## 1       0       0       0       0       0       0       0       0       0
## 2       0       0       0       0       0       0       0       0       0
## 3       0       0       0       0       0       0       0       0       0
## 4       0       0       0       0       0       0       0       0       0
## 5       0       0       0       0       0       0       0       0       0
## 6       0       0       0       0       0       0       0       0       0
##   stateMI stateMN stateMO stateMS stateMT stateNC stateND stateNE stateNH
## 1       0       0       0       0       0       0       0       0       0
## 2       0       0       0       0       0       0       0       0       0
## 3       0       0       0       0       0       0       0       0       0
## 4       0       0       0       0       0       0       0       0       0
## 5       0       0       0       0       0       0       0       0       0
## 6       0       0       0       0       0       0       0       0       0
##   stateNJ stateNM stateNV stateNY stateOH stateOK stateOR statePA stateRI
## 1       0       0       0       0       0       0       0       0       0
## 2       0       0       0       0       0       0       0       0       0
## 3       0       0       0       0       0       0       0       0       0
## 4       0       0       0       0       0       0       0       0       0
## 5       0       0       0       0       0       0       0       0       0
## 6       0       0       0       0       0       0       0       0       0
##   stateSC stateSD stateTN stateTX stateUT stateVA stateVT stateWA stateWI
## 1       0       0       0       0       0       0       0       0       0
## 2       0       0       0       0       0       0       0       0       0
## 3       0       0       0       0       0       0       0       0       0
## 4       0       0       0       0       0       0       0       0       0
## 5       0       0       0       0       0       0       0       0       0
## 6       0       0       0       0       0       0       0       0       0
##   stateWV stateWY edu<HS:age18-29 eduHS:age18-29 eduSome:age18-29
## 1       0       0               1              0                0
## 2       0       0               0              1                0
## 3       0       0               0              0                1
## 4       0       0               0              0                0
## 5       0       0               0              0                0
## 6       0       0               0              0                0
##   eduGrad:age18-29 edu<HS:age30-44 eduHS:age30-44 eduSome:age30-44
## 1                0               0              0                0
## 2                0               0              0                0
## 3                0               0              0                0
## 4                1               0              0                0
## 5                0               1              0                0
## 6                0               0              1                0
##   eduGrad:age30-44 edu<HS:age45-64 eduHS:age45-64 eduSome:age45-64
## 1                0               0              0                0
## 2                0               0              0                0
## 3                0               0              0                0
## 4                0               0              0                0
## 5                0               0              0                0
## 6                0               0              0                0
##   eduGrad:age45-64 edu<HS:age≥65 eduHS:age≥65 eduSome:age≥65
## 1                0             0            0              0
## 2                0             0            0              0
## 3                0             0            0              0
## 4                0             0            0              0
## 5                0             0            0              0
## 6                0             0            0              0
##   eduGrad:age≥65
## 1              0
## 2              0
## 3              0
## 4              0
## 5              0
## 6              0

Select them:

jags.terms <- c(unlist(sapply(termnames, function(f, prefix = "eta", jagsvars = varnames(samples)) {
    findme <- paste(prefix, f, sep = ".")
    return(grep(paste0(findme, "\\["), jagsvars))
})))
str(termnames)
##  chr [1:5] "edu" "age" "race.female" "state" "age.edu"
head(varnames(samples))
## [1] "B.state"        "eta.age[1]"     "eta.age[2]"     "eta.age[3]"    
## [5] "eta.age[4]"     "eta.age.edu[1]"
head(jags.terms)
## edu1 edu2 edu3 edu4 age1 age2 
##   22   23   24   25    2    3

Form the predictor and draw the fitted value for every sample

fitted.full.m <- lapply(samples, function(c, ...) {
    ans <- apply(c, 1, function(s, jags.terms) {
        return(xm.all %*% s[jags.terms])
    }, ...)
    return(ans)
}, jags.terms)
taus.m <- lapply(samples, function(c) {
    return(c[, c("tau[1]", "tau[2]", "tau[3]", "tau[4]")])
})

Now go from logit to unweighted predicted probabilities for all cells:

cellpred.m <- array(NA, dim = c(nrow(fitted.full.m[[1]]), nrow(taus.m[[1]]), 
    5), dimnames = list(cell = rownames(xf.all), samples = 1:nrow(taus.m[[1]]), 
    party = c("+D", "-d", "I", "-r", "+R")))
cellpred.m[, , 1] <- sapply(1:nrow(taus.m[[1]]), function(s) {
    return(plogis(taus.m[[1]][s, 1] - fitted.full.m[[1]][, s]))
})
cellpred.m[, , 2] <- sapply(1:nrow(taus.m[[1]]), function(s) {
    return(plogis(taus.m[[1]][s, 2] - fitted.full.m[[1]][, s]) - plogis(taus.m[[1]][s, 
        1] - fitted.full.m[[1]][, s]))
})
cellpred.m[, , 3] <- sapply(1:nrow(taus.m[[1]]), function(s) {
    return(plogis(taus.m[[1]][s, 3] - fitted.full.m[[1]][, s]) - plogis(taus.m[[1]][s, 
        2] - fitted.full.m[[1]][, s]))
})
cellpred.m[, , 4] <- sapply(1:nrow(taus.m[[1]]), function(s) {
    return(plogis(taus.m[[1]][s, 4] - fitted.full.m[[1]][, s]) - plogis(taus.m[[1]][s, 
        3] - fitted.full.m[[1]][, s]))
})
cellpred.m[, , 5] <- sapply(1:nrow(taus.m[[1]]), function(s) {
    return(1 - plogis(taus.m[[1]][s, 4] - fitted.full.m[[1]][, s]))
})

Poststratify

by Education

out <- apply(party.post.array, c("edu", "party"), sum)/apply(Census.NWay, c("edu", 
    "party"), sum)
print(xtable(out), type = "html", digits = 2)
+D -d I -r +R
&lt HS 0.37 0.14 0.10 0.14 0.26
HS 0.33 0.14 0.10 0.15 0.29
Some 0.31 0.14 0.10 0.15 0.29
Grad 0.35 0.14 0.10 0.14 0.26

by State

out <- apply(party.post.array, c("state", "party"), sum)/apply(Census.NWay, 
    c("state", "party"), sum)
print(xtable(out), type = "html", digits = 2)
+D -d I -r +R
AK 0.27 0.14 0.10 0.16 0.32
AL 0.31 0.12 0.09 0.14 0.34
AR 0.34 0.14 0.10 0.14 0.28
AZ 0.30 0.14 0.10 0.15 0.29
CA 0.38 0.15 0.10 0.14 0.23
CO 0.29 0.14 0.10 0.16 0.31
CT 0.40 0.15 0.10 0.13 0.23
DC 0.64 0.12 0.06 0.08 0.10
DE 0.44 0.14 0.09 0.12 0.21
FL 0.33 0.14 0.10 0.15 0.29
GA 0.37 0.13 0.09 0.13 0.29
HI 0.38 0.16 0.10 0.14 0.22
IA 0.34 0.15 0.10 0.15 0.26
ID 0.22 0.13 0.10 0.17 0.38
IL 0.44 0.14 0.09 0.12 0.21
IN 0.33 0.14 0.10 0.15 0.29
KS 0.26 0.13 0.10 0.16 0.34
KY 0.31 0.14 0.10 0.15 0.30
LA 0.39 0.13 0.09 0.13 0.27
MA 0.40 0.15 0.10 0.13 0.21
MD 0.52 0.13 0.08 0.10 0.17
ME 0.30 0.15 0.11 0.16 0.28
MI 0.39 0.14 0.09 0.14 0.24
MN 0.37 0.15 0.10 0.14 0.23
MO 0.34 0.14 0.10 0.15 0.28
MS 0.33 0.13 0.09 0.14 0.32
MT 0.27 0.15 0.11 0.16 0.32
NC 0.39 0.14 0.09 0.13 0.25
ND 0.28 0.15 0.11 0.16 0.30
NE 0.24 0.13 0.10 0.16 0.37
NH 0.30 0.15 0.11 0.16 0.28
NJ 0.43 0.14 0.09 0.13 0.21
NM 0.35 0.15 0.10 0.15 0.25
NV 0.31 0.14 0.10 0.15 0.30
NY 0.42 0.15 0.09 0.13 0.21
OH 0.34 0.14 0.10 0.15 0.27
OK 0.30 0.14 0.10 0.15 0.31
OR 0.34 0.15 0.10 0.15 0.25
PA 0.39 0.14 0.09 0.14 0.24
RI 0.36 0.15 0.10 0.14 0.24
SC 0.38 0.13 0.09 0.13 0.28
SD 0.27 0.14 0.11 0.16 0.32
TN 0.34 0.13 0.09 0.14 0.29
TX 0.31 0.14 0.10 0.15 0.31
UT 0.17 0.11 0.09 0.17 0.45
VA 0.37 0.14 0.09 0.14 0.27
VT 0.32 0.15 0.11 0.15 0.27
WA 0.37 0.15 0.10 0.14 0.23
WI 0.35 0.15 0.10 0.14 0.26
WV 0.31 0.15 0.10 0.15 0.28
WY 0.27 0.13 0.10 0.16 0.35

Watching the Debates on Xbox Live

Challenges

Implementation

Getting the data with RCurl and a key in the header

msh <- getCurlHandle(httpheader = c(SecurityKey = "ugly-guid-hash-here"))
now <- format(Sys.time(), format = "%Y-%m-%dT%H:%M:%S", tz = "america/chicago")
P <- 1
batchsize = 5000
starttime <- "2012-11-05T15:08:10"
con <- file("electionday.csv", open = "at")
while (P < 1e+05) {
    lx <- xmlTreeParse(getForm("https://bdewebservices.com/MsBdeService.svc/rest/ElectionEntriesReport", 
        curl = msh, startDate = "2012-11-05", endDate = "2012-12-24", pageSize = batchsize, 
        pageNumber = P), asText = TRUE, useInternalNodes = TRUE)
    lx <- try(xmlSApply(lx[["//Entries"]], xmlValue), silent = TRUE)
    if (!is.error(lx)) {
        writeLines(lx, con = con)
        P <- P + 1
    } else {
        Sys.sleep(5)
        message(".")
    }
}

Which gives us…

{xml} <?xml version="1.0"?> <Report> <Count>6562746</Count> <Entries> <Entry>E88170913273F348124A648081C66117C9B0F0C1,NBCElection_event.e07d0a75-1cb1-4040-9598-1e653d0d1b6c, Romney,11/6/2012 5:55:36 PM</Entry> <Entry>947986E0A42422D56F4751BBDFB5F0AFF865A158,NBCElection_event.e07d0a75-1cb1-4040-9598-1e653d0d1b6c, Obama,11/6/2012 5:55:37 PM</Entry> <Entry>D884FA867C73E1C160CEC514AE923646439C9EE0,NBCElection_event.e07d0a75-1cb1-4040-9598-1e653d0d1b6c,_NoResponse,11/6/2012 5:55:44 PM</Entry> </Entries> </Report>

xml metadata (the questionIDs aren’t known until they are seeded)

{xml} <QuestionModel> <ID>c92d9183-4965-41d0-ac2d-98f5316ec6e4</ID> <Question>If you could play Xbox LIVE with one person, who would it be?</Question> <DisplayStreamTime>8843</DisplayStreamTime> <ExpirationStreamTime>8918</ExpirationStreamTime> <Answers> <string> Barack Obama </string> <string> Mitt Romney </string> <string> Brian Williams</string> </Answers> </QuestionModel>

Parse xml metadata

parseQuestions <- function(question.xml) {
    sapply(question.xml, function(qs) {
        q <- xmlRoot(qs)
        wording <- xpathApply(q, "//QuestionModel/Question", xmlValue)
        ID <- xpathApply(q, "//QuestionModel/ID", xmlValue)
        A <- xpathApply(q, "//Answers")
        resp <- lapply(A, function(A) xmlSApply(A, xmlValue))
        stubs <- list()
        for (i in 1:length(wording)) {
            stubs[[i]] <- makeSingleStub(ID[[i]], resp = paste(paste0("`", ID[[i]], 
                "`"), shQuote(resp[[i]]), sep = "=="), vallab = resp[[i]], wording = wording[[i]])
        }
        names(stubs) <- unlist(ID)
        stubs
    })
}

Question metadata result (YouGov ‘stubs’)

single categoricalStub labeled ‘c92d9183-4965-41d0-ac2d-98f5316ec6e4’  
wording : chr "If you could play Xbox LIVE with one person, who would it be?"

                                                                           
Barack Obama   "`c92d9183-4965-41d0-ac2d-98f5316ec6e4`=='\tBarack Obama\t'"
Mitt Romney    "`c92d9183-4965-41d0-ac2d-98f5316ec6e4`=='\tMitt Romney\t'" 
Brian Williams "`c92d9183-4965-41d0-ac2d-98f5316ec6e4`=='\tBrian Williams'"

format :List of 2
 $ percent: logi TRUE
 $ digits : num 1

Empty slots: ‘collab’ ‘tags’ ‘filtertext’ ‘surveycode’ 

Reshaping the clickstream into rows by user

df <- reshape2::dcast(as.data.frame(df), voter_hash ~ QuestionId, value.var = "Answer", 
    fun.agg = function(x) levels(x)[x][1])

After reshaping:

R> 100*prop.table(xtabs(~ `c92d9183-4965-41d0-ac2d-98f5316ec6e4`, exclude=4, data=df))
## c92d9183-4965-41d0-ac2d-98f5316ec6e4
##        1        2        3 
## 62.83078 19.58565 17.58357 

Was there a Romney Surge?

phantomswings

phantomswings

Was there a Romney Surge?

Phantom swings, Xbox daily poll

xboxswings

xboxswings

The goal: make this graph

cumulativechange

cumulativechange

##' Compute array of transition matrices from vote history
##'
##'
##' @param Mat vote history matrix, as produced by
##' \code{acast(panel, voter_hash ~ wave, value.var="vote2012_self")}
##' @param t numeric (probably vector) time index at which to compute $\Delta$
##' @param window numeric number of columns to consider prior to $t$
##' @return A transition matrix with rows and columns:
##' \tabular{cccc}{
##'            \tab Obama \tab Romney \tab \Other \tab Undecided
##' Obama      \tab       \tab        \tab        \tab
##' Romney     \tab       \tab        \tab        \tab
##' Other      \tab       \tab        \tab        \tab
##' Undecided  \tab       \tab        \tab        \tab
##' }
##' @export

PresChange <- function(t, Mat, window=5){
    j <- c({t-window:1}, t)
    cur <- Mat[,t]
    Mat <- Mat[,j]

    lastvote <- function(v) {
        v <- na.omit(v)
        ifelse(length(v), v[length(v)], NA)
    }
    prev <- apply(Mat[,1:window], 1, lastvote)
    Mat <- na.omit(cbind(prev,cur))

    v <- apply(Mat, 1, paste, collapse="")
    label <- c("Obama","Romney","Not Sure","Other")

    i <- which(matrix(TRUE,4,4),arr.ind=TRUE)
    l <- apply(i, 1, paste, collapse="")
    ans <- table(v)
    ansmat <- matrix(0, 4,4, dimnames=list(label,label))
    ansmat[match(names(ans),l)] <- ans

    ans <- prop.table(ansmat, margin=1)
    attr(ans,"N") <- sum(ansmat)
    cur <- prop.table(table(cur))
    names(cur) <- label
    attr(ans,"current") <- cur
    attr(ans,"uniquevoters") <- unique(rownames(Mat))
    ans
}
##' Compute array of transition matrices from vote history
##'
##'
##' @param t numeric (probably vector) time index at which to compute $\Delta$
##' @param Mat vote history matrix, as produced by
##' \code{acast(panel, voter_hash ~ wave, value.var="vote2012_self")}
##' @param cut vector of indices, where to cut periods in the data
##' to condense \code{Mat}
##' @param window numeric number of columns to consider prior to $t$
##' @return A transition matrix with rows and columns:
##' \tabular{cccc}{
##'            \tab Obama \tab Romney \tab \Other \tab Undecided
##' Obama      \tab       \tab        \tab        \tab
##' Romney     \tab       \tab        \tab        \tab
##' Other      \tab       \tab        \tab        \tab
##' Undecided  \tab       \tab        \tab        \tab
##' }
##' @export
PresChangeWeekly <- function(t, Mat, cut=c("6:14","15:21","22:25","26:28","29:30"), window=1){
    cut <- sapply(cut, function(range) eval(parse(text=range)), simplify=FALSE)
    ## condense the matrix.
    pickone <- function(row) {
        row <- na.omit(row)
        ifelse(length(row)>1,
               sample(row,1) ,
               ifelse(length(row), row, NA) )
    }
    CMat <- matrix(NA, nrow=nrow(Mat), ncol=length(cut),
                   dimnames=list(rownames(Mat), NULL))
    for (i in 1:length(cut)){
        CMat[,i] <-  apply(Mat[,cut[[i]]], 1, pickone)
    }
    j <- c({t-window:1}, t)
    cur <- CMat[,t,drop=FALSE]
    Mat <- CMat[,j,drop=FALSE]

    lastvote <- function(v) {
        v <- na.omit(v)
        ifelse(length(v), v[length(v)], NA)
    }
    prev <- apply(Mat[,1:window,drop=FALSE], 1, lastvote)
    Mat <- na.omit(cbind(prev,cur))

    v <- apply(Mat, 1, paste, collapse="")
    label <- c("Obama","Romney","Not Sure","Other")

    i <- which(matrix(TRUE,4,4),arr.ind=TRUE)
    l <- apply(i, 1, paste, collapse="")
    ans <- table(v)
    ansmat <- matrix(0, 4,4, dimnames=list(label,label))
    ansmat[match(names(ans),l)] <- ans

    ans <- prop.table(ansmat, margin=1)
    attr(ans,"N") <- sum(ansmat)
    cur <- prop.table(table(cur))
    names(cur) <- label
    attr(ans,"current") <- cur
    attr(ans,"uniquevoters") <- unique(rownames(Mat))
    ans
}

Let’s run this and debug!

load("~/yg/Dropbox/XboxLivePolling/data/panelmatrix.RData")
# translist <- lapply(6:ncol(panel), PresChange, Mat=panel, window=5)
library(lattice)
library(xbox)
debugonce(PresChangeWeekly)
P0 <- lapply(2, PresChangeWeekly, Mat = panel, cut = c("6:14", "15:21"))

Process the blocks

load("~/yg/Dropbox/XboxLivePolling/data/panelmatrix.RData")
load("transition-matrices.RData")
colMeans(do.call("rbind", lapply(P, attr, "current")))
##    Obama   Romney Not Sure    Other 
##  0.47010  0.45926  0.05004  0.02061
lapply(P, attr, "N")
## $Oct03
## [1] 24422
## 
## $Oct11
## [1] 21374
## 
## $Oct16
## [1] 18623
## 
## $Oct22
## [1] 30077
## 
## $Oct30
## [1] 34477

Push forward through time

q0 <- prop.table(table(as.integer(as.matrix(panel))))
q <- list()

netchange <- data.frame(Obama = 0, Romney = 0, Undecided = 0, Other = 0)
q[[1]] <- q0 %*% P[[1]]
netchange[2, ] <- q[[1]] - matrix(q0, 1, 4, byrow = T)
for (t in 2:length(P)) {
    q[[t]] <- q[[t - 1]] %*% P[[t]]
    netchange[t + 1, ] <- q[[t]] - q[[t - 1]]
}