################################################################################ ############################################################################# V5 # # ESPON M4D Multidimensional Database Design and Development # # Time Series Estimation # # Data Exploration, Estimation, and Coherence functions # # ### Authors: # # Martin Charlton and Chris Brunsdon # ### Address: # # National Centre for Geocomputation # National University of Ireland, Maynooth # Maynooth, Co Kildare, IRELAND # # V1.01: June 2014 # #### Copyright # # Code is made avilable under the GNU GENERAL PUBLIC LICENSE, Version 3 # Text of the License is at http://www.gnu.org/licenses/gpl.txt # ################################################################################ ################################################################################ ### ### Identify a working folder - all data/software in in this folder ### dsn <- "C:\\Documents and Settings\\mcharlton.NUIM\\My Documents\\My Dropbox\\00ESPON_M4D (1)\\Time_Series_Analysis\\Sandbox" dsn <- "C:\\Users\\mcharlton\\Dropbox\\00ESPON_M4D (1)\\Time_Series_Analysis\\Sandbox" # laptop dsn <- "F:\\Time_Series_Analysis\\Sandbox" # Data stick setwd(dsn) ### ### Load the libraries needed for the analysis and estimation ### require(RColorBrewer) require(gdata) require(rjags) ################################################################################# ### TASK 1: Read the data, identify the EUROSTAT/NSA data and split data ### and source indices into separate worksheets ################################################################################# Dataset <- read.xls(paste(dsn,"\\M4D_poptot1990-2011_20120522.xls",sep=""), sheet=1,header=FALSE) Indicator <- read.xls(paste(dsn,"\\M4D_poptot1990-2011_20120522.xls",sep=""), sheet=2,header=FALSE) Source <- read.xls(paste(dsn,"\\M4D_poptot1990-2011_20120522.xls",sep=""), sheet=3,header=FALSE) Data <- read.xls(paste(dsn,"\\M4D_poptot1990-2011_20120522.xls",sep=""), sheet=4,header=FALSE) ### ### Parse the 'Source' worksheet for the provider index codes and names ### len3 <- dim(Source)[1] sourceLabels <- which(Source[,1] == "Label") sourceProvider <- which(Source[,1] == "Provider") sourceIndex <- data.frame(Source[sourceLabels,2],Source[sourceProvider,3]) colnames(sourceIndex) <- c("Label","Provider") sourceIndex ############################################# sourcesOK <- sourceIndex[c(1:32),] # These are the rows for which data is from # EUROSTAT + Nation al Statistical Agencies ############################################# ### ### Split 'Data' worksheet into '.data' and '.source' worksheets ### len2 <- dim(Data)[1] # number of rows dataCols <- seq(5,47,2) # columns with data Data.data <- Data[4:len2,c(1,2,4,dataCols)] # data Data.source <- Data[4:len2,c(1,2,4,dataCols+1)] # source information colnames(Data.data) <- c("UnitCode","Level","Name",paste("p",1990:2011,sep="")) colnames(Data.source) <- c("UnitCode","Level","Name",paste("s",1990:2011,sep="")) ### ### Create data frame 'Data.estim' which we used as the basis for subsequent estimation ### as.numeric.factor <- function(x) {as.numeric(levels(x))[x]} # convert factors to numeric Data.estim <- Data.data # Copy old -> new for (icol in 4:25) { # loop over data columns Data.estim[,icol] <- NA # update to NA rowsOK <- which((Data.source[,icol] %in% sourcesOK[,1]) == TRUE) # EUROSTAT/NSA? Data.estim[rowsOK,icol] <- as.numeric.factor(Data.data[rowsOK,icol]) # .. yes - copy data } head(Data.estim) ### ### Finally index Data.estim by the NUTS code ### rownames(Data.estim) <- Data.estim[,1] levels(as.factor(substr(rownames(Data.estim),1,2))) # "AT" "BE" "BG" "CH" "CY" "CZ" "DE" "DK" "EE" "ES" "FI" "FR" "GR" "HR" "HU" # "IE" "IS" "IT" "LI" "LT" "LU" "LV" "MK" "MT" "NL" "NO" "PL" "PT" "RO" "SE" # "SI" "SK" "TR" "UK" ################################################################################# # TASK 2: Create a series of missing data heatmaps for the report ################################################################################# CountryCode <- levels(as.factor(substr(rownames(Data.estim),1,2))) # NUTS codes CountryName <- Data.estim[match(rownames(Data.estim),CountryCode),3] # Names CountryName <- CountryName[!is.na(CountryName)] # remove NA Nc <- length(CountryCode) # How many? ### ### Heatmap construction function - many defaults are turned off ### drawHeatmap <- function(country,title="Population") { CountryData <- Data.estim[substr(rownames(Data.estim),1,2) == country,] PlotOrder <- dim(CountryData)[1]:1 heatmap(as.matrix(CountryData[PlotOrder,4:25]),Colv=NA,Rowv=NA,main=title,na.rm=T, xlab="Year",ylab="NUTS Region",col=rev(brewer.pal(11,"Spectral")),bg="grey") } ### ### Loop over countries, plotting the heatmap. Store in Mapfiles/aa.png ### for (i in 1:Nc) { fdname <- paste("Mapfiles\\",CountryCode[i],".png",sep="") # create filename cat(fdname,"\n") # list it png(fdname) # open it drawHeatmap(CountryCode[i],CountryName[i]) # plot heatmap dev.off() # close the file } drawHeatmap("AT","Austria") drawHeatmap("BE","Belgium") drawHeatmap("BG","Bulgaria") drawHeatmap("CH","Switzerland") drawHeatmap("CY","Cyprus") drawHeatmap("CZ","Czech Republic") drawHeatmap("DE","Germany") drawHeatmap("DK","Denmark") drawHeatmap("EE","Estonia") drawHeatmap("ES","Spain") drawHeatmap("FI","Finland") drawHeatmap("FR","France") drawHeatmap("GR","Greece") drawHeatmap("HR","Croatia") drawHeatmap("HU","Hungary") drawHeatmap("IE","Ireland") drawHeatmap("IS","Iceland") drawHeatmap("IT","Italy") drawHeatmap("LI","Liechtenstein") drawHeatmap("LT","Lithunia") drawHeatmap("LU","Luxembourg") drawHeatmap("LV","Latvia") drawHeatmap("MK","Macedonia") drawHeatmap("MT","Malta") drawHeatmap("NL","Netherlands") drawHeatmap("NO","Norway") drawHeatmap("PL","Poland") drawHeatmap("PT","Portugal") drawHeatmap("RO","Romania") drawHeatmap("SE","Sweden") drawHeatmap("SI","slovakia") drawHeatmap("SK","Slovak Republic") drawHeatmap("TR","Turkey") drawHeatmap("UK","United Kingdom") ################################################################################# # TASK 3: enumerate areas with missing data at NUTS0/1/2 ################################################################################# #dataCols <- 4:25 levels(as.factor(substr(rownames(Data.estim),1,2))) # "AT" "BE" "BG" "CH" "CY" "CZ" "DE" "DK" "EE" "ES" "FI" "FR" "GR" "HR" "HU" # "IE" "IS" "IT" "LI" "LT" "LU" "LV" "MK" "MT" "NL" "NO" "PL" "PT" "RO" "SE" # "SI" "SK" "TR" "UK" ### ### Area code/name information as in TASK 2: ### CountryCode <- levels(as.factor(substr(rownames(Data.estim),1,2))) CountryName <- Data.estim[match(rownames(Data.estim),CountryCode),3] CountryName <- CountryName[!is.na(CountryName)] NUTSlevel <- nchar(rownames(Data.estim))-2 Nc <- length(CountryCode) # Countries Nr <- nrow(Data.estim) # NUTS regions for (i in 1:Nr) { # Loop over NUTS codes if(NUTSlevel[i] <= 2) { if (!complete.cases(Data.estim[i,4:25])) { # Complete case? nCode <- as.character(Data.estim[i,1]) # ... no, get code nName <- as.character(Data.estim[i,3]) # ... get name cat("Missing data for ",nCode,nName,"\n") # ... and report } } } } ################################################################################################# ### Task 4: Example plots to explore general patterns in the data ################################################################################################# CountryCode <- levels(as.factor(substr(rownames(Data.estim),1,2))) year <- 1:22 yearsq <- year^2 ### ### Each plot has: base data as dots and a line in grey ### fit from a linear model in lightblue ### fit from the quadratic model in red ### par(mfrow=c(2,2)) Popdata <- as.numeric(Data.estim["AT",4:25]) plot(1990:2011,Popdata,xlab="Year",ylab="Population",main="Austria",pch=16,col="grey") lines(1990:2011,Popdata,col="grey") lines(1990:2011,(cbind(1,year) %*% coef(lm(Popdata~year))),col="lightblue") lines(1990:2011,(cbind(1,year,yearsq) %*% coef(lm(Popdata~year+yearsq))),col="red") Popdata <- as.numeric(Data.estim["FR",4:25]) plot(1990:2011,Popdata,xlab="Year",ylab="Population",main="France",pch=16,col="grey") lines(1990:2011,Popdata,col="grey") lines(1990:2011,(cbind(1,year) %*% coef(lm(Popdata~year))),col="lightblue") lines(1990:2011,(cbind(1,year,yearsq) %*% coef(lm(Popdata~year+yearsq))),col="red") Popdata <- as.numeric(Data.estim["NL",4:25]) plot(1990:2011,Popdata,xlab="Year",ylab="Population",main="Netherlands",pch=16,col="grey") lines(1990:2011,Popdata,col="grey") lines(1990:2011,(cbind(1,year) %*% coef(lm(Popdata~year))),col="lightblue") lines(1990:2011,(cbind(1,year,yearsq) %*% coef(lm(Popdata~year+yearsq))),col="red") Popdata <- as.numeric(Data.estim["SI",4:25]) plot(1990:2011,Popdata,xlab="Year",ylab="Population",main="Slovenia",pch=16,col="grey") lines(1990:2011,Popdata,col="grey") lines(1990:2011,(cbind(1,year) %*% coef(lm(Popdata~year))),col="lightblue") lines(1990:2011,(cbind(1,year,yearsq) %*% coef(lm(Popdata~year+yearsq))),col="red") par(mfrow=c(1,1)) ################################################################################################# ### TASK 5: Extract data for Austria as test data *** *** *** ONLY FOR TESTING *** *** *** ################################################################################################# Austria <- Data.estim[substr(rownames(Data.estim),1,2) == "AT",] PlotOrder <- dim(Austria)[1]:1 # reverse order for the heatmap heatmap(as.matrix(Austria[PlotOrder,4:25]),Colv=NA,Rowv=NA,main="Austria EUROSTAT Population Data", xlab="Year",ylab="NUTS Region",col=rev(brewer.pal(11,"Spectral"))) parent <- substr(rownames(Austria),1,nchar(rownames(Austria))-1) parent[1] <- "" NUTS <- rownames(Austria) NUTS0 <- which(nchar(rownames(Austria)) == 2) NUTS1 <- which(nchar(rownames(Austria)) == 3) NUTS2 <- which(nchar(rownames(Austria)) == 4) NUTS3 <- which(nchar(rownames(Austria)) == 5) Data.estim <- Austria ################################################################################### #### TASK 6: MCMC estimation of NUTS levels 3 series ################################################################################### # Estimation loop one iteration is about 1.7 minutes for 12 missing from 22 on # a Intel Xeon X5460 quad core 3.16GHz processor running Windows XP Professional # # 35 zones for Austria takes about an hour. As a ball park estimate, the whole of # Europe should take about 36 hours of computing. # # On an Intel Core i7-2640M processor running at 2.80Ghz it's under a minute per fit. # This is a 64 bit system running Windows 7 Professional. Austria would take just # over 30 minutes so about 18 hours for Europe. # NUTS3 tool 199418.4 seconds 55h 24m, say ~55.5h # ################################################################################## ### ### Load the function definition ### source("ImputeMissingData.R") ### ### Copy start data ### NUTSData.completed <- Data.estim nrow <- dim(Data.estim)[1] # number of regions NUTS <- rownames(Data.estim) # NUTS codes NUTSlevel <- nchar(NUTS)-2 # NUTS lebels total.time <- 0 startrow <- 1 # start at 1 ... endrow <- nrow # ... and finish at nrow for (i in startrow:endrow) { #if(NUTSlevel[i] < 3 & substr(NUTS[i],1,2) != "UK") { if(substr(NUTS[i],1,2) != "UK") { # UK separately if (complete.cases(Data.estim[i,4:25])) { # data complete? cat(NUTS[i],"is complete\n") # ... yes: ignore } else { cat(NUTS[i],"is undergoing imputation\n") # ... no: imputation needed Z <- as.numeric(Data.estim[i,4:25]) # data series Years <- 1990:2011 # time frame for series N <- length(Z) # length of the series missingYears <- which(is.na(Z)) # missing observations NM <- length(missingYears) # number of missing obs Scale <- 1000 # scale for populations tic <- proc.time()[3] fitted.model <- ImputeMissingData(Z,Years, missingYears, Scale) # imputation toc <- proc.time()[3] - tic total.time <- total.time + toc cat("Elapsed time: ",toc,"seconds\n") # elapsed time report imputedValues <- summary(fitted.model)$statistics[(5+(1:NM)),1] # extract imputations NUTSData.completed[i,missingYears+3] <- round(imputedValues*Scale) # update data vector } } else { cat(NUTS[i],"is NUTS level",NUTSlevel[i],"\n") } } cat("Completed. Total time required was ",total.time," seconds\n") ### ### Write results to a file - this step takes 60 hours! ### write.csv(NUTSData.completed,"NUTSData_completedN3.csv") # store the results for later ######################### End of MCMC estimation ############################################# ############################################################################################## ################################################################################### ### TASK 6A: Timings for MCMC estimation ### Use Netherlands as an example to get scalbility graphc ### *** Only for report illustrations *** ################################################################################### # Estimation loop one iteration is about 1.7 minutes for 12 missing from 22 on # a Intel Xeon X5460 quad core 3.16GHz processor running Windows XP Professional # # 35 zones for Austria takes about an hour. As a ball park estimate, the whole of # Europe should take about 36 hours of computing. # # On an Intel Core i7-2640M processor running at 2.80Ghz it's under a minute per fit. # This is a 64 bit system running Windows 7 Professional. Austria would take just # over 30 minutes so about 18 hours for Europe. # NUTS3 tool 199418.4 seconds 55h 24m, say ~55.5h # source("ImputeMissingData.R") #nrow <- dim(Data.estim)[1] #NUTS <- rownames(Data.estim) #NUTSlevel <- nchar(NUTS)-2 TestData <- as.numeric(Data.estim["NL",4:25]) timeTaken <- rep(0,20) for (i in 10:10) { Z <- TestData # data series Z[1:i] <- NA # add NAs Years <- 1990:2011 # time frame for series N <- length(Z) # length of the series missingYears <- which(is.na(Z)) # missing observations NM <- length(missingYears) # number of missing obs Scale <- 1000 # scale for populations tic <- proc.time()[3] fitted.model <- ImputeMissingData(Z,Years, missingYears, Scale, 250000, 100000, chains=2) # imputation toc <- proc.time()[3] - tic timeTaken[i] <- toc cat("Elapsed time: ",toc,"seconds with",i,"missing values\n") # elapsed time report } cat("Completed. Total time required was ",sum(timeTaken)," seconds\n") timeTaken gelman.diag(fitted.model) # convergence diagnostics gelman.plot(fitted.model) # convergence plot #timeTaken2 <- timeTaken * (290/171.52) # laptop relative to office desktop timeTaken2 <- timeTaken proportionMissing <- 100 * (1:20) / 22 plot(proportionMissing,timeTaken2,ylim=c(0,300), xlab="Proportion Missing (N=22)",ylab="Time (seconds)", main="Scalability of MCMC Estimation") ############################################################################################## ### TASK 7: Spatial Coherence Adjustment ### ############################################################################################## ### Data.estim : raw data from RIATE with only EUROSTAT/National estimates ### NUTSData.completed : unadjusted MCMC estimates NUTSData.adjusted <- NUTSData.completed # Copy the MCMC estimates NUTSData.RIATE <- Data.estim # Copy the NA pattern NUTSlevel <- nchar(rownames(NUTSData.RIATE)) - 2 # recreate just in case ############################################################################################## ### TASK 7A: Manual adjusment for Portugal (PT 1992:1995 missing) ### Data.estim["PT",c("p1992","p1993","p1994","p1995")] is NA ### should be sum of c("PT1","PT2","PT3") ### "PT1" is c("PT11","PT15","PT16","PT17","PT18") NUTSData.RIATE[substr(rownames(NUTSData.RIATE),1,2) == "PT" & NUTSlevel <= 2,] # for a look ### Fix PT1 for (year in c("p1992","p1993","p1994","p1995")) { NUTSData.adjusted["PT1",year] <- sum(NUTSData.adjusted[c("PT11","PT15","PT16","PT17","PT18"),year]) NUTSData.RIATE["PT1",year] <- NUTSData.adjusted["PT1",year] # update the NA pattern } ### Fix PT for (year in c("p1992","p1993","p1994","p1995")) { NUTSData.adjusted["PT",year] <- sum(NUTSData.adjusted[c("PT1","PT2","PT3"),year]) NUTSData.RIATE["PT",year] <- NUTSData.adjusted["PT",year] # update the NA pattern } NUTSData.adjusted[substr(rownames(NUTSData.adjusted),1,2) == "PT" & NUTSlevel <= 2,] # ... and check NUTSData.RIATE[substr(rownames(NUTSData.RIATE),1,2) == "PT" & NUTSlevel <= 2,] # ... and check ########################################################################################## # TASK 7B: Apply spatial constraint for NUTSlevel nodes below NUTSlevel-1 ########################################################################################## # # parent: vector of parent node codes # basic: matrix of data with NAs # unadjusted: matrix of totals from MCMC operation # dataCols: columns in which there are data # # adjusted: output: adjusted NUTSlevel node populations ConstrainTotals2 <- function(basic,unadjusted,dataCols) { NAPattern <- basic # working copy MCMCestimated <- unadjusted # working copy MCMCadjusted <- unadjusted # eventual output from function ### tree strcuture nodes <- rownames(NAPattern) # tree nodes parent <- substr(rownames(NAPattern),1,nchar(rownames(NAPattern))-1) # backtrack to link children NUTSlevel <- nchar(rownames(NAPattern))-2 # NUTS level parent[which(NUTSlevel == 0)] <- "" # clean up Ny <- length(dataCols) # year span Years <- rep(0,Ny+3) # create Year list Years[dataCols] <- 1990:2011 # # Start from the NUTS0 levels # for (Level in 0:2) { cat("\n\n") cat("***********************************************************\n") cat("* Cross-Sectional Time Series Constraint for NUTS Level",Level,"*\n") cat("***********************************************************\n") Levelnodes <- nodes[which(NUTSlevel == Level)] NL <- length(Levelnodes) for (i in 1:NL) { children <- nodes[parent == Levelnodes[i]] # NUTS1 children of the root cat("NUTS",(Level+1), "children of", Levelnodes[i], "are", children,"\n") Nc <- length(children) # Number of children for (year in dataCols){ # loop over the years any.missing <- which(is.na(NAPattern[children,year])) # which children have NA if (length(any.missing) == 0) { # ... none missing #cat("** NUTS",(Level+1),"data for",Levelnodes[i],"children are present in",Years[year],"\n") } else { # ... missing, so adjust valueParent <- MCMCestimated[Levelnodes[i],year] # parent contraint value childOK <- children # index of children if (!is.na(valueParent)) { # parent value not missing missingChild <- is.na(NAPattern[children,year]) # missing children if (sum(missingChild) < Nc) { ChildOK <- !missingChild # ... nonmissing children alreadyOK <- sum(MCMCestimated[children[ChildOK],year]) # ... total for nonmissing valueParent <- valueParent - alreadyOK # reduce constraint value } sumChildren <- sum(MCMCestimated[children[missingChild],year]) # sum of MCMC estimates MCMCadjusted[children[missingChild],year] <- (valueParent/sumChildren) * MCMCadjusted[children[missingChild],year] # adjusted MCMC estimates cat("** NUTS",(Level+1),"data for",Levelnodes[i],"children constrained in",Years[year],"\n") } else { #cat("** WARNING: missing data for",Levelnodes[i],"Constraint not possible in",Years[year],"\n") } } } } } MCMCadjusted } ### ### Undertake adjustment, and report time ### tic <- proc.time() NUTSData.final <- ConstrainTotals2(NUTSData.RIATE,NUTSData.completed,4:25) #### FINAL OUTPUT ### toc <- proc.time()-tic cat("\nTime Required for adjustment was ",toc,"seconds\n\n") ##################################################################################################### ##################################################################################################### ##################################################################################################### CheckConstraint <- function(parents,basic,dataCols) { NUTSCodes <- rownames(basic) # get the NUTD codes of the parents Np <- length(parents) # how many NUTS2 regions for (parent in 1:Np) { # loop over the NUTS regions NUTS3.units <- (nchar(NUTSCodes) == 5) & (substr(NUTSCodes,1,4) == parents[parent]) NUTS2.basic <- basic[parents[parent],] # just the data for this parent NUTS3.basic <- basic[NUTS3.units,] NUTS3.this.node <- NUTSCodes[NUTS3.units] # NUTS codes for this parent #cat(parents[parent],": ",NUTS3.this.node,"\n") N3 <- length(NUTS3.this.node) for (i in dataCols) { Nmiss <- length(which(is.na(NUTS3.basic[,i]))) NUTS2.basic[i] <- round(100 * Nmiss / N3) } cat(parents[parent],":", unlist(NUTS2.basic[,dataCols]),"\n") } } CheckCountry <- function(Country,Data.estim) { CountryData <- Data.estim[substr(rownames(Data.estim),1,2) == Country,] PlotOrder <- dim(CountryData)[1]:1 # reverse order for the heatmap parent <- substr(rownames(CountryData),1,nchar(rownames(CountryData))-1) parent[1] <- "" NUTS <- rownames(CountryData) NUTS0 <- which(nchar(rownames(CountryData)) == 2) NUTS1 <- which(nchar(rownames(CountryData)) == 3) NUTS2 <- which(nchar(rownames(CountryData)) == 4) NUTS3 <- which(nchar(rownames(CountryData)) == 5) CheckConstraint(rownames(CountryData)[NUTS2],CountryData,4:25) } CheckCountry("AT",Data.estim) CheckCountry("SI",Data.estim) CheckCountry("UK",Data.estim) #> levels(as.factor(substr(rownames(Data.estim),1,2))) # "AT" "BE" "BG" "CH" "CY" "CZ" "DE" "DK" "EE" "ES" "FI" "FR" "GR" "HR" "HU" # "IE" "IS" "IT" "LI" "LT" "LU" "LV" "MK" "MT" "NL" "NO" "PL" "PT" "RO" "SE" # "SI" "SK" "TR" "UK" CountryList <- levels(as.factor(substr(rownames(Data.estim),1,2))) Nc <- length(CountryList) #Nc <- 5 for (i in 1:Nc) { CheckCountry(CountryList[i],Data.estim) } CheckTree <- function(Country,Data.estim) { CountryData <- Data.estim[substr(rownames(Data.estim),1,2) == Country,] nodes <- rownames(CountryData) parent <- substr(rownames(CountryData),1,nchar(rownames(CountryData))-1) parent[1] <- "" ### visit.node ### for each child(visit.node) ROOT <- which(nodes == Country) cat("\nTree Walk\n") root <- nodes[ROOT] cat("NUTS0 node is ",root,"\n") children1 <- nodes[parent == root] cat(" NUTS1 children of", root, "are", children1,"\n") Nc1 <- length(children1) for (i in 1:Nc1) { node2 <- children1[i] children2 <- nodes[parent == node2] cat(" NUTS2 children of", node2, "are", children2,"\n") Nc2 <- length(children2) for (j in 1:Nc2) { node3 <- children2[j] children3 <- nodes[parent == node3] cat(" NUTS3 children of", node3, "are", children3,"\n") } } } CheckTree("AT",Data.estim) CheckTree("UK",Data.estim) ################################################################################ ################################################################################ ################################################################################ ### Check for singleton missings CheckMissingPattern <- function(Country,Data,dataCols,verbose=FALSE) { CountryData <- Data[substr(rownames(Data),1,2) == Country,] IDInfo <- CountryData[,1:3] Nc <- dim(IDInfo)[1] Results <- data.frame(IDInfo[,1:3],matrix(0,Nc,8)) rownames(Results) <- rownames(CountryData) nodes <- rownames(CountryData) parent <- substr(rownames(CountryData),1,nchar(rownames(CountryData))-1) parent[1] <- "" Nd <- length(dataCols) ### visit.node ### for each child(visit.node) ROOT <- which(nodes == Country) cat("\nTree Walk\n") root <- nodes[ROOT] if (verbose) cat("NUTS0 node is ",root,"\n") children1 <- nodes[parent == root] if (verbose) cat(" NUTS1 children of", root, "are", children1,"\n") Nc1 <- length(children1) CasePattern <- CheckNode(root,children1,Data,dataCols) #cat("[",Nc1,"] children", CasePattern,"\n") Results[root,4:11] <- CasePattern for (i in 1:Nc1) { node2 <- children1[i] children2 <- nodes[parent == node2] if (verbose) cat(" NUTS2 children of", node2, "are", children2,"\n") Nc2 <- length(children2) CasePattern <- CheckNode(node2,children2,Data,dataCols) #cat("[",Nc2,"] children", CasePattern,"\n") Results[node2,4:11] <- CasePattern for (j in 1:Nc2) { node3 <- children2[j] children3 <- nodes[parent == node3] if (verbose) cat(" NUTS3 children of", node3, "are", children3,"\n") Nc3 <- length(children3) CasePattern <- CheckNode(node3,children3,Data,dataCols) Results[node3,4:11] <- CasePattern } } NUTSlevels <- lapply(Results[,2], as.character) print(Results[NUTSlevels <= "NUTS2",]) } # # # CheckNode <- function(parent,children,Data,dataCols){ Nc <- length(children) # How many children for this parent MissingParent <- Data[parent,] # Copy parent record (we'll use data cols) Result <- Data[parent,] # Create result record Cases <- rep(0,8) # 8 possible outcomes for (i in dataCols) { # loop over time period for this parent MissingParent[i] <- is.na(Data[parent,i]) # Is parent missing for this year Result[i] <- length(which(is.na(Data[children,i]))) # Number of children missing for this year if (!MissingParent[i]) { if (Result[i] == 0) k <- 1 # Parent present, no children missing - nothing to do! else if (Result[i] == 1) k <- 3 # Parent present, 1 child missing (embarassingly estimatable) else if (Result[i] > 1 & Result[i] < Nc) k <- 4 # Parent present, some children missing - estimate/constrain else k <- 5 # Present present, all children missing - estimate/constrain } else { if (Result[i] == 0) k <- 2 # Parent missing, no children missing (estimate parent!) else if (Result[i] == 1) k <- 6 # Parent missing, 1 child missing - awkward else if (Result[i] > 1 & Result[i] < Nc) k <- 7 # Parent missing, some children missing - awkward else k <- 8 # Parent missing, all children missing - awkward } Cases[k] <- Cases[k] + 1 # update counter } Cases } # Return counter vector CheckMissingPattern("AT",Data.estim,4:25) #> levels(as.factor(substr(rownames(Data.estim),1,2))) # "AT" "BE" "BG" "CH" "CY" "CZ" "DE" "DK" "EE" "ES" "FI" "FR" "GR" "HR" "HU" # "IE" "IS" "IT" "LI" "LT" "LU" "LV" "MK" "MT" "NL" "NO" "PL" "PT" "RO" "SE" # "SI" "SK" "TR" "UK" CountryList <- levels(as.factor(substr(rownames(Data.estim),1,2))) Nc <- length(CountryList) for (i in 1:Nc) { CheckMissingPattern(CountryList[i],Data.estim, 4:25) } ################################################################################# ### TASK X Austria ##################################################### ################################################################################# nodes <- rownames(Data.estim) NUTSlevel <- nchar(nodes) - 2 parents <- substr(nodes,1,(NUTSlevel+1)) AT21children <- nodes[which(parents == "AT21")] Names <- as.character(Data.estim[AT21children,3]) AT21MCMCout <- as.matrix(Austria.final[AT21children,4:25]) # MCMC outputs AT21totals <- colSums(AT21MCMCout) # AT21 totals from MCMC AT21factor <- as.numeric(Data.estim["AT21",4:25]) / AT21totals # adjustment factors AT21adjusted <- round(sweep(AT21MCMCout,2,AT21factor,"*"),0) # sweep down rows colSums(AT21adjusted) - Data.estim["AT21",4:25] # and check par(mfrow=c(3,1)) for (i in 1:length(AT21children)) { NUTS3zone <- AT21children[i] mindata <- min(AT21MCMCout[i,],AT21adjusted[NUTS3zone,]) * 0.95 maxdata <- max(AT21MCMCout[i,],AT21adjusted[NUTS3zone,]) * 1.05 plot(1990:2011,AT21MCMCout[i,],col="darkgrey",type="l",xlab="Year",ylab=NA, ylim=c(mindata,maxdata), main=paste(NUTS3zone,Names[i]),lty=2) lines(1990:2011,AT21adjusted[NUTS3zone,],col="red",lty=1) lines(1990:2011,Data.estim[NUTS3zone,4:25],col="darkgrey") points(1990:2011,Data.estim[NUTS3zone,4:25],pch=16,col="darkgrey") } par(mfrow=c(1,1)) ################################################################################# ### MONTE CARLO VARIABILITY ##################################################### ################################################################################# ### Random normal variates par(mfrow=c(3,3)) for (i in c(100,500,1000,5000,10000,50000,100000,500000,1000000)) { hist(rnorm(i,0,1),breaks=100,xlim=c(-4,4),col="grey",border=NA,main=paste("N=",i,sep=""),xlab=NA,ylab=NA) } par(mfrow=c(1,1)) ### Random gamma par(mfrow=c(3,3)) for (i in c(100,500,1000,5000,10000,50000,100000,500000,1000000)) { hist(rgamma(i,50),breaks=100,xlim=c(20,80),col="grey",border=NA,main=paste("N=",i,sep=""),xlab=NA,ylab=NA) } par(mfrow=c(1,1)) ### random beta par(mfrow=c(3,3)) for (i in c(100,500,1000,5000,10000,50000,100000,500000,1000000)) { hist(rbeta(i,1,1),breaks=100,xlim=c(0,1),col="grey",border=NA,main=paste("N=",i,sep=""),xlab=NA,ylab=NA) } par(mfrow=c(1,1)) ### mtulivariate normal Sigma <- matrix(c(10,3,3,2),2,2) Sigma par(mfrow=c(3,3)) for (i in c(100,300,750, 1000,3000,7500, 10000,30000,75000)) { x <- mvrnorm(n=i, c(0,0), Sigma) plot(x ,col="grey",pch=16, xlim=c(-15,15),ylim=c(-6,6), main=paste("N=",i,sep=""),xlab=NA,ylab=NA) lines(lowess(x),col="red") } par(mfrow=c(1,1)) ### changing precision par(mfrow=c(3,3)) tau <- c(0.25,1,4, 16,64,256, 1024, 4096, 16384) for (i in 1:9) { hist(rnorm(10000,10,1/sqrt(tau[i])),breaks=100,xlim=c(6,14),col="red",border="red",main=paste("Tau=",tau[i],sep=""),xlab=NA,ylab=NA) } par(mfrow=c(1,1)) ### non-informative prior par(mfrow=c(3,3)) tau <- 4^(-(0:8)) for (i in 1:9) { hist(rnorm(10000,0,1/sqrt(tau[i])),breaks=100,xlim=c(-20,20),col="red",border="red",main=paste("Tau=",format(tau[i],digits=3),sep=""),xlab=NA,ylab=NA) } par(mfrow=c(1,1)) ################################################################################ ################################################################################ # # ESPON M4D Multidimensional Database Design and Development # # Time Series Estimation # # Missing Data Imputation Function # # ### Authors: # # Martin Charlton and Chris Brunsdon # ### Address: # # National Centre for Geocomputation # National University of Ireland, Maynooth # Maynooth, Co Kildare, IRELAND # # V1.01: June 2014 # #### Copyright # # Code is made avilable under the GNU GENERAL PUBLIC LICENSE, Version 3 # Text of the License is at http://www.gnu.org/licenses/gpl.txt # ################################################################################ ################################################################################ # # Fit estimate missing values in a time series using MCMC with a quadratic # trend model # # Input: # pop: series with Missing data (represented by NA) # years: date range for input series # missing: indices of missing data # yscale: scaling for pop variable (to keep values reasonabley small) # burnin.iters: number of burn-in iterations (default 250000) # coda.iters: number of iterations for estimation (default 100000) # # Output: # fitted.model coda.samples rjags output object # ################################################################################ ################################################################################ ImputeMissingData <- function(pop,years,missing,yscale=1,burnin.iters=250000,coda.iters=100000,chains=1){ ### ### Create the regressands for the linear and quadratic trend terms ### gyear <- 1:length(years) # linear term gyear2 <- gyear^2 # quadratic term ### ### Load the correct JAGS code - different cdoe is required when there is ### only one NA. This is an R issue ### if (length(missing) == 1) { tfile = "tfile2.JAGS" # estimator for data with a single NA } else { tfile = "tfile1.JAGS" # estimator for data with multiple NAs } ### ### Initialisation ### ### present: indices of the non-missing data ### y: time series - the regressor ### N: number of terms in the series (both missing and present) ### ys: present data ### ym: missing data ### present <- setdiff(gyear,missing) # indices of non-missing data y <- pop/yscale # scale the y if necessary N <- length(y) # length of the series ys <- y[present] # present data values ym <- y[missing] # missing data ### ### Initialise the JAGS input ### popdata: list which is used to pass data to JAGS ### N: length of the time series ### ys: non-missing data values ### seen: indices of the non-missing data ### missing: indices of the missing data ### year: linear regressand ### yearsq: quadratic regressand ### popdata <- list(N=N, ys=ys, seen=present, missing=missing, year=gyear, yearsq=gyear2) ### ### Fit an OLS model to provide some plausible starting values for the coefficients ### xx <- coef(mdl <- lm(y~gyear+gyear2)) ### ### Load the model ### m: rjags object containign the model ### file: JAGS code for the MCMC model ### data: list of data and regressands for the model ### inits: initial values for the parameters ### .RNG.name: random number generator to be used ### .RNG.seed: random number generator seed (for reproducibility) ### n.chains: number of Markov chains to use ### if (chains == 1) { m <- jags.model(file=tfile, data=popdata, inits=list(b0=xx[1],b1=xx[2],b2=xx[3], .RNG.name='base::Mersenne-Twister',.RNG.seed=290162), n.chains=1) } else { m <- jags.model(file=tfile, data=popdata, inits=list(b0=xx[1],b1=xx[2],b2=xx[3]), n.chains=chains) } ### ### Initial burn-in - these results are discarded. Default is 250000 iterations ### update(m, n.iter=burnin.iters) ### ### Fit the model - output is and mcmc.list object which is returned from the function ### coda.samples(m, n.iter=coda.iters, thin=5, variable.names = c("ym","b0","b1","b2","rho","s2err")) } ################################################################################ ################################################################################ # # ESPON M4D Multidimensional Database Design and Development # # Time Series Estimation # # MCMC estimation of time series with missing data [tfile1: several NA values] # # ### Authors: # # Martin Charlton and Chris Brunsdon # ### Address: # # National Centre for Geocomputation # National University of Ireland, Maynooth # Maynooth, Co Kildare, IRELAND # # V1.01: June 2014 # #### Copyright # # Code is made avilable under the GNU GENERAL PUBLIC LICENSE, Version 3 # Text of the License is at http://www.gnu.org/licenses/gpl.txt # ################################################################################ ################################################################################ model { # Prior distributions b0 ~ dnorm(0.0, 0.000001) # intercept b1 ~ dnorm(0.0, 0.000001) # linear coefficient on time b2 ~ dnorm(0.0, 0.000001) # quadratic time term s2err ~ dgamma(1,50) #rho ~ dbeta(0.5, 0.5) rho ~ dbeta(10,10) # AR(1) parameter # Trend component for(i in 1:N) { mu[i] <- b0 + b1*year[i] +b2*yearsq[i] } # Autocorrelation component for(i in 1:N) { for (j in 1:N) { tdmat[i,j] <- s2err*rho^abs(i-j) }} for (i in 1:length(seen)) { for (j in 1:length(seen)) { tdmat22[i,j] <- tdmat[seen[i],seen[j]] }} itdmat22 <- inverse(tdmat22) for (i in 1:length(missing)) { for (j in 1:length(missing)) { tdmat11[i,j] <- tdmat[missing[i],missing[j]] }} itdmat11 <- inverse(tdmat11) for (i in 1:length(missing)) { for (j in 1:length(seen)) { tdmat12[i,j] <- tdmat[missing[i],seen[j]] }} for (i in 1:length(seen)) { for (j in 1:length(missing)) { tdmat21[i,j] <- tdmat[seen[i],missing[j]] }} for (i in 1:length(seen)) { mu2[i] <- mu[seen[i]] } for (i in 1:length(missing)) { mu1[i] <- mu[missing[i]] } OmegaM <- inverse(tdmat11 - tdmat12 %*% itdmat22 %*% tdmat21) muM <- mu1 + tdmat12 %*% itdmat22 %*% (ys - mu2) ys ~ dmnorm(mu2,itdmat22) # non-missing data ym ~ dmnorm(muM, OmegaM) # missing data } Appendix 5: JAGS code for MCMC (single NA) ################################################################################ ################################################################################ # # ESPON M4D Multidimensional Database Design and Development # # Time Series Estimation # # MCMC estimation of time series with missing data [tfile2: single NA value] # # ### Authors: # # Martin Charlton and Chris Brunsdon # ### Address: # # National Centre for Geocomputation # National University of Ireland, Maynooth # Maynooth, Co Kildare, IRELAND # # V1.01: June 2014 # #### Copyright # # Code is made avilable under the GNU GENERAL PUBLIC LICENSE, Version 3 # Text of the License is at http://www.gnu.org/licenses/gpl.txt # ################################################################################ ################################################################################ model { # Prior distributions b0 ~ dnorm(0.0, 0.000001) b1 ~ dnorm(0.0, 0.000001) b2 ~ dnorm(0.0, 0.000001) s2err ~ dgamma(1,50) #rho ~ dbeta(0.5, 0.5) rho ~ dbeta(10,10) # Trend component for(i in 1:N) { mu[i] <- b0 + b1*year[i] +b2*yearsq[i] # mu[i] <- b0 + b1*i } # Autocorrelation component for(i in 1:N) { for (j in 1:N) { tdmat[i,j] <- s2err*rho^abs(i-j) } } for (i in 1:length(seen)) { for (j in 1:length(seen)) { tdmat22[i,j] <- tdmat[seen[i],seen[j]] }} itdmat22 <- inverse(tdmat22) tdmat11 <- tdmat[missing,missing] itdmat11 <- 1.0/tdmat11 for (j in 1:length(seen)) { tdmat12[j] <- tdmat[missing,seen[j]] } for (i in 1:length(seen)) { tdmat21[i] <- tdmat[seen[i],missing] } for (i in 1:length(seen)) { mu2[i] <- mu[seen[i]] } for (i in 1:length(seen)) { for (j in 1:length(seen)) { oprod12[i,j] <- tdmat12[i] * tdmat21[j] } } mu1 <- mu[missing] OmegaM <- 1.0/(tdmat11 - inprod(tdmat12,itdmat22 %*% tdmat21)) muM <- mu1 + inprod(tdmat12, itdmat22 %*% (ys - mu2)) ys ~ dmnorm(mu2,itdmat22) ym ~ dnorm(muM, OmegaM)