# ------------------------------------------------------------------- # - NAME: bevoelkerung-tirol.R # - AUTHOR: Reto Stauffer # - DATE: 2019-10-01 # ------------------------------------------------------------------- # - DESCRIPTION: Download bevoelkerungsstatistik Tirol on Bezirks # level from tirol.gv.at and prepare the demo data # set for discdown.org rprogramming. # # Requires: # - xml2: parse website, find files to download # - gdata: read xls/xlsx files # - dplyr: combine data # ------------------------------------------------------------------- # - EDITORIAL: 2019-10-01, RS: Created file on thinkreto. # ------------------------------------------------------------------- # - L@ST MODIFIED: 2019-10-01 09:30 on marvin # ------------------------------------------------------------------- # Do not run if output file exists if (file.exists("bevoelkerung-tirol.csv")) stop("Output file \"bevoelkerung-tirol.csv\" exists, stop (do not re-run)") library("gdata") library("xml2") library("dplyr") # ------------------------------------------------------------------- # Get hyperrefs from website to know what we have to download. # ------------------------------------------------------------------- tmp <- tempfile(fileext = ".html") download.file("https://www.tirol.gv.at/statistik-budget/statistik/wohnbevoelkerung/", tmp) doc <- read_html(tmp) hrefs <- xml_find_all(doc, "//*/a[contains(@href, '/fileadmin/themen/statistik-budget/statistik/downloads/')]") hrefs <- xml_attr(hrefs, "href") hrefs <- hrefs[grepl("bev_[0-9]{4}\\.xlsx$", hrefs)] hrefs_years <- regmatches(basename(hrefs), regexpr("[0-9]{4}(?=\\.xlsx$)", basename(hrefs), perl = TRUE)) hrefs <- setNames(sprintf("https://www.tirol.gv.at/%s", hrefs), hrefs_years) # ------------------------------------------------------------------- # Reading data # ------------------------------------------------------------------- data <- list() for (n in names(hrefs)) { year <- as.integer(n) # Download file into temporary file tmp <- tempfile() download.file(hrefs[[n]], tmp) # Read data from XLS file x <- gdata::read.xls(tmp, sheet = "Bezirke", verbose = TRUE, skip = 1, dec = ",", header = T, encoding = "latin1") x <- subset(x, Bezirk != "Tirol") # Append year x <- cbind(data.frame(Jahr = rep(year, nrow(x))), x) names(x) <- gsub("\\.", "", names(x)) names(x) <- gsub("\\.1$", "", names(x)) # Replace special chars names(x) <- gsub("Ö", "Oe", names(x)) names(x) <- gsub("ü", "ue", names(x)) # Append results data[[as.character(year)]] <- x } # As the columns change: use dplyr to combine all. data <- as.data.frame(bind_rows(data)) hold <- data data <- data[, colSums(is.na(data)) < nrow(data)] # ------------------------------------------------------------------- # Create output file (csv with header) # ------------------------------------------------------------------- head <- "# -------------------------------------------- # Created: %s # Author: Reto Stauffer # Script: bevoelkerung-tirol.R # With %s # # Encoding: utf-8 # Quelle: Statistik Austria, Bevoelkerungsregister; # Innenministerium, Zentrales Melderegister # Note: Slowenien und Kroatien sind sowohl in der # Summe der EU-Staaten als auch in der Spalte. # --------------------------------------------" head <- sprintf(head, Sys.time(), R.version$version.string) # Write output file, csv writeLines(head, "bevoelkerung-tirol.csv") write.table(data, file = "bevoelkerung-tirol.csv", append = TRUE, row.names = FALSE, sep = ",", fileEncoding = "utf-8") # ------------------------------------------------------------------- # Example plot # ------------------------------------------------------------------- pdf(file = "bevoelkerung-tirol.pdf", width = 10, height = 8) ibk <- read.csv("bevoelkerung-tirol.csv", comment.char = "#") mein_bezirk = "Innsbruck-Stadt" ibk <- subset(ibk, Bezirk == mein_bezirk) cols <- which(!names(ibk) %in% c("Jahr", "Bezirk", "Gesamt")) ibk <- structure(as.matrix(ibk[, cols]), dimnames = list(ibk$Jahr, names(ibk)[cols])) barplot(t(ibk), main = paste("Bezirk", mein_bezirk, sep = " "), ylab = "inhabitants in thousands", xlab = "year", col = rev(gray.colors(ncol(ibk)))) legend("bottomleft", fill = rev(gray.colors(ncol(ibk))), legend = colnames(ibk), bg = "white") dev.off()