Literary Data: Some Approaches Andrew Goldstone http://www.rci.rutgers.edu/~ag978/litdata February 26, 2015. Functions, abstraction, modularity.
homework 5 ▶ Questions?
function_name <- function (param) { # anything... # at... # all... } function_name(x) functions ▶ map inputs to outputs ▶ possibly with side effects (as rarely as possible) ▶ formal parameter param is now bound to the argument x
function_name <- function (param) { # anything... # at... # all... } function_name(x) functions ▶ map inputs to outputs ▶ possibly with side effects (as rarely as possible) ▶ formal parameter param is now bound to the argument x
twice <- function (s) { str_c(s, s, sep=" ") } twice("ha") twice(str_c(letters, collapse=" "))
function_name <- function (param1, param2, ...) { ... } many_times <- function (s, n) { result <- s for (j in 2:n) { result <- str_c(result, s, sep=" ") } result } many_times("ha", 5) [1] "ha ha ha ha ha"
many_times("ha", n=5) [1] "ha ha ha ha ha" many_times(n=5, s="ha") # !! [1] "ha ha ha ha ha" binding named parameters
many_times("no", 2) [1] "no no" result Error in eval(expr, envir, enclos): object 'result' not found scope
rep("O", 4) [1] "O" "O" "O" "O" for, begone ▶ use rep to rewrite many_times without for
many_times <- function (s, n) { str_c(rep(s, n), collapse=" ") }
# get the body text three_weeks <- readLines("three-weeks-gutenberg.txt") metadata_start <- match aararrgggh not again abstraction
gutenberg_body <- function (ll, start_pat, end_pat) { start <- grep(start_pat, ll) start <- start[1] end <- grep(end_pat, ll) end <- end[1] ll[start:end] } three_weeks_body <- gutenberg_body(three_weeks, "^CHAPTER", "^THE END")
Homework… three_weeks_words <- tolower unlist strsp aarrrgh featurize <- function (ll) { # old familiar friends }
three_weeks_words <- tolower unlist strsp aarrrgh featurize <- function (ll) { # old familiar friends } Homework…
three_weeks_body[1] [1] "CHAPTER I" three_week_printer <- function (separator) { three_weeks_body <- c("Week 1", "Week 2", "Week 3") str_c(three_weeks_body, collapse=separator) } three_week_printer("...") # but what if...? [1] "Week 1...Week 2...Week 3" three_weeks_body[1] [1] "CHAPTER I" global and local
smoosh <- function (words1, words2) { helper <- function (ws) { str_c(ws, collapse=" ") } str_c(helper(words1), helper(words2), sep=" + ") } smoosh(c("uh", "huh"), c("that's", "the")) [1] "uh huh + that's the" helper(c("uh", "huh")) Error in eval(expr, envir, enclos): could not find function "helper" scope: even more
x <- 10 f <- function (y) { x + y } f(5) [1] 15 x <- 100 f(5) [1] 105 closure
first_few <- function (ll) { if (length(ll) < 4) { return(ll) } ll[1:4] } first_few(1:2) [1] 1 2 first_few(1:200) [1] 1 2 3 4 early escape
recursion Algorithm: QuickSort. 1. Choose the first element as “pivot.” 2. Partition the vector into two pieces by comparing to the pivot. 3. QuickSort the two pieces.
qsort <- function (xs) { if (length(xs) <= 1) { return(xs) } p <- xs[1] rest <- xs[-1] left <- rest[rest <= p] right <- rest[rest > p] c(qsort(left), p, qsort(right)) } qsort(c(4, 2, 3, 5, 1)) [1] 1 2 3 4 5
extract_years <- function (pubdates) { gsub("^\\D*(\\d{4}).*$", "\\1", pubdates) } ecco <- read.csv("ecco-headers.csv", as.is=T, encoding="UTF-8") ecco_years <- extract_years(ecco$pubdate) eebo <- read.csv("eebo-headers.csv", as.is=T, encoding="UTF-8") eebo_years <- extract_years(eebo$pubdate) all(grepl("^\\d{4}$", eebo_years)) [1] FALSE reuse and refine reuse
extract_years <- function (pubdates) { result <- gsub("^\\D*(\\d{4}).*$", "\\1", pubdates) missing_year <- grep("^\\d{4}$", result, invert=T) result[missing_year] <- NA result } eebo_years <- extract_years(eebo$pubdate) sum(is.na(eebo_years)) [1] 540 eebo_years <- eebo_years[!is.na(eebo_years)] # bye ecco_years <- extract_years(ecco$pubdate) sum(is.na(ecco_years)) # cool [1] 0 rats!
eebo_years_table <- table(eebo_years) eebo_sorted <- names sort descending wait top_hits <- function (xs, n=10) { sorted <- sort(table(xs), decreasing=T) names(sorted)[1:n] } top_hits(eebo_years) [1] "1660" "1642" "1641" "1659" "1689" "1680" "1681" [8] "1688" "1685" "1682"
years_decades <- function (years) { gsub("\\d$", "0s", years) } top_hits(years_decades(eebo_years)) [1] "1680s" "1640s" "1690s" "1650s" "1660s" "1670s" [7] "1630s" "1620s" "1600s" "1610s" top_hits(years_decades(ecco_years)) [1] "1790s" "1780s" "1770s" "1760s" "1750s" "1710s" [7] "1740s" "1730s" "1700s" "1720s" modularity
top_hits(featurize(three_weeks_body)) # reuse! [1] "the" "and" "of" "to" "he" "a" "his" "was" [9] "in" "her"
three_weeks_body %>% featurize %>% top_hits eebo_years %>% years_decades %>% top_hits function composition is chaining
Recommend
More recommend