Loading necessary package(s) and reading data
library(tidyverse)
library(readr)
sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
Using chaining to select conditional columns, and edit the column name of ‘Area_name’
var1 <- sheet1 %>%
select(Area_name, STCOU, ends_with("D")) %>%
rename("area_name" = Area_name)
var1
## # A tibble: 3,198 × 12
## area_…¹ STCOU EDU01…² EDU01…³ EDU01…⁴ EDU01…⁵ EDU01…⁶ EDU01…⁷ EDU01…⁸ EDU01…⁹
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 UNITED… 00000 4.00e7 4.00e7 4.03e7 4.07e7 4.14e7 4.21e7 4.27e7 4.34e7
## 2 ALABAMA 01000 7.34e5 7.28e5 7.30e5 7.28e5 7.26e5 7.26e5 7.28e5 7.31e5
## 3 Autaug… 01001 6.83e3 6.9 e3 6.92e3 6.85e3 7.01e3 7.14e3 7.15e3 7.38e3
## 4 Baldwi… 01003 1.64e4 1.65e4 1.68e4 1.71e4 1.75e4 1.80e4 1.87e4 1.94e4
## 5 Barbou… 01005 5.07e3 5.10e3 5.07e3 5.16e3 5.17e3 5.25e3 5.14e3 5.11e3
## 6 Bibb, … 01007 3.56e3 3.51e3 3.57e3 3.62e3 3.65e3 3.56e3 3.56e3 3.55e3
## 7 Blount… 01009 7.32e3 7.22e3 7.20e3 7.21e3 7.16e3 7.15e3 7.21e3 7.3 e3
## 8 Bulloc… 01011 2.01e3 1.98e3 1.98e3 1.98e3 1.98e3 2.02e3 2.01e3 2.01e3
## 9 Butler… 01013 4.64e3 4.58e3 4.61e3 4.59e3 4.54e3 4.48e3 4.44e3 4.35e3
## 10 Calhou… 01015 2.09e4 2.09e4 2.09e4 2.08e4 2.08e4 2.05e4 2.05e4 2.01e4
## # … with 3,188 more rows, 2 more variables: EDU010195D <dbl>, EDU010196D <dbl>,
## # and abbreviated variable names ¹area_name, ²EDU010187D, ³EDU010188D,
## # ⁴EDU010189D, ⁵EDU010190D, ⁶EDU010191D, ⁷EDU010192D, ⁸EDU010193D,
## # ⁹EDU010194D
Since we are interested in pivoting on enrollment values, applying pivot longer from columns 3 to 12, and adding the data with two new columns -> ‘enroll_details’ (having the data from original DF’s columns names), and ‘value’ (having data for corresponding enrollment values from original data frame)
wideVar1 <- var1 %>%
pivot_longer(cols = 3:12, names_to = "enroll_details", values_to = "value")
wideVar1
## # A tibble: 31,980 × 4
## area_name STCOU enroll_details value
## <chr> <chr> <chr> <dbl>
## 1 UNITED STATES 00000 EDU010187D 40024299
## 2 UNITED STATES 00000 EDU010188D 39967624
## 3 UNITED STATES 00000 EDU010189D 40317775
## 4 UNITED STATES 00000 EDU010190D 40737600
## 5 UNITED STATES 00000 EDU010191D 41385442
## 6 UNITED STATES 00000 EDU010192D 42088151
## 7 UNITED STATES 00000 EDU010193D 42724710
## 8 UNITED STATES 00000 EDU010194D 43369917
## 9 UNITED STATES 00000 EDU010195D 43993459
## 10 UNITED STATES 00000 EDU010196D 44715737
## # … with 31,970 more rows
Using the str_sub function to get substring from a string. Fetching the 3rd last and 2nd last elements from the column name, converting the value to numeric, and putting an if else condition based on that value to add to a new column ‘year’. Further, using the substring function to create a new column ‘measurement’ by extracting first characters from one of the columns.
updatedData <- wideVar1 %>%
mutate(year = as.numeric(if_else(as.numeric(str_sub(enroll_details, start = -3, end = -2)) > 22,
str_c("19", str_sub(enroll_details, start = -3, end = -2)),
str_c("20",str_sub(enroll_details, start = -3, end = -2)))))
updatedData <- updatedData %>%
mutate(measurement = str_sub(enroll_details, start = 1, end = 7))
updatedData
## # A tibble: 31,980 × 6
## area_name STCOU enroll_details value year measurement
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101
## # … with 31,970 more rows
nonCountyData <- updatedData %>%
filter(grepl(",", area_name, fixed=TRUE)==FALSE)
nonCountyData
## # A tibble: 530 × 6
## area_name STCOU enroll_details value year measurement
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101
## # … with 520 more rows
Checking if the area name has a “,” in it. If so, then it is a county data
countyData <- updatedData %>%
filter(grepl(",", area_name, fixed=TRUE)==TRUE)
countyData
## # A tibble: 31,450 × 6
## area_name STCOU enroll_details value year measurement
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101
## # … with 31,440 more rows
Adding “county” and “state” to respective objects class’ list
class(countyData) <- c("county", class(countyData))
class(nonCountyData) <- c("state", class(nonCountyData))
Extracting last two characters of the area name string from county data for abbreviated state name
countyData <- countyData %>%
mutate(state = str_sub(area_name, start = -2, end = -1))
countyData
## # A tibble: 31,450 × 7
## area_name STCOU enroll_details value year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 31,440 more rows
Creating lists of divisions, and for area name in non county data, checking if it belongs in any list, and creating division column with that division name accordingly
div1 <- c("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
div2 <- c("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
div3 <- c("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
div4 <- c("IOWA","KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
div5 <- c("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "DISTRICT OF COLUMBIA", "District of Columbia", "WEST VIRGINIA")
div6 <- c("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
div7 <- c("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
div8 <- c("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
div9 <- c("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")
nonCountyData <- nonCountyData %>%
mutate(division = if_else(area_name %in% div1, "New England",
if_else(area_name %in% div2, "Mid-Atlantic",
if_else(area_name %in% div3, "East North Central",
if_else(area_name %in% div4, "West North Central",
if_else(area_name %in% div5, "South Atlantic",
if_else(area_name %in% div6, "East South Central",
if_else(area_name %in% div7, "West South Central",
if_else(area_name %in% div8, "Mountain",
if_else(area_name %in% div9, "Pacific", "ERROR"))))))))))
nonCountyData
## # A tibble: 530 × 7
## area_name STCOU enroll_details value year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 520 more rows
Function to read the csv file and return a tibble
readCSVAndReturnTibble <- function(url){
sheet_ <- read_csv(url)
return(sheet_)
}
Function for step 1 and 2 which filters the original data frame, renames area name column, and pivots the data. Adding an optional argument in the function which will take a default value as “values” when no argument is passed when calling a function
stepOneAndTwo <- function(sheet, colName = "values"){
var2 <- sheet %>%
select(Area_name, STCOU, ends_with("D")) %>%
rename("area_name" = Area_name) %>%
pivot_longer(cols = 3:12, names_to = "enroll_details", values_to = colName)
return(var2)
}
Function to do step 3, which takes input from step 1 and 2’s function’s output. This function adds a column for year by extracting the 3rd last and 2nd last characters of the enroll_details column, and checks for correct year assigning criteria. It also adds a column ‘measurement’ by extracting first 7 characters from existing rnroll_details column
stepThree <- function(input_ = stepOneAndTwo(sheet_, colName)){
retVar <- input_ %>%
mutate(year = as.numeric(if_else(as.numeric(str_sub(enroll_details, start = -3, end = -2)) > 22,
str_c("19", str_sub(enroll_details, start = -3, end = -2)),
str_c("20",str_sub(enroll_details, start = -3, end = -2)))))
retVar <- retVar %>%
mutate(measurement = str_sub(enroll_details, start = 1, end = 7))
return(retVar)
}
Function to extract state name after from area_name. Function also adds “county” to the object’s class list
getCountyData <- function(inputTibble_){
countyData <- inputTibble_ %>%
#filter(grepl(",", area_name, fixed=TRUE)==TRUE) %>%
mutate(state = str_sub(area_name, start = -2, end = -1))
class(countyData) <- c("county", class(countyData))
return(countyData)
}
Function to add division name after conditional area_name checks. Function also adds “state” to the object’s class list
getNonCountyData <- function(input_){
div1 <- c("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
div2 <- c("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
div3 <- c("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
div4 <- c("IOWA","KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
div5 <- c("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA",
"SOUTH CAROLINA", "VIRGINIA", "DISTRICT OF COLUMBIA", "WEST VIRGINIA")
div6 <- c("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
div7 <- c("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
div8 <- c("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
div9 <- c("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")
nonCountyData <- input_ %>%
#filter(grepl(",", area_name, fixed=TRUE)==FALSE) %>%
mutate(division = if_else(toupper(area_name) %in% div1, "New England",
if_else(toupper(area_name) %in% div2, "Mid-Atlantic",
if_else(toupper(area_name) %in% div3, "East North Central",
if_else(toupper(area_name) %in% div4, "West North Central",
if_else(toupper(area_name) %in% div5, "South Atlantic",
if_else(toupper(area_name) %in% div6, "East South Central",
if_else(toupper(area_name) %in% div7, "West South Central",
if_else(toupper(area_name) %in% div8, "Mountain",
if_else(toupper(area_name) %in% div9, "Pacific", "ERROR"))))))))))
class(nonCountyData) <- c("state", class(nonCountyData))
return(nonCountyData)
}
Function to segregate county data, and non county data, and pass it on to above two fucntions to add details about their corresponding state/division. This function returns a list of two tibbles, first element for county tibble, and second element for non county tibble.
stepFour <- function(input_ = stepThree(stepOneAndTwo(sheet_, colName))){
nonCountyData <- input_ %>%
filter(grepl(",", area_name, fixed=TRUE)==FALSE)
countyData <- input_ %>%
filter(grepl(",", area_name, fixed=TRUE)==TRUE)
countyTib <- getCountyData(countyData)
nonCountyTib <- getNonCountyData(nonCountyData)
return(list(County=countyTib, Non_County=nonCountyTib))
}
Wrapper function to combine all the above defined functions to achieve final required tibble from a single function. It has an optional argument to define column name. If no argument is passed, it will take default value as “Default_NAME”
my_wrapper <- function(url, colName = "Default_NAME"){
a <- readCSVAndReturnTibble(url)
b <- stepOneAndTwo(a, colName)
c <- stepThree(b)
d <- stepFour(c)
return(d)
}
Calling wrapper functions using appropriate arguments.
tibble_1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", "E_Values")
tibble_1
## $County
## # A tibble: 31,450 × 7
## area_name STCOU enroll_details E_Values year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 31,440 more rows
##
## $Non_County
## # A tibble: 530 × 7
## area_name STCOU enroll_details E_Values year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 520 more rows
tibble_2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", "E_Values")
tibble_2
## $County
## # A tibble: 31,450 × 7
## area_name STCOU enroll_details E_Values year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010197D 8099 1997 EDU0101 AL
## 2 Autauga, AL 01001 EDU010198D 8211 1998 EDU0101 AL
## 3 Autauga, AL 01001 EDU010199D 8489 1999 EDU0101 AL
## 4 Autauga, AL 01001 EDU010200D 8912 2000 EDU0102 AL
## 5 Autauga, AL 01001 EDU010201D 8626 2001 EDU0102 AL
## 6 Autauga, AL 01001 EDU010202D 8762 2002 EDU0102 AL
## 7 Autauga, AL 01001 EDU015203D 9105 2003 EDU0152 AL
## 8 Autauga, AL 01001 EDU015204D 9200 2004 EDU0152 AL
## 9 Autauga, AL 01001 EDU015205D 9559 2005 EDU0152 AL
## 10 Autauga, AL 01001 EDU015206D 9652 2006 EDU0152 AL
## # … with 31,440 more rows
##
## $Non_County
## # A tibble: 530 × 7
## area_name STCOU enroll_details E_Values year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010197D 44534459 1997 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010198D 46245814 1998 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010199D 46368903 1999 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010200D 46818690 2000 EDU0102 ERROR
## 5 UNITED STATES 00000 EDU010201D 47127066 2001 EDU0102 ERROR
## 6 UNITED STATES 00000 EDU010202D 47606570 2002 EDU0102 ERROR
## 7 UNITED STATES 00000 EDU015203D 48506317 2003 EDU0152 ERROR
## 8 UNITED STATES 00000 EDU015204D 48693287 2004 EDU0152 ERROR
## 9 UNITED STATES 00000 EDU015205D 48978555 2005 EDU0152 ERROR
## 10 UNITED STATES 00000 EDU015206D 49140702 2006 EDU0152 ERROR
## # … with 520 more rows
This function combines two input data frames/ tibbles using the bind_rows() function from tidyverse. A list is returned from this function which would have two elements, for for county tibble, and another for non-county tibble.
combine_dfs <- function(tibb1, tibb2){
temp_tib_1 <- bind_rows(tibb1$County, tibb2$County)
temp_tib_2 <- bind_rows(tibb1$Non_County, tibb2$Non_County)
comb_df <- list(County = temp_tib_1, Non_County = temp_tib_2)
return(comb_df)
}
combine_dfs(tibble_1, tibble_2)
## $County
## # A tibble: 62,900 × 7
## area_name STCOU enroll_details E_Values year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 62,890 more rows
##
## $Non_County
## # A tibble: 1,060 × 7
## area_name STCOU enroll_details E_Values year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 1,050 more rows
This function is written to define the default plot for any class that would be of (or have) “state” type. Check for division to remove “ERROR” data rows, and summarize function to get mean enrollment for each division for each year.
plot.state <- function(df, var_name = "E_Values"){
new_df <- df %>%
filter(division != "ERROR") %>%
group_by(year, division) %>%
summarize(Enrollment = mean(get(var_name)))
ggplot(new_df, aes(x = year, y = Enrollment, color = division)) + geom_line()
}
This function is written to define the default plot for any class that would be of (or have) “county” type. Takes 4 optional arguments, one for the column name to find average of, state name, sort by (top/bottom), and number of rows. This function finds means of enrollment for each area for state of user’s choice, sorts it either by ascending or descending values of average, and selects first n rows of data. Default value for the column to find average of is “E_Values”, default for state name is “AZ”, sort by is “top”, and number of rows is numeric 5.
plot.county <- function(df, var_name = "E_Values", state_name = "AZ", sort_by = "top", rows = 5){
new_df <- df %>%
filter(state == state_name) %>%
group_by(area_name) %>%
summarize(Enrollment = mean(get(var_name)))
if (sort_by == "top"){
new_df <- new_df %>%
arrange(desc(Enrollment)) %>%
slice(1:rows)
} else if(sort_by == "bottom") {
new_df <- new_df %>%
arrange(Enrollment) %>%
slice(1:rows)
} else{
new_df <- new_df %>%
arrange(desc(Enrollment)) %>%
slice(1:rows)
}
ret_df <- df %>%
filter(area_name %in% new_df$area_name)
ggplot(ret_df, aes(x = year, y = get(var_name), color = area_name)) + geom_line() + ylab('Enrollment')
}
Reading two csv files, and calling wrapper function
url1 <- "https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv"
url2 <- "https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv"
df_1 <- my_wrapper(url1, "E_Value")
df_2 <- my_wrapper(url2, "E_Value")
Combining data frames from output of wrapper functions above, and separating county and non county tibbles
comb_dfs <- combine_dfs(df_1, df_2)
print(comb_dfs)
## $County
## # A tibble: 62,900 × 7
## area_name STCOU enroll_details E_Value year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 62,890 more rows
##
## $Non_County
## # A tibble: 1,060 × 7
## area_name STCOU enroll_details E_Value year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 1,050 more rows
non_County_Comb <-comb_dfs[['Non_County']]
county_Comb <- comb_dfs[['County']]
All plots below are according to the class type of each object. For “state” type objects (non-county tibbles), the plot for state function is called, while for “county” type objects (county tibbles), the plot function for county is called.
plot(non_County_Comb, var_name = "E_Value")
plot(county_Comb, var_name = "E_Value", "PA", "top", 7)
plot(county_Comb, var_name = "E_Value", "PA", "bottom", 4)
plot(county_Comb, var_name = "E_Value")
plot(county_Comb, var_name = "E_Value", "MN", "top", 10)
Combining 4 data sets by calling combining function 3 times, and separating county and non county data
add_url_1 <- "https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv"
add_url_2 <- "https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv"
add_url_3 <- "https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv"
add_url_4 <- "https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv"
add_df_1 <- my_wrapper(add_url_1, "E_Value")
add_df_2 <- my_wrapper(add_url_2, "E_Value")
add_df_3 <- my_wrapper(add_url_3, "E_Value")
add_df_4 <- my_wrapper(add_url_4, "E_Value")
comb_df_1 <- combine_dfs(add_df_1, add_df_2)
comb_df_2 <- combine_dfs(add_df_3, comb_df_1)
comb_df_3 <- combine_dfs(add_df_4, comb_df_2)
combined_Non_County <- comb_df_3[['Non_County']]
combined_County <- comb_df_3[['County']]
All plots below are according to the class type of each object. For “state” type objects (non-county tibbles), the plot for state function is called, while for “county” type objects (county tibbles), the plot function for county is called.
plot(combined_Non_County, var_name = "E_Value")
plot(combined_County, var_name = "E_Value", "CT", "top", 6)
plot(combined_County, var_name = "E_Value", "NC", "bottom", 10)
plot(combined_County, var_name = "E_Value")
plot(combined_County, var_name = "E_Value", "MN", "top", 4)