Created
April 8, 2021 23:48
-
-
Save jeanpaulrsoucy/2132259122e68c17ba39b26128aa3eda to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scraping and plotting Canadian COVID-19 VOC data | |
*Jean-Paul R. Soucy* | |
Let's scrape the variant of concern (VOC) data from two sources: the [CTV News variant tracker](https://www.ctvnews.ca/health/coronavirus/tracking-variants-of-the-novel-coronavirus-in-canada-1.5296141) (maintained by journalists [Jesse Tahirali](https://twitter.com/jessetahirali) and [Stephanie Liu](https://twitter.com/_stephanieliu)) and the [Public Health Agency of Canada](https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html#VOC) (PHAC).
```{r variant-data} | |
# load packages | |
library(jsonlite) | |
suppressPackageStartupMessages(library(dplyr)) | |
library(tidyr) | |
# download the PHAC variant-of-concern table and tidy it
phac <- read.csv(
  "https://health-infobase.canada.ca/src/data/covidLive/covid19-epiSummary-voc.csv",
  stringsAsFactors = FALSE
) %>%
  ## keep the relevant columns, renaming them in the same step
  select(
    date = report_date,
    province = prov,
    B117 = b117,
    B1351 = b1351,
    P1 = p1
  ) %>%
  ## parse the dates and spell out the "CA" code as "Canada"
  mutate(
    date = as.Date(date),
    province = if_else(province == "CA", "Canada", province)
  ) %>%
  ## add a row for every date x province combination that is absent,
  ## setting the three variant counts to 0 on the added rows
  complete(date, province, fill = list(B117 = 0, B1351 = 0, P1 = 0)) %>%
  ## sort chronologically, then by province
  arrange(date, province)
# reshape PHAC data from one column per variant to one row per
# date / province / variant, which is the layout ggplot2 expects
phac_plot <- phac %>%
  pivot_longer(
    cols = c(B117, B1351, P1),
    names_to = "variant",
    values_to = "count"
  )
# download the CTV News variant tracker (JSON) and parse its dates
ctv <- fromJSON(
  "https://beta.ctvnews.ca/content/dam/common/exceltojson/COVID-Variants.txt",
  flatten = FALSE
) %>%
  ## drop rows whose Date field is blank or a summary label
  filter(!(Date %in% c("", "Updated", "Total"))) %>%
  ## Date holds day counts that are offset from 1899-12-30
  mutate(date = as.Date("1899-12-30") + as.integer(Date))
# create usable table: one row per date and province, one column per variant
#
# The wide CTV table is assumed to name its count columns
# "<province>_<variant>" (e.g. "ON_B117") -- TODO confirm against the feed.
# All three variants are reshaped in a single pivot so that the B117, B1351
# and P1 values of a row are matched by date and province, rather than by
# the row order of three separately reshaped tables glued side by side.
ctv <- ctv %>%
  ## keep the parsed date and the per-province variant columns only
  select(date, matches("_(B117|B1351|P1)$")) %>%
  ## split each column name into province and variant; ".value" turns the
  ## variant part into the name of the output column
  pivot_longer(
    cols = -date,
    names_to = c("province", ".value"),
    names_sep = "_",
    values_transform = list(
      B117 = as.integer,
      B1351 = as.integer,
      P1 = as.integer
    )
  ) %>%
  arrange(date, province) %>%
  ## within each province, carry the last reported value forward over gaps
  group_by(province) %>%
  fill(B117, B1351, P1, .direction = "down") %>%
  ungroup() %>%
  ## fix the column order regardless of the order of the source columns
  select(date, province, B117, B1351, P1) %>%
  ## anything still missing precedes the first reported value: count as 0
  replace_na(list(B117 = 0, B1351 = 0, P1 = 0))
# reshape CTV data from one column per variant to one row per
# date / province / variant, which is the layout ggplot2 expects
ctv_plot <- ctv %>%
  pivot_longer(
    cols = c(B117, B1351, P1),
    names_to = "variant",
    values_to = "count"
  )
# label each table with its data source, then stack them into one table
phac_plot <- mutate(phac_plot, source = "PHAC")
ctv_plot <- mutate(ctv_plot, source = "CTV News")
variants <- bind_rows(phac_plot, ctv_plot)
``` | |
Let's plot the time series from the two sources for Canada.
```{r variants-can, fig.cap = "Cumulative time series for three COVID-19 variants of concern in Canada. Data from CTV News (solid) and Public Health Agency of Canada (dashed)."}
# plotting packages
library(ggplot2)
library(ggpubr)
# national totals: one line per variant (colour) and data source (linetype)
variants %>%
  filter(province == "Canada") %>%
  ggplot(aes(x = date, y = count, colour = variant, linetype = source)) +
  geom_line() +
  labs(
    title = "COVID Variants in Canada",
    x = "Public reporting date",
    y = "Cumulative variants reported",
    colour = "Variant",
    linetype = "Source"
  ) +
  theme_pubr() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 11),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    # small gap between the y-axis title and the tick labels
    axis.title.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 0))
  )
```
Let's plot the time series from the two sources for Ontario.
```{r variants-on, fig.cap = "Cumulative time series for three COVID-19 variants of concern in Ontario. Data from CTV News (solid) and Public Health Agency of Canada (dashed)."}
# plot Ontario data: one line per variant (colour) and data source (linetype)
# (ggplot2 and ggpubr are attached in the variants-can chunk)
ggplot(
  data = variants %>% filter(province == "ON"),
  aes(x = date, y = count, colour = variant, linetype = source)
) +
  geom_line() +
  labs(
    title = "COVID Variants in Ontario",
    x = "Public reporting date",
    y = "Cumulative variants reported",
    colour = "Variant",
    linetype = "Source"
  ) +
  theme_pubr() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 11),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    # small gap between the y-axis title and the tick labels
    axis.title.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 0))
  )
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment