library(DOPE)
c_cs_alpha.pdf downloaded https://www.deadiversion.usdoj.gov/schedules/orangebook/c_cs_alpha.pdf 2020-09-08
The text file, c_cs_alpha.txt, was opened with UltraEdit and edited to remove headers and the data were aligned to match the drug number column. That manual editing to remove the headers should be done programmatically (perhaps with a package that emulates PERL). Aligning of the drug number column is a major coding task given the structure of the data.
That file was opened in Excel to spit into columns. This should be redone in R.
What | Column |
---|---|
Substance | 1-59 |
DEA Number | 60-62 |
CSA Schedule | 63-71 |
Narcotic | 72 |
Other Names | 73+ |
library(readxl)
<- read_excel("../inst/extdata/c_cs_alpha.xlsx") controlled
library(conflicted)
suppressMessages(conflict_prefer("filter", "dplyr"))
suppressPackageStartupMessages(library(dplyr))
library(stringr) # str_count & str_detect
library(tidyr) # separate
library(readr) # write_csv
The Names column can have many different synonyms for a drug (street and/or brand). I am marking names records that have an open parenthisis (
or a comma followed by something other than a space as being difficult to parse.
<- controlled %>%
new mutate(difficult = str_count(Names, "[(]") > 0 |
str_detect(controlled$Names, ",(?=\\S)"))
Here the difficult names are exported to a text file and then parsed by hand to separate names with semi-colons. Check the git history for a partial solution that automated this for specific types complexities.
Changes made to the Difficult.csv
file include:
# filtered rows where synonyms are "difficult"
<- new %>%
difficult filter(difficult == TRUE)
# created an text file (CSV) for all rows with "difficult" synonyms
# difficult %>%
# select(- difficult) %>%
# write_csv("../inst/extdata/Difficult.csv")
# for the 'difficult' synonyms, I plan to split by semicolon
# prepped csv file for splitting synonyms via semicolon
<- read_csv("../inst/extdata/Difficult_Edited.csv")
synonyms_edited
# data set of difficult synonyms, all split by semicolon
<-
synonyms_difficult %>%
synonyms_edited separate(
Names, into = c("n_1", "n_2", "n_3", "n_4", "n_5", "n_6", "n_7", "n_8", "n_9"),
extra = "drop",
fill = "right",
sep = "[;]",
remove = FALSE
%>%
) select(everything()) %>%
mutate(across(starts_with("n_"), ~str_trim(.x))) %>%
pivot_longer(
cols = starts_with("n_"),
values_to = "synonym",
values_drop_na = TRUE) %>%
select(-c(name, Names)) %>%
filter(synonym != '')
A few of the “easy” synonyms had conjunctions (and/or) or commas without spaces.
For the ‘easy’ synonyms, I plan to split by comma. Below are the records which need modification:
# filtered rows where synonyms are NOT "difficult"
<- new %>%
easy filter(difficult %in% c(FALSE, NA))
# made the comma replacements and created a dataset for each type of
# transformation, with the final result being a comma
# change semicolon to comma
<-
semi_is_gone %>%
easy slice(6, 64, 80, 378) %>%
mutate(Names = str_replace_all(Names, ";", ","))
# replace "and" with comma
<-
and_is_gone %>%
easy slice(79, 120, 247, 274, 422, 423) %>%
mutate(Names = str_replace_all(Names, " and", ","))
# remove the phrase involving ecstasy
<-
ecstasy_is_gone %>%
easy slice(58) %>%
mutate(Names = str_remove_all(Names, " has been sold as Ecstasy, i.e."))
# remove comma after synonym
<-
extra_comma_is_gone %>%
easy slice(376) %>%
mutate(Names = str_remove_all(Names, ","))
# replace "or" with comma
<-
or_is_gone %>%
easy slice(328) %>%
mutate(Names = str_replace_all(Names, " or", ","))
# dataset of rows that did NOT require a comma change
# (i.e. I left them the way they are)
<-
easy_nochanges %>%
easy slice(-6, -58, -64, -79, -80, -120, -247, -274, -328, -376, -378, -422, -423)
# bind rows that required a comma change and rows that didn't
# now the data is ready to be split by comma
<-
synonyms_easy_prep bind_rows(
semi_is_gone,
and_is_gone,
ecstasy_is_gone,
extra_comma_is_gone,
or_is_gone,
easy_nochanges
)
# dataset of easy synonyms, all split by comma
<-
synonyms_easy %>%
synonyms_easy_prep # move the comma separated names into their own columns.
# mine new columns are enough to hold the drugs with MANY synonyms.
separate(
Names, into = c("n_1", "n_2", "n_3", "n_4", "n_5", "n_6", "n_7", "n_8", "n_9"),
extra = "drop",
fill = "right",
sep = "[,]",
remove = FALSE
%>%
) # remove extra spaces for all the newly created variables
mutate(across(starts_with("n_"), ~str_trim(.x))) %>%
# make the dataset long
pivot_longer(
cols = starts_with("n_"),
values_to = "synonym",
values_drop_na = TRUE) %>%
select(-c(name, Names, difficult)) %>%
# get of any blank name columns
filter(synonym != '')
<- bind_rows(synonyms_difficult, synonyms_easy) %>%
dea_controlled mutate(synonym = if_else(synonym == "Soneryl (UK)", "Soneryl", synonym)) %>%
rename("substance" = SUBSTANCE) %>%
rename("number" = Number) %>%
rename("schedule" = Schedule) %>%
rename("narcotic" = Narcotic)
::use_data(dea_controlled, overwrite = TRUE) usethis