Title: | cleaningtools package focuses on data cleaning |
---|---|
Description: | The cleaningtools package focuses on cleaning, and has three components: **Check**, which includes a set of functions that flag values, such as check_outliers and check_logical. **Create**, which includes a set of functions to create different items for use in cleaning, such as the cleaning log from the checks, clean data, and enumerator performance. **Review**, which includes a set of functions to review the cleaning. |
Authors: | Mehedi Khan [aut], Yann Say [cre, aut] |
Maintainer: | Yann Say <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.0.0.9003 |
Built: | 2024-11-25 13:39:30 UTC |
Source: | https://github.com/impact-initiatives/cleaningtools |
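A minimal sketch of the Check-then-Create workflow, adapted from the function examples further down in this reference (the bundled cleaningtools_raw_data uses "X_uuid" as its uuid column):
library(cleaningtools)
# run a few checks; each returns/extends a list holding the checked dataset and its logs
logs <- cleaningtools_raw_data |>
  check_pii(uuid_column = "X_uuid") |>
  check_duplicate(uuid_column = "X_uuid") |>
  check_value(uuid_column = "X_uuid")
# combine the individual logs into one cleaning log
combined <- create_combined_log(list_of_log = logs)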
Creates a duration variable using the start and end time of the survey
add_duration( dataset, duration_column = "duration", uuid_column = "uuid", start_column = "start", end_column = "end" )
dataset |
dataset to which the duration will be added |
duration_column |
the name of the duration variable that will be created. |
uuid_column |
uuid column in the dataset. Default is uuid. |
start_column |
start time of the survey, in KOBO format (e.g. "2021-07-14T14:02:24.955+03:00") |
end_column |
end time of the survey, in KOBO format (e.g. "2021-07-14T14:03:28.955+03:00") |
the clean dataset with 6 new columns: duration, start_date, start_time, end_date, end_time, days_diff
test_data <- data.frame( start = c( "2021/07/13 11:25:49", "2021/07/13 12:36:16", "2021/07/13 10:21:10", "2021/07/13 10:54:07", "2021/07/13 11:18:45" ), end = c( "2021/07/13 12:02:39", "2021/07/13 13:20:17", "2021/07/13 10:53:42", "2021/07/13 11:28:58", "2021/07/13 11:55:24" ), uuid = 1:5 ) test_data_w_duration <- test_data %>% add_duration() test_data_kobo_time <- data.frame( `X.U.FEFF.start` = c( "2021-07-13T11:25:49.543+03:00", "2021-07-13T12:36:16.316+03:00", "2021-07-13T10:21:10.337+03:00", "2021-07-13T10:54:07.394+03:00", "2021-07-13T11:18:45.521+03:00" ), end = c( "2021-07-14T12:02:39.269+03:00", "2021-07-13T13:20:17.815+03:00", "2021-07-13T10:53:42.662+03:00", "2021-07-13T11:28:58.295+03:00", "2021-07-13T11:55:24.366+03:00" ), uuid_column = 1:5 ) test_data_kobo_time_w_duration <- test_data_kobo_time %>% add_duration(start_column = "X.U.FEFF.start")
Wrapper around create_duration_from_audit_with_start_end and create_duration_from_audit_sum_all to add the duration to the dataset.
add_duration_from_audit( dataset, col_name_prefix = "duration_audit", uuid_column = "uuid", audit_list, start_question = NULL, end_question = NULL, sum_all = TRUE )
dataset |
dataset to which the duration will be added |
col_name_prefix |
character string to be used as the prefix of the new columns |
uuid_column |
uuid column in the dataset. Default is uuid. |
audit_list |
list of dataframes that are the audit files, with the uuid of each interview as the name of each dataframe. |
start_question |
character vector used for the starting question (optional) |
end_question |
character vector used for the ending question (optional) |
sum_all |
TRUE or FALSE, whether to add the duration that sums all questions' durations |
the dataset with the duration columns added
list_audit <- list( uuid1 = data.frame( event = c("form start", rep("question", 5)), node = c("", paste0("/xx/question", 1:5)), start = c( 1661415887295, 1661415887301, 1661415890819, 1661415892297, 1661415893529, 1661415894720 ), end = c( NA, 1661415890790, 1661415892273, 1661415893506, 1661415894703, 1661415896452 ) ), uuid2 = data.frame( event = c("form start", rep("question", 5)), node = c("", paste0("/xx/question", 1:5)), start = c( 1661415887295, 1661415887301, 1661415890819, 1661415892297, 1661415893529, 1661415894720 ), end = c( NA, 1661415890790, 1661415892273, 1661415893506, 1661415894703, 1661415896452 ) ) ) some_dataset <- data.frame( X_uuid = c("uuid1", "uuid2"), question1 = c("a", "b"), question2 = c("a", "b"), question3 = c("a", "b"), question4 = c("a", "b"), question5 = c("a", "b") ) add_duration_from_audit(some_dataset, uuid_column = "X_uuid", audit_list = list_audit) add_duration_from_audit(some_dataset, uuid_column = "X_uuid", audit_list = list_audit, start_question = "question1", end_question = "question3", sum_all = FALSE ) add_duration_from_audit(some_dataset, uuid_column = "X_uuid", audit_list = list_audit, start_question = "question1", end_question = "question3", sum_all = TRUE )
Add information to the cleaning log
add_info_to_cleaning_log( list_of_log = NULL, dataset = "checked_dataset", cleaning_log = "cleaning_log", dataset_uuid_column = "uuid", cleaning_log_uuid_column = "uuid", information_to_add = c("enumerator_num", "date_assessment") )
list_of_log |
A list containing the cleaning_log and the dataset |
dataset |
If a list is provided, the element name represents the dataset; otherwise, it should be a dataframe. |
cleaning_log |
If a list is provided, the element name represents the cleaning log; otherwise, it should be a dataframe. |
dataset_uuid_column |
uuid column in the cleaning dataset; it can take a vector of several binding keys. Default is uuid. |
cleaning_log_uuid_column |
uuid column in the cleaning log; it can take a vector of several binding keys. Default is uuid. |
information_to_add |
Vector with the names of the columns to be added. |
A list with cleaning log with added info and the dataset.
cleaningtools::cleaningtools_raw_data |> check_pii(uuid_column = "X_uuid") |> check_duplicate(uuid_column = "X_uuid") |> check_value(uuid_column = "X_uuid") |> create_combined_log() |> add_info_to_cleaning_log(dataset_uuid_column = "X_uuid")
Adds the percentage of missing values per row
add_percentage_missing( dataset, column_name = "percentage_missing", kobo_survey = NULL, type_to_include = c("integer", "date", "text", "select_one", "select_multiple") )
dataset |
A dataset to which the percentage of missing values will be added |
column_name |
string variable with the name of the new column to be created, default is percentage_missing |
kobo_survey |
Kobo survey sheet. |
type_to_include |
Types (from KOBO) to be included in the columns; defaults are integer, date, text, select_one and select_multiple. |
a dataset with one additional column containing the percentage of missing values
data_test <- data.frame( uuid = c(1:3), col_1 = c(1:3), col_2 = c(NA, NA, "expenditures"), col_3 = c("with need", NA, "with need"), col_4 = c("food health school", NA, "food"), col_4.food = c(1, NA, 1), col_4.health = c(1, NA, 0), col_4.school = c(1, NA, 0) ) kobo_survey <- data.frame( type = c( "uuid", "integer", "select_one choice2", "select_one choice3", "select_multiple choice4" ), name = c("uuid", "col_1", "col_2", "col_3", "col_4") ) data_test %>% add_percentage_missing( kobo_survey = kobo_survey, type_to_include = c("integer", "select_one", "select_multiple") ) data_test %>% add_percentage_missing()
auto_detect_sm_parents is meant to detect select multiple parent columns in a way that does not rely on the XLSForm as the input
auto_detect_sm_parents(dataset, sm_separator = ".")
dataset |
dataset to correct |
sm_separator |
Separator for choice multiple questions. The default is ".". If using read_csv to read in the data, the separator will most likely be '/', whereas if using read.csv it will likely be '.' |
a list of select multiple parent columns in the dataset.
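A minimal sketch, assuming a dataset whose select multiple columns use the "." separator; the column names below are hypothetical:
sm_data <- data.frame(
  uuid = c("a", "b"),
  water_source = c("well spring", "well"),  # select multiple parent column
  water_source.well = c(1, 1),              # child columns
  water_source.spring = c(1, 0)
)
auto_detect_sm_parents(sm_data, sm_separator = ".")  # expected to list "water_source"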
auto_sm_parent_children is meant to detect select multiple parent columns & children columns in a way that does not rely on the XLSForm as the input
auto_sm_parent_children(dataset, sm_separator = ".")
dataset |
dataset to correct |
sm_separator |
Separator for choice multiple questions. The default is ".". If using read_csv to read in the data, the separator will most likely be '/', whereas if using read.csv it will likely be '.' |
a data frame containing the child select multiple columns alongside their parents and a log with all changes recorded.
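A sketch of the companion call, reusing the hypothetical sm_data frame from the previous example:
auto_sm_parent_children(sm_data, sm_separator = ".")  # parent/child mapping for "water_source"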
Checks for duplicated values in columns
check_duplicate( dataset, uuid_column = "uuid", columns_to_check = NULL, log_name = "duplicate_log" )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset". |
uuid_column |
uuid column in the dataset. Default is uuid. |
columns_to_check |
character vector with the names of the columns to check. If NULL (default), it will check the uuid_column |
log_name |
name of the log of flagged values |
a list with the checked dataset stored as checked_dataset and a dataframe with the duplicate log
testdata <- data.frame( uuid = c(letters[1:4], "a", "b", "c"), col_a = runif(7), col_b = runif(7) ) check_duplicate(testdata) testdata2 <- data.frame( uuid = letters[c(1:7)], village = paste("village", c(1:3, 1:3, 4)), ki_identifier = paste0("xx_", c(1:5, 3, 4)) ) check_duplicate(testdata2, columns_to_check = "village") check_duplicate(testdata2, columns_to_check = c("village", "ki_identifier"), uuid = "uuid")
Check if a value is strictly inferior to the lower threshold or strictly superior to the higher threshold.
check_duration( dataset, column_to_check, uuid_column = "uuid", log_name = "duration_log", lower_bound = 25, higher_bound = 90 )
dataset |
a dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
column_to_check |
string character with the name of the duration column |
uuid_column |
uuid column in the dataset. Default is uuid. |
log_name |
character string with name to give to the log |
lower_bound |
lower value of the range (strictly inferior to) |
higher_bound |
higher value of the range (strictly superior to) |
a list with the checked dataset stored as checked_dataset and a dataframe with the duration log
testdata <- data.frame( uuid = c(letters[1:7]), duration_audit_start_end_ms = c( 2475353, 375491, 2654267, 311585, 817270, 2789505, 8642007 ), duration_audit_start_end_minutes = c(41, 6, 44, 5, 14, 46, 144) ) check_duration(testdata, column_to_check = "duration_audit_start_end_minutes" ) check_duration( testdata, column_to_check = "duration_audit_start_end_ms", lower_bound = 375490, higher_bound = 8642000 ) testdata %>% check_duration(column_to_check = "duration_audit_start_end_minutes") %>% check_duration( column_to_check = "duration_audit_start_end_ms", log_name = "duration_in_ms", lower_bound = 375490, higher_bound = 8642000 )
FCS component checks
check_fcs( dataset, uuid_column = "uuid", cereals_column = "fs_fcs_cerealgrainroottuber", pulses_column = "fs_fcs_beansnuts", dairy_column = "fs_fcs_dairy", meat_column = "fs_fcs_meatfishegg", vegetables_column = "fs_fcs_vegetableleave", fruits_column = "fs_fcs_fruit", oil_column = "fs_fcs_fat", sugar_column = "fs_fcs_sugar" )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
uuid column in the dataset. Default is uuid. |
cereals_column |
Variable name for: In the last 7 days, on how many days did your household eat cereals, grains, roots and tubers, including wild roots? |
pulses_column |
Variable name for: In the last 7 days, on how many days did your household eat any beans / legumes, pulses or nuts? |
dairy_column |
Variable name for: In the last 7 days, on how many days did your household drink milk or eat other dairy products? |
meat_column |
Variable name for: In the last 7 days, on how many days did your household eat meat, fish or eggs? |
vegetables_column |
Variable name for: In the last 7 days, on how many days did your household eat vegetables or leaves, including all wild vegetables and leaves? |
fruits_column |
Variable name for: In the last 7 days, on how many days did your household eat fruits, including all wild fruits? |
oil_column |
Variable name for: In the last 7 days, on how many days did your household eat oil, fat, or butter? |
sugar_column |
Variable name for: In the last 7 days, on how many days did your household eat sugar or sugary foods? |
dataset with potential issues
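A minimal sketch, assuming a dataset that already uses the default column names; the values are numbers of days (0-7) and are made up for illustration:
fcs_data <- data.frame(
  uuid = c("uuid_1", "uuid_2"),
  fs_fcs_cerealgrainroottuber = c(7, 3),
  fs_fcs_beansnuts = c(3, 3),
  fs_fcs_dairy = c(1, 3),
  fs_fcs_meatfishegg = c(2, 3),
  fs_fcs_vegetableleave = c(5, 3),
  fs_fcs_fruit = c(2, 3),
  fs_fcs_fat = c(6, 3),
  fs_fcs_sugar = c(4, 3)
)
check_fcs(fcs_data, uuid_column = "uuid")  # uuid_2 reports identical values across components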
Check a logical test
check_logical( dataset, uuid_column = "uuid", information_to_add = NULL, check_id = "logical_xx", check_to_perform, columns_to_clean = NULL, description )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
uuid column in the dataset. Default is "uuid". |
information_to_add |
optional string vector, to add some information to the log (e.g. today, village) |
check_id |
name of the check to perform |
check_to_perform |
test to perform as R code (in text format) |
columns_to_clean |
variables to be put in the log. If not provided, it will try to detect variables |
description |
description of the check performed |
a list with the checked dataset stored as checked_dataset (with a column added for the check_id) and a dataframe with the logical check log.
test_data <- data.frame( uuid = c(1:10) %>% as.character(), today = rep("2023-01-01", 10), location = rep(c("villageA", "villageB"), 5), distance_to_market = c(rep("less_30", 5), rep("more_30", 5)), access_to_market = c(rep("yes", 4), rep("no", 6)), number_children_05 = c(rep(c(0, 1), 4), 5, 6) ) check_logical(test_data, uuid_column = "uuid", check_id = "my_test", check_to_perform = "distance_to_market == \"less_30\" & access_to_market == \"no\"", columns_to_clean = "distance_to_market, access_to_market", description = "distance to market less than 30 and no access" ) check_logical(test_data, uuid_column = "uuid", information_to_add = c("today", "location"), check_to_perform = "distance_to_market == \"less_30\" & access_to_market == \"no\"", columns_to_clean = "distance_to_market, access_to_market", description = "distance to market less than 30 and no access" )
This is a wrapper around check_logical to allow several checks to be performed.
check_logical_with_list( dataset, uuid_column = "uuid", information_to_add = NULL, list_of_check, check_id_column, check_to_perform_column, columns_to_clean_column = NULL, description_column, bind_checks = TRUE )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
uuid column in the dataset. Default is "uuid". |
information_to_add |
optional string vector, to add some information to the log (e.g. today, village) |
list_of_check |
a dataframe with the checks to perform |
check_id_column |
the column name with the names of each test. |
check_to_perform_column |
the column name with the checks to perform as R code (in text format) |
columns_to_clean_column |
the column name with the variables to be put in the log. |
description_column |
the column name with the description |
bind_checks |
default is TRUE, to bind the checks into 1 log. |
a list with the checked dataset stored as checked_dataset (with a column added for the check_id) and a dataframe with the logical check log.
test_data <- data.frame( uuid = c(1:10) %>% as.character(), distance_to_market = rep(c("less_30", "more_30"), 5), access_to_market = c(rep("yes", 4), rep("no", 6)), number_children_05 = c(rep(c(0, 1), 4), 5, 6), number_children_618 = c(rep(c(0, 1), 4), 5, 6) ) check_list <- data.frame( name = c("logical_xx", "logical_yy", "logical_zz"), check = c( "distance_to_market == \"less_30\" & access_to_market == \"no\"", "number_children_05 > 3", "rowSums(dplyr::across(starts_with(\"number\")), na.rm = T) > 9" ), description = c( "distance to market less than 30 and no access", "number of children under 5 seems high", "number of children very high" ), columns_to_clean = c( "distance_to_market, access_to_market", "number_children_05", "" ) ) check_logical_with_list(test_data, uuid_column = "uuid", list_of_check = check_list, check_id_column = "name", check_to_perform_column = "check", columns_to_clean_column = "columns_to_clean", description_column = "description" )
Generate a log for other follow up questions
check_others(dataset, uuid_column = "uuid", columns_to_check = NULL)
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
uuid column in the dataset. Default is "uuid". |
columns_to_check |
Columns to include in the checks |
a log with all the values that may require recoding
check_others( dataset = cleaningtools::cleaningtools_clean_data, uuid_column = "X_uuid", columns_to_check = names(cleaningtools::cleaningtools_clean_data |> dplyr::select(ends_with("_other")) |> dplyr::select(-contains("."))) )
Check if the input passed to the check_others function is correct
check_others_checks(dataset, uuid_column, columns_to_check)
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
uuid column in the dataset. Default is uuid. |
columns_to_check |
Columns to include in the checks |
nothing
Check outliers over the dataset
check_outliers( dataset, uuid_column = "uuid", element_name = "checked_dataset", kobo_survey = NULL, kobo_choices = NULL, cols_to_add_cleaning_log = NULL, strongness_factor = 3, minimum_unique_value_of_variable = NULL, remove_choice_multiple = TRUE, sm_separator = ".", columns_not_to_check = NULL )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
UUID. Default is uuid |
element_name |
name of the dataset in list |
kobo_survey |
Kobo survey sheet. Default is NULL. |
kobo_choices |
Kobo choices sheet. Default is NULL. |
cols_to_add_cleaning_log |
Variables that must be included in the output |
strongness_factor |
Strongness factor defines how strong your outliers will be. The default is 3. |
minimum_unique_value_of_variable |
Default is NULL, meaning this parameter won't be considered. For example, 10 means that any variable with fewer than 10 unique values won't be considered for outlier checking. |
remove_choice_multiple |
TRUE (default) will remove choice multiple questions from the output. |
sm_separator |
Separator for choice multiple questions. The default is "." |
columns_not_to_check |
Columns to exclude from the checks even if they are numeric values. |
a list with the checked dataset stored as checked_dataset and a dataframe with the outliers log
dataset_outlier <- data.frame( uuid = paste0("uuid_", 1:100), one_value = c(round(runif(90, min = 45, max = 55)), round(runif(5)), round(runif(5, 99, 100))), expense = c(sample(200:500, replace = TRUE, size = 95), c(600, 100, 80, 1020, 1050)), income = c(c(60, 0, 80, 1020, 1050), sample(20000:50000, replace = TRUE, size = 95)), yy = c(rep(100, 99), 10) ) check_outliers(dataset = dataset_outlier, uuid_column = "uuid")
The function will flag a survey for its missing values. The missing values column can be created with add_percentage_missing and the values are flagged with check_outliers.
check_percentage_missing( dataset, uuid_column = "uuid", column_to_check = "percentage_missing", strongness_factor = 2, log_name = "percentage_missing_log" )
dataset |
a dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset". |
uuid_column |
uuid column in the dataset. Default is "uuid". |
column_to_check |
string character with the name of the columns to check. Default is "percentage_missing" |
strongness_factor |
Strongness factor defines how strong your outliers will be. The default is 2. |
log_name |
name of the log of flagged value, default is percentage_missing_log |
a list with the checked dataset stored as checked_dataset and a dataframe with the flagged values log
See also: add_percentage_missing(), check_outliers()
# Adding the percentage missing first data_example <- data.frame( uuid = letters[1:3], col_1 = c(1:3), col_2 = c(NA, NA, "expenditures"), col_3 = c("with need", NA, "with need"), col_4 = c("food health school", NA, "food"), col_4.food = c(1, NA, 1), col_4.health = c(1, NA, 0), col_4.school = c(1, NA, 0) ) data_example <- data_example %>% add_percentage_missing() data_example %>% check_percentage_missing() # With a dataset that already has a percentage missing data_example2 <- data.frame( uuid = letters, any_cols = LETTERS, any_number = 1:26, percentage_missing = c(rep(.05, 25), .99) ) data_example2 %>% check_percentage_missing()
Checks for potential PII
check_pii( dataset, element_name = "checked_dataset", uuid_column = "uuid", words_to_look = NULL )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
element_name |
If the input is a list, please specify the element name that contains the dataset |
uuid_column |
uuid column in the dataset. Default is uuid. |
words_to_look |
Specify the words that might be PII |
dataset with potential PII
check_pii(cleaningtools_raw_data, uuid_column = "X_uuid")
Checks for survey similarities - Soft Duplicates
check_soft_duplicates( dataset, kobo_survey, uuid_column = "uuid", idnk_value = "idnk", sm_separator = ".", log_name = "soft_duplicate_log", threshold = 7, return_all_results = FALSE )
dataset |
dataset to be check as a dataframe or a list with the dataframe stored as "checked_dataset" stored as "checked_dataset" |
kobo_survey |
Kobo survey sheet. |
uuid_column |
uuid column in the dataset. Default is "uuid". |
idnk_value |
String character for the value of the "I don't know" value |
sm_separator |
Separator for choice multiple questions. The default is "." |
log_name |
Name of the log dataframe flagged in the end of the function |
threshold |
flag all entries less or equal a specified threshold. The default is 7. |
return_all_results |
By default, the function will return only the values that are under the threshold. Default is FALSE. |
a list with the checked dataset stored as checked_dataset and a dataframe with the soft duplicate log
soft_duplicates <- check_soft_duplicates( dataset = cleaningtools_raw_data, kobo_survey = cleaningtools_survey, uuid_column = "X_uuid", idnk_value = "dont_know", sm_separator = ".", log_name = "soft_duplicate_log", threshold = 7 ) group_by_enum_raw_data <- cleaningtools_raw_data %>% dplyr::group_by(enumerator_num) soft_per_enum <- group_by_enum_raw_data %>% dplyr::group_split() %>% purrr::map(~ check_soft_duplicates( dataset = ., kobo_survey = cleaningtools_survey, uuid_column = "X_uuid", idnk_value = "dont_know", sm_separator = ".", log_name = "soft_duplicate_log", threshold = 7, return_all_results = TRUE )) soft_per_enum %>% purrr::map(~ .[["soft_duplicate_log"]]) %>% purrr::map2( .y = dplyr::group_keys(group_by_enum_raw_data) %>% unlist(), ~ dplyr::mutate(.x, enum = .y) ) %>% do.call(dplyr::bind_rows, .)
Check for value(s) in the dataset
check_value( dataset, uuid_column = "uuid", element_name = "checked_dataset", values_to_look = c(99, 999, 999, 88, 888, 888) )
dataset |
dataset to be checked, as a dataframe or a list with the dataframe stored as "checked_dataset" |
uuid_column |
uuid column in the dataset. Default is "uuid". |
element_name |
Name of the dataset in list |
values_to_look |
Values to look for. Defaults are 99, 999, 999, 88, 888, 888 |
A dataframe in cleaning log format
df <- data.frame( X_uuid = paste0("uuid_", 1:100), age = c(sample(18:80, replace = TRUE, size = 96), 99, 99, 98, 88), gender = c("99", sample(c("male", "female"), replace = TRUE, size = 95 ), "98", "98", "88", "888") ) check_value( dataset = df, uuid_column = "X_uuid", element_name = "checked_dataset", values_to_look = c(99, 98, 88, 888) )
Survey data
cleaningtools_analysis_by_group
These datasets include HH data (raw and clean data) and analysis along with the cleaning log.
cleaningtools_analysis_by_group cleaningtools_raw_data cleaningtools_cleaning_log cleaningtools_clean_data cleaningtools_survey cleaningtools_choices cleaningtools_overall_analysis cleaningtools_sample_frame
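For example, a quick look at one of the bundled datasets (glimpse() is from dplyr and is used here only for a compact preview):
dplyr::glimpse(cleaningtools::cleaningtools_raw_data)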
Choices tab of kobo tool
cleaningtools_choices
Dataset with food consumption, household hunger Score component
cleaningtools_food_consumption_df
Nation/all population level analysis.
cleaningtools_overall_analysis
Survey tab of kobo tool
cleaningtools_survey
Coerce numeric values to character, without scientific notation; NA values are kept as NA.
coerce_to_character(x)
x |
a value to convert |
x as character
coerce_to_character(c("a", NA)) coerce_to_character(c(1, 10000000, NA))
coerce_to_character(c("a", NA)) coerce_to_character(c(1, 10000000, NA))
Read all audit files from a zip
create_audit_list(audit_zip_path, dataset = NULL, uuid_column = "uuid")
audit_zip_path |
path to the .zip file. It must contain .zip |
dataset |
dataset (optional) |
uuid_column |
uuid column, only used if a dataset is provided. It will only read the uuids present in the dataset. |
A list with all the audit files read as they are.
## Not run: create_audit_list("audit_path.zip") ## End(Not run)
## Not run: create_audit_list("audit_path.zip") ## End(Not run)
Implement the cleaning log on the raw dataset.
create_clean_data( raw_dataset, raw_data_uuid_column = "uuid", cleaning_log, cleaning_log_uuid_column = "uuid", cleaning_log_question_column, cleaning_log_new_value_column, cleaning_log_change_type_column, change_response_value = "change_response", NA_response_value = "blank_response", no_change_value = "no_action", remove_survey_value = "remove_survey" )
raw_dataset |
Raw dataset |
raw_data_uuid_column |
uuid column in the raw dataset. Default is uuid. |
cleaning_log |
cleaning + deletion log (data.frame). |
cleaning_log_uuid_column |
uuid column in the cleaning log. Default is "uuid". |
cleaning_log_question_column |
column in cleaning log which specifies data set column to change |
cleaning_log_new_value_column |
cleaning log column specifying the new correct value |
cleaning_log_change_type_column |
column in cleaning log which specifies change type to be made |
change_response_value |
values in change type column which should be changed to a new value. |
NA_response_value |
values in change type column which should be blank (NA). |
no_change_value |
values in change type column which should NOT be changed to a new value. |
remove_survey_value |
values in change type column which should be deleted from the data. |
clean data set
create_clean_data( raw_dataset = cleaningtools_raw_data, raw_data_uuid_column = "X_uuid", cleaning_log = cleaningtools_cleaning_log, cleaning_log_uuid_column = "X_uuid", cleaning_log_question_column = "questions", cleaning_log_new_value_column = "new_value", cleaning_log_change_type_column = "change_type" ) cleaning_log_test <- data.frame( uuid = paste0("uuid", 1:4), question = c("age", "gender", "pop_group", "strata"), change_type = c("blank_response", "no_change", "Delete", "change_res"), new_value = c(NA_character_, NA_character_, NA_character_, "st-a") ) test_data <- data.frame( uuid = paste0("uuid", 1:4), age = c(180, 23, 45, 67), gender = c("male", "female", "male", "female"), pop_group = c("idp", "refugee", "host", "idp"), strata = c("a", "b", "c", "d") ) review_cleaning_log( raw_dataset = test_data, raw_data_uuid_column = "uuid", cleaning_log = cleaning_log_test, cleaning_log_change_type_column = "change_type", change_response_value = "change_res", cleaning_log_question_column = "question", cleaning_log_uuid_column = "uuid", cleaning_log_new_value_column = "new_value" ) create_clean_data( raw_dataset = test_data, raw_data_uuid_column = "uuid", cleaning_log = cleaning_log_test, cleaning_log_change_type_column = "change_type", change_response_value = "change_res", NA_response_value = "blank_response", no_change_value = "no_change", remove_survey_value = "Delete", cleaning_log_question_column = "question", cleaning_log_uuid_column = "uuid", cleaning_log_new_value_column = "new_value" )
Compares 2 dataset and logs differences
create_cleaning_log( raw_dataset, raw_dataset_uuid_column = "uuid", clean_dataset = clean_dataset, clean_dataset_uuid_column = "uuid", check_for_deletion_log = T, columns_not_to_check = NULL, check_for_variable_name = T )
raw_dataset |
Raw dataset |
raw_dataset_uuid_column |
uuid column in the raw dataset. Default is "uuid". |
clean_dataset |
Clean dataset |
clean_dataset_uuid_column |
uuid column in the clean dataset. Default is "uuid". |
check_for_deletion_log |
TRUE to flag the removed survey |
columns_not_to_check |
Columns to exclude from the checks |
check_for_variable_name |
TRUE to flag the removed variables |
Cleaning log
## Not run: create_cleaning_log( raw_dataset = cleaningtools::cleaningtools_raw_data, raw_dataset_uuid_column = "X_uuid", clean_dataset = cleaningtools::cleaningtools_clean_data, clean_dataset_uuid_column = "X_uuid", check_for_deletion_log = TRUE, check_for_variable_name = TRUE ) ## End(Not run)
Create a project folder with a cleaning template
create_cleaning_template(folder_path, write_template_files, ...)
folder_path |
Path for the new folder |
write_template_files |
TRUE/FALSE, whether to write template files |
... |
Extra information collected from the RStudio wizard |
Folder with a copy of the cleaning_template_folder
## Not run: create_cleaning_template("path/to/folder") ## End(Not run)
This function returns an Excel cell range for a given column in a dataframe. The range is formatted for use in Excel functions or formulas and is prefixed with the sheet name 'validation_rules'.
create_col_range(variable, data.val)
variable |
The name of the column/question for which the Excel range is to be generated. |
data.val |
A dataframe containing the column specified by variable. |
A string representing the Excel cell range containing all choices for the column variable in the sheet 'validation_rules'.
# Assume df is a sample dataframe with a column named 'consent'. # range_string <- create_col_range('consent', df)
Merging the cleaning logs
create_combined_log(list_of_log, dataset_name = "checked_dataset")
list_of_log |
A list containing all the logs |
dataset_name |
The dataset's name, if it exists in the list_of_log |
Combined cleaning log
list <- cleaningtools::cleaningtools_raw_data |> check_pii(uuid_column = "X_uuid") |> check_duplicate(uuid_column = "X_uuid") |> check_value(uuid_column = "X_uuid") create_combined_log(list_of_log = list)
Duration is the difference between the sum of all starts and the sum of all ends. It ignores all nodes that are empty, such as form start, end screen, form save, etc.
create_duration_from_audit_sum_all(audit_file)
audit_file |
a dataframe with a single audit file; it needs start, end and node columns |
A dataframe with the duration in ms and duration in minutes.
some_audit <- data.frame( event = c("form start", rep("question", 5)), node = c("", paste0("/xx/question", 1:5)), start = c( 1661415887295, 1661415887301, 1661415890819, 1661415892297, 1661415893529, 1661415894720 ), end = c( NA, 1661415890790, 1661415892273, 1661415893506, 1661415894703, 1661415896452 ) ) create_duration_from_audit_sum_all(some_audit)
The function will calculate time difference between the start of the start_question and the end of the end_question.
create_duration_from_audit_with_start_end( audit_file, start_question, end_question )
audit_file |
a dataframe with a single audit file; it needs start, end and node columns |
start_question |
character vector used for the starting question |
end_question |
character vector used for the ending question |
In case the node appears several times (if the value was changed or with a select multiple), it will take the minimum for the start and the maximum for the end.
If a value is missing (skip logic or question not found), it will return -Inf
A dataframe with the duration in ms and duration in minutes.
some_audit <- data.frame( event = c("form start", rep("question", 5)), node = c("", paste0("/xx/question", 1:5)), start = c( 1661415887295, 1661415887301, 1661415890819, 1661415892297, 1661415893529, 1661415894720 ), end = c( NA, 1661415890790, 1661415892273, 1661415893506, 1661415894703, 1661415896452 ) ) create_duration_from_audit_with_start_end(some_audit, "question1", "question3")
Creates formatted workbook with openxlsx
create_formated_wb( write_list, column_for_color = NULL, header_front_size = 12, header_front_color = "#FFFFFF", header_fill_color = "#ee5859", header_front = "Arial Narrow", body_front = "Arial Narrow", body_front_size = 11 )
write_list |
List of dataframes |
column_for_color |
Column name in the dataframe which should be used for colorizing the cell. The default is NULL. |
header_front_size |
Header font size (default is 12) |
header_front_color |
Hexcode for header font color (default is white) |
header_fill_color |
Hexcode for header fill color (default is red) |
header_front |
Font name for header (default is Arial Narrow) |
body_front |
Font name for body (default is Arial Narrow) |
body_front_size |
Font size for body (default is 11) |
A workbook
checks_list <- cleaningtools::cleaningtools_raw_data |> check_pii(uuid_column = "X_uuid") |> check_duplicate(uuid_column = "X_uuid") |> check_value(uuid_column = "X_uuid") create_combined_log(list_of_log = checks_list) |> create_formated_wb()
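The object returned above is an openxlsx workbook; a sketch of writing it to disk (saveWorkbook() is from the openxlsx package, not cleaningtools):
wb <- cleaningtools::cleaningtools_raw_data |>
  check_pii(uuid_column = "X_uuid") |>
  check_duplicate(uuid_column = "X_uuid") |>
  check_value(uuid_column = "X_uuid") |>
  create_combined_log() |>
  create_formated_wb()
openxlsx::saveWorkbook(wb, file = file.path(tempdir(), "combined_log.xlsx"), overwrite = TRUE)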
This function returns a dataframe containing 'select_one' questions from the tool dataframe with their corresponding choices in the choices dataframe.
create_formatted_choices(choices, tool)
choices |
A dataframe containing the choices sheet from the Kobo tool
Expected to have at least the columns |
tool |
A dataframe containing the survey sheet from the Kobo tool
Expected to have at least the columns |
A dataframe containing 'select_one' questions with their corresponding choices. The dataframe has columns from both the tool and choices dataframes.
# formatted_choices <- create_formatted_choices(choices, tool)
Create logical checks for "other" values.
create_logic_for_other( kobo_survey, sm_separator = ".", compare_with_dataset = FALSE, dataset = NULL )
kobo_survey |
Kobo survey sheet. |
sm_separator |
Separator for choice multiple questions. The default is "." |
compare_with_dataset |
Default FALSE will not compare the applicability of logics with the dataset whereas TRUE will check the applicability of logics. |
dataset |
dataset to be checked. |
A dataframe with all logical tests to check "other" values. It should be used in combination with review_others
create_logic_for_other( kobo_survey = cleaningtools::cleaningtools_survey, sm_separator = ".", dataset = cleaningtools::cleaningtools_clean_data, compare_with_dataset = TRUE )
This function generates a validation list to be used for data entry validation. It combines predefined validation lists with dynamically formatted choices derived from the choices and tool parameters. The resulting dataframe is intended to provide structured and valid choices for various question types.
create_validation_list(choices, tool)
choices |
A dataframe representing the Kobo choices sheet.
Expected to have at least the columns |
tool |
A dataframe representing the Kobo survey sheet.
Expected to have at least the columns |
A dataframe where each column corresponds to a choice list for a specific question. Each row contains a valid choice for the question.
# Assume choices_df and tool_df are sample dataframes that fit the expected structure. # validation_list <- create_validation_list(choices, tool)
Creates a formatted Excel file for the cleaning log
create_xlsx_cleaning_log( write_list, cleaning_log_name = "cleaning_log", change_type_col = "change_type", column_for_color = "check_binding", header_front_size = 12, header_front_color = "#FFFFFF", header_fill_color = "#ee5859", header_front = "Arial Narrow", body_front = "Arial Narrow", body_front_size = 11, use_dropdown = F, sm_dropdown_type = NULL, kobo_survey = NULL, kobo_choices = NULL, output_path = NULL )
write_list |
List of dataframes |
cleaning_log_name |
Name for cleaning log from write_list |
change_type_col |
Change type column name in the cleaning log |
column_for_color |
Column name in the dataframe which should be used for colorizing the cell. The default is "check_binding". |
header_front_size |
Header font size (default is 12) |
header_front_color |
Hexcode for header font color (default is white) |
header_fill_color |
Hexcode for header fill color (default is red) |
header_front |
Font name for header (default is Arial Narrow) |
body_front |
Font name for body (default is Arial Narrow) |
body_front_size |
Font size for body (default is 11) |
use_dropdown |
Use drop down lists for data validation in the cleaning log output (default is FALSE) |
sm_dropdown_type |
Dropdown list options for select multiple questions: "numerical" (1/0) or "logical" (TRUE/FALSE). The default is logical. |
kobo_survey |
Kobo survey dataframe |
kobo_choices |
Kobo choices dataframe |
output_path |
Output path. Default is NULL which will return a workbook instead of an excel file. |
saves a .xlsx file or returns a workbook object
## Not run: checks_list <- cleaningtools::cleaningtools_raw_data |> check_pii(uuid_column = "X_uuid") |> check_duplicate(uuid_column = "X_uuid") |> check_value(uuid_column = "X_uuid") create_combined_log(list_of_log = checks_list) |> create_xlsx_cleaning_log() logical_check_example <- cleaningtools::cleaningtools_raw_data |> check_logical( check_to_perform = 'treat_cook_water == "always_treat"', uuid_column = "X_uuid", description = "description", check_id = "check_4", columns_to_clean = "treat_cook_water" ) create_combined_log(logical_check_example) |> create_xlsx_cleaning_log( output_path = paste0(tempdir(check = TRUE), "/cleaning_log.xlsx"), cleaning_log_name = "cleaning_log", change_type_col = "change_type", kobo_survey = cleaningtools::cleaningtools_survey, kobo_choices = cleaningtools::cleaningtools_choices, use_dropdown = TRUE, sm_dropdown_type = "logical" ) ## End(Not run)
Detects variable names in code
detect_variable(codeintext)
codeintext |
a string vector in which to look for variables |
a vector with strings
string_to_check <- c( "ki_age> 1", "noo==\"mar\"", "sum(var1, var2, na.rm = T)", "oks != \"mar2\"", "oknospace!=\"mar2\"" ) detect_variable(string_to_check)
This function recreates the columns for select multiple questions
recreate_parent_column( dataset, uuid_column = "uuid", kobo_survey = NULL, kobo_choices = NULL, sm_separator = ".", cleaning_log_to_append = NULL )
dataset |
data frame |
uuid_column |
uuid column in the dataset. Default is "uuid". |
kobo_survey |
Kobo survey sheet. |
kobo_choices |
Kobo choices sheet. |
sm_separator |
Separator for choice multiple questions. The default is "." |
cleaning_log_to_append |
A cleaning log to which the changes from this function will be added. The column names of the log from this function are "uuid", "question", "change_type", "new_value", "old_value", "comment". If the cleaning_log_to_append names do not match, the only way is to create the log without a cleaning_log_to_append, rename the columns, and then bind. |
test_data <- dplyr::tibble( uuid = paste0("uuid_", 1:6), gender = rep(c("male", "female"), 3), reason = c( "xx,yy", "xx,zy", "zy", "xx,xz,zy", NA_character_, "xz" ), reason.xx = c(0, 1, 0, 1, 0, 0), reason.yy = c(1, 0, 0, 0, 1, 0), reason.xz = c(0, 0, 0, 1, 0, 1), reason.zy = c(0, 1, 1, 1, 0, 0), reason_zy = c(NA_character_, "A", "B", "C", NA_character_, NA_character_) ) recreate_parent_column(dataset = test_data, uuid_column = "uuid", sm_separator = ".")
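The workaround for a cleaning_log_to_append with non-matching names can be sketched as follows. This is a minimal, hypothetical illustration: my_log and generated_log are placeholder objects (generated_log stands in for the cleaning log produced when recreate_parent_column is run without cleaning_log_to_append), and only the rename-then-bind step is shown.

# Hypothetical pre-existing log whose column names do not match
# "uuid", "question", "change_type", "new_value", "old_value", "comment"
my_log <- data.frame(
  X_uuid = "uuid_2",
  variable = "reason.zy",
  action = "change_response",
  new = "1",
  old = "0",
  note = "manual fix"
)

# Placeholder standing in for the log produced by recreate_parent_column()
# when it is called without cleaning_log_to_append
generated_log <- data.frame(
  uuid = "uuid_4",
  question = "reason",
  change_type = "change_response",
  new_value = "xx xz zy",
  old_value = "xx,xz,zy",
  comment = "parent column recreated"
)

# Rename the pre-existing log to the expected names, then bind the two logs
combined_log <- my_log |>
  dplyr::rename(
    uuid = X_uuid,
    question = variable,
    change_type = action,
    new_value = new,
    old_value = old,
    comment = note
  ) |>
  dplyr::bind_rows(generated_log)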
This function compares the raw dataset, the clean dataset, the cleaning log and the deletion log. Possible flags are:
UUID found in deletion log.
No action with different value in new value column.
Changes were not applied
This survey should be deleted from the clean dataset but it was not deleted
This survey should be removed from deletion log as it doesn't exist in the raw data.
Duplicated entry with different value, please recheck and keep one
Entry missing in cleaning log
New value in cleaning log and value in clean dataset not matching
Survey missing in the raw data
review_cleaning( raw_dataset, raw_dataset_uuid_column = "uuid", clean_dataset, clean_dataset_uuid_column = "uuid", cleaning_log = cleaning_log_only, cleaning_log_uuid_column = "uuid", cleaning_log_change_type_column = "change_type", cleaning_log_question_column = "question", cleaning_log_new_value_column = "new_value", cleaning_log_old_value_column = "old_value", cleaning_log_added_survey_value = "added_survey", cleaning_log_no_change_value = c("no_action", "no_change"), deletion_log = NULL, deletion_log_uuid_column = NULL, check_for_deletion_log = T )
raw_dataset |
Raw dataset |
raw_dataset_uuid_column |
uuid column in the raw dataset. Default is "uuid". |
clean_dataset |
Clean dataset |
clean_dataset_uuid_column |
uuid column in the clean dataset. Default is "uuid". |
cleaning_log |
Cleaning log |
cleaning_log_uuid_column |
uuid column in the cleaning log. Default is "uuid". |
cleaning_log_change_type_column |
column in the cleaning log which specifies the type of change to be made |
cleaning_log_question_column |
column in the cleaning log which specifies which dataset column to change |
cleaning_log_new_value_column |
cleaning log column specifying the new correct value |
cleaning_log_old_value_column |
cleaning log column specifying the old value |
cleaning_log_added_survey_value |
Value in the change type column which marks newly added surveys |
cleaning_log_no_change_value |
Value(s) in the change type column which mark entries where no action is needed |
deletion_log |
deletion log |
deletion_log_uuid_column |
Unique ID column name of deletion log |
check_for_deletion_log |
TRUE to flag removed surveys |
A log of the discrepancies found between the raw dataset, the clean dataset, the cleaning log and the deletion log.
## Not run: 
deletion_log <- cleaningtools::cleaningtools_cleaning_log |>
  dplyr::filter(change_type == "remove_survey")

cleaning_log <- cleaningtools::cleaningtools_cleaning_log |>
  dplyr::filter(change_type != "remove_survey")

review_cleaning(
  raw_dataset = cleaningtools::cleaningtools_raw_data,
  raw_dataset_uuid_column = "X_uuid",
  clean_dataset = cleaningtools::cleaningtools_clean_data,
  clean_dataset_uuid_column = "X_uuid",
  cleaning_log = cleaning_log,
  cleaning_log_uuid_column = "X_uuid",
  cleaning_log_question_column = "questions",
  cleaning_log_new_value_column = "new_value",
  cleaning_log_old_value_column = "old_value",
  deletion_log = deletion_log,
  deletion_log_uuid_column = "X_uuid",
  check_for_deletion_log = T
)

## End(Not run)
Reviews the cleaning log. It can be used before create_clean_data to check the cleaning log. Possible flags are:
Value in change_response_value not found
question_does_not_exist: question does not exist
uuid_does_not_exist: cannot find a UUID
na_in_change_type: change_type is NA
review_cleaning_log( raw_dataset, raw_data_uuid_column = "uuid", cleaning_log, cleaning_log_uuid_column = "uuid", cleaning_log_question_column, cleaning_log_new_value_column, cleaning_log_change_type_column, change_response_value = "change_response" )
raw_dataset |
Raw dataset |
raw_data_uuid_column |
uuid column in the raw dataset. Default is "uuid". |
cleaning_log |
cleaning log (data.frame) |
cleaning_log_uuid_column |
uuid column in the cleaning log. Default is "uuid". |
cleaning_log_question_column |
column in the cleaning log which specifies the dataset column to change |
cleaning_log_new_value_column |
cleaning log column specifying the new correct value |
cleaning_log_change_type_column |
column in the cleaning log which specifies the type of change to be made |
change_response_value |
value(s) in the change type column indicating that the response should be changed to a new value. |
A review of the cleaning log with the flagged entries and a note explaining each flag.
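A short usage sketch with the bundled example data. The dataset objects and column names ("X_uuid", "questions", "new_value", "change_type") are assumptions carried over from the review_cleaning example above, not stated in this entry.

review_cleaning_log(
  raw_dataset = cleaningtools::cleaningtools_raw_data,
  raw_data_uuid_column = "X_uuid",
  cleaning_log = cleaningtools::cleaningtools_cleaning_log,
  cleaning_log_uuid_column = "X_uuid",
  cleaning_log_question_column = "questions",
  cleaning_log_new_value_column = "new_value",
  cleaning_log_change_type_column = "change_type",
  change_response_value = "change_response"
)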
Reviews discrepancies between the Kobo relevancies and the dataset.
review_others( dataset, uuid_column = "uuid", kobo_survey, sm_separator = ".", information_to_add = NULL, columns_not_to_check = NULL )
dataset |
dataset to be checked |
uuid_column |
uuid column in the dataset. Default is "uuid". |
kobo_survey |
Kobo survey sheet. |
sm_separator |
Separator for select multiple questions. The default is "." |
information_to_add |
Optional string vector of columns to add to the log (e.g. today, village). |
columns_not_to_check |
Columns to exclude from the checks |
A log of any discrepancies between the Kobo relevancies and the dataset.
review_others(
  dataset = cleaningtools::cleaningtools_clean_data,
  uuid_column = "X_uuid",
  kobo_survey = cleaningtools::cleaningtools_survey
)
Compares the sample frame with the clean data
review_sample_frame_with_dataset( sample_frame, sampling_frame_strata_column, sampling_frame_target_survey_column = "Total.no.of.HH", clean_dataset, clean_dataset_strata_column, consent_column = "consent", consent_yes_value = "yes" )
sample_frame |
A sample frame. It must have a column for strata and another column defining the number of surveys needed per stratum. |
sampling_frame_strata_column |
Column name for strata in the sample frame |
sampling_frame_target_survey_column |
Column name defining the number of surveys per stratum in the sample frame. |
clean_dataset |
Clean data |
clean_dataset_strata_column |
Column name for strata in clean data |
consent_column |
Dataset consent column |
consent_yes_value |
Choice for consent yes |
a dataset with two additional columns: the number of completed surveys and the number of remaining surveys.
review_sample_frame_with_dataset(
  sample_frame = cleaningtools::cleaningtools_sample_frame,
  sampling_frame_strata_column = "Neighbourhood",
  sampling_frame_target_survey_column = "Total.no.of.HH",
  clean_dataset = cleaningtools::cleaningtools_clean_data,
  clean_dataset_strata_column = "neighbourhood",
  consent_column = "consent_remote",
  consent_yes_value = "yes"
)
This function checks whether the provided Kobo choices dataframe meets certain criteria:
It is a dataframe.
It is not empty.
It contains columns named "list_name" and "name".
verify_valid_choices(kobo_choices)
kobo_choices |
A dataframe representing the Kobo choices sheet. |
Logical. Returns TRUE if the dataframe meets all the criteria; otherwise, it returns FALSE.
# Assume df_valid_choices is a valid Kobo choices dataframe
df_valid_choices <- data.frame(
  list_name = c("ChoiceA", "ChoiceB"),
  name = c("Option1", "Option2")
)
verify_valid_choices(df_valid_choices) # should return TRUE

# Assume df_invalid_choices lacks the required columns
df_invalid_choices <- data.frame(
  column1 = c("ChoiceA", "ChoiceB"),
  column2 = c("Option1", "Option2")
)
verify_valid_choices(df_invalid_choices) # should return FALSE
This function checks whether the provided Kobo survey dataframe meets certain criteria:
It is a dataframe.
It is not empty.
It contains columns named "type" and "name".
verify_valid_survey(kobo_survey)
kobo_survey |
A dataframe representing the Kobo survey sheet. |
Logical. Returns TRUE if the dataframe meets all the criteria; otherwise, it returns FALSE.
# Assume df_valid is a valid Kobo survey dataframe
df_valid <- data.frame(
  type = c("integer", "select_one yesno"),
  name = c("age", "consent")
)
verify_valid_survey(df_valid) # should return TRUE

# Assume df_invalid lacks the required columns
df_invalid <- data.frame(
  column1 = c("integer", "select_one yesno"),
  column2 = c("age", "consent")
)
verify_valid_survey(df_invalid) # should return FALSE