Cleaned up the code, provided additional data checks and instructions on how to fix errors. Refactored the code so it now uses "common.R" and "VSI.R". By doing this, all other "tools" by the WP8 team can be streamlined and made more user friendly.
170 lines
4.6 KiB
R
170 lines
4.6 KiB
R
# checks the main data, returns FALSE in case something is not OK
|
|
# for VSI, we need at least: idno, cntry, VSI1, VSI2, VSI3
|
|
|
|
check_main_file <- function (dataset) {
|
|
|
|
if (!is.null(dataset$VSI1) && !is.null(dataset$VSI2) && !is.null(dataset$VSI3)
|
|
&& !is.null(dataset$idno) && !is.null(dataset$cntry)) {
|
|
|
|
return (TRUE)
|
|
|
|
} else {
|
|
if (is.null(dataset$VSI1)) {
|
|
cat ("VSI1 variable is missing in the main dataset. ");
|
|
}
|
|
if (is.null(dataset$VSI2)) {
|
|
cat ("VSI2 variable is missing in the main dataset. ");
|
|
}
|
|
if (is.null(dataset$VSI3)) {
|
|
cat ("VSI3 variable is missing in the main dataset. ");
|
|
}
|
|
if (is.null(dataset$idno)) {
|
|
cat ("idno variable (interview ID) is missing in the main dataset. ");
|
|
}
|
|
if (is.null(dataset$cntry)) {
|
|
cat ("cntry variable (country code) is missing in the main dataset. ");
|
|
}
|
|
|
|
cat ("**Please check your dataset and try again**")
|
|
return (FALSE)
|
|
}
|
|
}
|
|
|
|
# checks the intervier data, returns FALSE in case something is not OK
|
|
# here, we need at least: idno, cntry, intnum
|
|
check_inwer_file <- function (dataset) {
|
|
|
|
if (!is.null(dataset$cntry) && !is.null(dataset$idno) && !is.null(dataset$intnum)) {
|
|
|
|
return (TRUE)
|
|
|
|
} else {
|
|
if (is.null(dataset$intnum)) {
|
|
cat ("intnum (intervie**wer** ID variable is missing in the interviewer dataset. ");
|
|
}
|
|
if (is.null(dataset$idno)) {
|
|
cat ("idno variable (interview ID) is missing in the interviewer dataset. ");
|
|
}
|
|
if (is.null(dataset$cntry)) {
|
|
cat ("cntry variable (country code) is missing in the interviewer dataset. ");
|
|
}
|
|
cat ("**Please check your dataset and try again**")
|
|
return (FALSE)
|
|
}
|
|
}
|
|
|
|
|
|
|
|
# Extracts access points fingerprints from the data (5 APs from 1 VSI)
|
|
extract_ap <- function (source) {
|
|
return (c(substr(source,0,7),
|
|
substr(source,16,23),
|
|
substr(source,32,39),
|
|
substr(source,48,55),
|
|
substr(source,64,71)));
|
|
}
|
|
|
|
# Checks whether at least 3 of 5 APs are present during an interview
|
|
match_within <- function (t1,t2,t3) {
|
|
|
|
matched_within = 0
|
|
|
|
for (a in seq(1,5)) {
|
|
|
|
if (t1[a] %in% t2 && t1[a] %in% t3) {
|
|
matched_within=matched_within + 1
|
|
}
|
|
}
|
|
if (matched_within >= 3) {
|
|
return (TRUE)
|
|
}
|
|
else {
|
|
return (FALSE)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
# Checks whether any of other cases have 3 or more same APs (i.e. same location)
|
|
|
|
match_outside <- function (a_t1, a_t2, a_t3, b_t1, b_t2, b_t3) {
|
|
|
|
matched = FALSE
|
|
|
|
# only take cases that match !WITHIN!
|
|
if (match_within(a_t1, a_t2, a_t3) == FALSE || match_within(b_t1, b_t2, b_t3) == FALSE) {
|
|
return (FALSE) # no match, as we can't even do it within
|
|
}
|
|
|
|
|
|
matches=0
|
|
|
|
# we iterate through all 5 APs. If at least three are repeated across
|
|
# all time points in both surveys (a_t1... b_t3) we have a match
|
|
|
|
for (a in seq(1,5)) {
|
|
|
|
if (a_t1[a] %in% b_t1 && a_t1[a] %in% b_t2 && a_t1[a] %in% b_t3 &&
|
|
a_t2[a] %in% b_t1 && a_t2[a] %in% b_t2 && a_t2[a] %in% b_t3 &&
|
|
a_t3[a] %in% b_t1 && a_t3[a] %in% b_t2 && a_t3[a] %in% b_t3)
|
|
{
|
|
matches=matches+1
|
|
}
|
|
}
|
|
|
|
if (matches >= 3) {
|
|
return (TRUE)
|
|
}
|
|
else {
|
|
return (FALSE)
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
# Checks existing pairings and add it to appropriate position
|
|
# example:
|
|
# in raw data, matches are found between cases 22<->23; 23<->38; 40<->41
|
|
|
|
# This can be summarized into:
|
|
# Matched cases: 22, 23, 38 ; as well as 40, 41
|
|
|
|
# This function takes a pair (e.g. c(23,38)) and checks if any of them is
|
|
# already in any of the matches.
|
|
|
|
# if it is, it adds new case to the appropriate list.
|
|
# if not, it adds a new offset with the new pair
|
|
|
|
group_pairs <- function (groups, pair) {
|
|
|
|
# if we receive an empty list, just populate it...
|
|
if (length(groups) == 0) {
|
|
groups <- list (c(pair[1], pair[2]))
|
|
return (groups)
|
|
}
|
|
|
|
# find both elements in the groupings
|
|
loc_1 <- which(sapply(groups, FUN=function(X) pair[1] %in% X))
|
|
loc_2 <- which(sapply(groups, FUN=function(X) pair[2] %in% X))
|
|
|
|
# none in existing list
|
|
if (length(loc_1) == 0 && length(loc_2) == 0) {
|
|
|
|
groups[[length(groups)+1]] <- c(pair[1], pair[2])
|
|
|
|
} else if (length(loc_1) != 0 && length(loc_2) == 0) {
|
|
|
|
# let's add the second (not present) to the first's offset
|
|
groups[[loc_1]] <- c(groups[[loc_1]], pair[2])
|
|
|
|
} else if (length(loc_1) == 0 && length(loc_2) != 0) {
|
|
|
|
# let's add the first (not present) to the second's offset
|
|
groups[[loc_2]] <- c(groups[[loc_2]], pair[1])
|
|
|
|
}
|
|
# yes, we can have both present (else) - no need to do anything then.
|
|
return (groups)
|
|
}
|