FlexRL is a package for Flexible Record Linkage. Its goal is to find the common set of records among 2 data sources. The area of applications of Record Linkage is broad, it can be used to link data from healthcare monitoring studies, to count casualties in conflict zones by combining several registries, … Therefore FlexRL models registration errors (missing values and mistakes in the data) and handles dynamic Partially Identifying Variables that evolve over time (the zipcode can change between the 2 data collections for instance) in order to produce a final set of linked records with posterior probabilities to be linked. The algorithm can take time to run on large data sets but has a low memory footprint and can easily run on standard computers.
This package implements the Stochastic EM approach to Record Linkage described in ‘A flexible model for Record Linkage’. The main article and the supplementary material are available on arxiv.
You can install FlexRL from CRAN with:
install.packages("FlexRL")
library(FlexRL)
Or you can install the development version of FlexRL from its GitHub with one of the following:
::pak("robachowyk/FlexRL") pak
::install_github("robachowyk/FlexRL") remotes
::install_github("robachowyk/FlexRL") devtools
FlexRL
Here is a basic example which shows how to solve a common record linkage task:
library(FlexRL)
# load real data subsets from the vignettes
= read.csv("FlexRL/vignettes/exA.csv", row.names = 1)
df2016 = read.csv("FlexRL/vignettes/exB.csv", row.names = 1)
df2020
# use 5 PIVs birth year, sex, marital status, educational level, regional code;
# we do not have enough information to model instability
# all PIVs are considered stable
= list(
PIVs_config ANASCI = list(stable = TRUE),
SESSO = list(stable = TRUE),
STACIV = list(stable = TRUE),
STUDIO = list(stable = TRUE),
IREG = list(stable = TRUE)
)= names(PIVs_config)
PIVs = sapply(PIVs_config, function(x) x$stable)
PIVs_stable
# we put bounds on the probability of mistakes:
# there should realistically be less than 10% of mistakes in the PIVs
# however for some PIVs which are probably dynamic (though we cannot model it here) it is good to
# not bound the mistakes parameter, which will adapt to the dynamics not taken into account
= c(TRUE, TRUE, FALSE, FALSE, FALSE)
boundMistakes
# filter the data to their common support
for(i in 1:length(PIVs)){
= intersect( unique(df2016[,PIVs[i]]), unique(df2020[,PIVs[i]]) )
intersect_support_piv = df2016[df2016[,PIVs[i]] %in% c(NA,intersect_support_piv),]
df2016 = df2020[df2020[,PIVs[i]] %in% c(NA,intersect_support_piv),]
df2020
}
# reset the rownames
rownames(df2016) = 1:nrow(df2016)
rownames(df2020) = 1:nrow(df2020)
# we know the true linkage structure so we know the true links
= intersect(df2016$ID, df2020$ID)
links = length(links)
Nlinks
= data.frame( matrix(0, nrow=0, ncol=2) )
TrueDelta for (i in 1:Nlinks)
{= links[i]
id = which(df2016$ID == id)
id16 = which(df2020$ID == id)
id20 = rbind(TrueDelta, cbind(rownames(df2016[id16,]),rownames(df2020[id20,])))
TrueDelta
}= do.call(paste, c(TrueDelta, list(sep="_")))
true_pairs
# we need a source column
$source = "df2016"
df2016$source = "df2020"
df2020
# the first dataset (namely source A) has to be the smallest one
# i.e. the second dataset (namely source B) has to be the largest one
if(nrow(df2020)>nrow(df2016)){
= df2016
encodedA = df2020
encodedB cat("df2020 is the largest file, denoted encodedB")
else{
}= df2016
encodedB = df2020
encodedA cat("df2016 is the largest file, denoted encodedB")
}
# encode the PIVs
= lapply(PIVs, function(x) levels(factor(as.character(c(encodedA[,x], encodedB[,x])))))
levels_PIVs
for(i in 1:length(PIVs))
{= as.numeric(factor(as.character(encodedA[,PIVs[i]]), levels=levels_PIVs[[i]]))
encodedA[,PIVs[i]] = as.numeric(factor(as.character(encodedB[,PIVs[i]]), levels=levels_PIVs[[i]]))
encodedB[,PIVs[i]]
}= sapply(levels_PIVs, length)
nvalues names(nvalues) = PIVs
is.na(encodedA[,PIVs]) ] = 0
encodedA[,PIVs][ is.na(encodedB[,PIVs]) ] = 0
encodedB[,PIVs][
= list( A = encodedA,
data B = encodedB,
Nvalues = nvalues,
PIVs_config = PIVs_config,
controlOnMistakes = boundMistakes,
sameMistakes = TRUE,
phiMistakesAFixed = FALSE,
phiMistakesBFixed = FALSE,
phiForMistakesA = c(NA, NA, NA, NA, NA),
phiForMistakesB = c(NA, NA, NA, NA, NA)
)
# launch FlexRL algorithm
= stEM( data = data,
fit StEMIter = 50,
StEMBurnin = 30,
GibbsIter = 50,
GibbsBurnin = 30,
musicOn = TRUE,
newDirectory = NULL,
saveInfoIter = FALSE
)
# collect the output and build the final set of linked records with posterior probability > 0.5
# to ensure that the one-to-one assignment constraint is fulfilled
= fit$Delta
DeltaResult colnames(DeltaResult) = c("idx20","idx16","probaLink")
= DeltaResult[DeltaResult$probaLink>0.5,]
DeltaResult
DeltaResult
# compute the results by comparing the true links to the linked records
# watch out for the order of the source in the outcome, source A here corresponds to df2020
# and source B corresponds to df2016
# in the set of true links we ordered pairs with first df2016 and then df2020
= data.frame( Results=matrix(NA, nrow=6, ncol=1) )
results rownames(results) = c("tp","fp","fn","f1score","fdr","sens.")
if(nrow(DeltaResult)>1){
= do.call(paste, c(DeltaResult[,c("idx16","idx20")], list(sep="_")))
linked_pairs = length( intersect(linked_pairs, true_pairs) )
truepositive = length( setdiff(linked_pairs, true_pairs) )
falsepositive = length( setdiff(true_pairs, linked_pairs) )
falsenegative = truepositive / (truepositive + falsepositive)
precision = 1 - precision
fdr = truepositive / (truepositive + falsenegative)
sensitivity = 2 * (precision * sensitivity) / (precision + sensitivity)
f1score "FlexRL"] = c(truepositive,falsepositive,falsenegative,f1score,fdr,sensitivity)
results[,
}
results
# one can use the simplistic method to assess the difficulty of the task
= launchNaive(PIVs, encodedA, encodedB)
DeltaResult
if(nrow(DeltaResult)>1){
= do.call(paste, c(DeltaResult[,c("idxB","idxA)], list(sep="_")))
linked_pairs truepositive = length( intersect(linked_pairs, true_pairs) )
falsepositive = length( setdiff(linked_pairs, true_pairs) )
falsenegative = length( setdiff(true_pairs, linked_pairs) )
precision = truepositive / (truepositive + falsepositive)
fdr = 1 - precision
sensitivity = truepositive / (truepositive + falsenegative)
f1score = 2 * (precision * sensitivity) / (precision + sensitivity)
results[,"Naive"] = c(truepositive,falsepositive,falsenegative,f1score,fdr,sensitivity)
}
results
Here is another example (using synthetic data to illustrate more elaborated options) which shows you how to solve a common record linkage problem:
library(FlexRL)
= list( V1 = list(stable = TRUE),
PIVs_config V2 = list(stable = TRUE),
V3 = list(stable = TRUE),
V4 = list(stable = TRUE),
V5 = list( stable = FALSE,
conditionalHazard = FALSE,
pSameH.cov.A = c(),
pSameH.cov.B = c()) )
= names(PIVs_config)
PIVs = sapply(PIVs_config, function(x) x$stable)
PIVs_stable = c(6, 7, 8, 9, 15)
Nval = c(500, 800)
NRecords = 300
Nlinks = c(0.02, 0.02, 0.02, 0.02, 0.02)
PmistakesA = c(0.02, 0.02, 0.02, 0.02, 0.02)
PmistakesB = c(0.007, 0.007, 0.007, 0.007, 0.007)
PmissingA = c(0.007, 0.007, 0.007, 0.007, 0.007)
PmissingB = list(V1=c(),V2=c(),V3=c(),V4=c(),V5=c(0.28))
moving_params = TRUE
enforceEstimability = DataCreation( PIVs_config,
DATA
Nval,
NRecords,
Nlinks,
PmistakesA,
PmistakesB,
PmissingA,
PmissingB,
moving_params,
enforceEstimability)= DATA$A
A = DATA$B
B = DATA$Nvalues
Nvalues = DATA$TimeDifference
TimeDifference = DATA$proba_same_H
proba_same_H
# we generate data with an unstable PIV (representing that people move for instance)
plot( sort(proba_same_H_5, decreasing=TRUE), ylim=c(0,1) )plot( TimeDifference, proba_same_H_5, ylim=c(0,1) )
# it is realistic to bound the parameter for mistakes to not exceeds 10%
# (also for the unstable PIV since we model its dynamics)
= c(TRUE, TRUE, TRUE, TRUE, TRUE)
boundMistakes
# the first 1:Nlinks records of each files created are links
= data.frame( matrix(0, nrow=0, ncol=2) )
TrueDelta for (i in 1:Nlinks)
{= rbind(TrueDelta, cbind(rownames(A[i,]),rownames(B[i,])))
TrueDelta
}= do.call(paste, c(TrueDelta, list(sep="_")))
true_pairs
= A
encodedA = B
encodedB
is.na(encodedA[,PIVs]) ] = 0
encodedA[,PIVs][ is.na(encodedB[,PIVs]) ] = 0
encodedB[,PIVs][
= list( A = encodedA,
data B = encodedB,
Nvalues = Nvalues,
PIVs_config = PIVs_config,
controlOnMistakes = boundMistakes,
sameMistakes = TRUE,
phiMistakesAFixed = TRUE,
phiMistakesBFixed = TRUE,
phiForMistakesA = c(NA, NA, NA, NA, 0),
phiForMistakesB = c(NA, NA, NA, NA, 0)
)
= stEM( data = data,
fit StEMIter = 100,
StEMBurnin = 70,
GibbsIter = 200,
GibbsBurnin = 100,
musicOn = TRUE,
newDirectory = NULL,
saveInfoIter = FALSE
)
= fit$Delta
DeltaResult colnames(DeltaResult) = c("idxA","idxB","probaLink")
= DeltaResult[DeltaResult$probaLink>0.5,]
DeltaResult
= data.frame( Results=matrix(NA, nrow=6, ncol=1) )
results rownames(results) = c("tp","fp","fn","f1score","fdr","sens.")
if(nrow(DeltaResult)>1){
= do.call(paste, c(DeltaResult[,c("idxA","idxB")], list(sep="_")))
linked_pairs = length( intersect(linked_pairs, true_pairs) )
truepositive = length( setdiff(linked_pairs, true_pairs) )
falsepositive = length( setdiff(true_pairs, linked_pairs) )
falsenegative = truepositive / (truepositive + falsepositive)
precision = 1 - precision
fdr = truepositive / (truepositive + falsenegative)
sensitivity = 2 * (precision * sensitivity) / (precision + sensitivity)
f1score "FlexRL"] = c(truepositive,falsepositive,falsenegative,f1score,fdr,sensitivity)
results[,
} results
More documentation is accessible on CRAN.
More examples are available in the vignette and in the repository FlexRL-experiments.
For support requests, contact k dot c dot robach at amsterdamumc dot nl.