FlexRL-vignette

library(FlexRL)
#> If you are happy with FlexRL, please cite us! Also, if you are unhappy, please cite us anyway.
#> HERE ADD
#> bibtex format in CITATION.

FlexRL is an algorithm for Record Linkage written in R and C++. It uses a Stochastic Expectation Maximisation approach to combine 2 data sources and outputs the set of records referring to the same entities.

More details are provided in our paper.

This vignette uses example subsets from the SHIW data.

# Load the two example extracts of the SHIW survey (2016 and 2020 waves).
# The first CSV column holds the row names; the remaining columns are the
# survey variables, including the true identifier ID used later for evaluation.
df2016 = read.csv("exA.csv", row.names = 1)
df2020 = read.csv("exB.csv", row.names = 1)
head(df2016)
#>   ANASCI SESSO STACIV STUDIO IREG       ID ANNO PAR NASCREG ETA
#> 1   1953     1      1      2    2 501068_1 2016   1      19  63
#> 2   1960     2      1      2    2 501068_2 2016   2      19  56
#> 3   1955     1      1      3    2 501082_1 2016   1       2  61
#> 4   1958     2      1      4    2 501082_2 2016   2       2  58
#> 5   1961     2      1      3    2 680914_1 2016   1       2  55
#> 6   1965     1      1      3    2 680914_2 2016   2       2  51
head(df2020)
#>   ANASCI SESSO STACIV STUDIO IREG       ID ANNO PAR NASCREG ETA
#> 1   1953     1      1      2    2 501068_1 2020   1      19  67
#> 2   1960     2      1      3    2 501068_2 2020   2      19  60
#> 3   1955     1      1      3    2 501082_1 2020   1       2  65
#> 4   1958     2      1      4    2 501082_2 2020   2       2  62
#> 5   1961     2      3      3    2 686717_1 2020   1      18  59
#> 6   1998     1      2      3    2 686717_4 2020   3       2  22

We will use several Partially Identifying Variables to link the records: birth year, sex, marital status, education level, regional code. We treat them all as stable, i.e. not evolving across time (though this may not be true). We show an example with PIVs that evolve over time later.

# Configuration of the Partially Identifying Variables (PIVs): each entry
# declares whether the variable is treated as stable over time.
PIVs_config = list( 
  ANASCI     = list(stable = TRUE),
  SESSO      = list(stable = TRUE),
  STACIV     = list(stable = TRUE),
  STUDIO     = list(stable = TRUE), 
  IREG       = list(stable = TRUE) 
  )
PIVs = names(PIVs_config)
# Type-stable extraction of the stability flags (vapply guarantees a named
# logical vector, unlike sapply whose return type depends on the input).
PIVs_stable = vapply(PIVs_config, function(x) x$stable, logical(1))

A first step in record linkage is to remove the records for which the linking variables are outside of the common support. If a record in the first file indicates a birth year which does not appear in the second file, we can suppose that this record has no link in the second file.

We need to reinitialise the rownames to be used as indices later (to compare the true pairs and the linked pairs for instance).

# Restrict both files to the common support of each PIV: a record whose
# observed value never occurs in the other file cannot have a link there.
# NA is kept in the allowed set so records with missing values survive
# the filter (a missing value is compatible with any support).
for(piv in PIVs){
  intersect_support_piv = intersect( unique(df2016[,piv]), unique(df2020[,piv]) )
  df2016 = df2016[df2016[,piv] %in% c(NA,intersect_support_piv),]
  df2020 = df2020[df2020[,piv] %in% c(NA,intersect_support_piv),]
}

# Reinitialise the row names so they can be used as row indices later
# (seq_len is safe even if a file ends up with 0 rows, unlike 1:nrow).
rownames(df2016) = seq_len(nrow(df2016))
rownames(df2020) = seq_len(nrow(df2020))

For these example data sets we know the true linkage structure.

# IDs present in both files define the true links (used only for evaluation).
links = intersect(df2016$ID, df2020$ID)
Nlinks = length(links)

# Build the true linkage structure: one row per shared ID, holding the row
# name (row index) of that record in df2016 and in df2020. Rows are collected
# in a preallocated list and bound once, instead of growing a data.frame with
# rbind() inside the loop (which copies the whole frame at every iteration).
pairs_list = vector("list", Nlinks)
for (i in seq_len(Nlinks))
{
  id = links[i]
  id16 = which(df2016$ID == id)
  id20 = which(df2020$ID == id)
  pairs_list[[i]] = cbind(rownames(df2016[id16,]), rownames(df2020[id20,]))
}
if (Nlinks > 0) {
  TrueDelta = data.frame(do.call(rbind, pairs_list))
} else {
  TrueDelta = data.frame( matrix(0, nrow=0, ncol=2) )
}
# "idx16_idx20" keys for the true pairs, compared later with the linked pairs.
true_pairs = do.call(paste, c(TrueDelta, list(sep="_")))
nrow(df2016)
#> [1] 49
nrow(df2020)
#> [1] 42
Nlinks
#> [1] 27

We can look at the level of agreement of the PIVs in the set of true links, and the proportion of missing values:

# Proportion of true links on which each PIV agrees across the two files
# (values below 1 reveal registration errors or instability over time).
colSums(df2016[TrueDelta[,1],PIVs] == df2020[TrueDelta[,2],PIVs]) / Nlinks
#>    ANASCI     SESSO    STACIV    STUDIO      IREG 
#> 1.0000000 1.0000000 0.9259259 0.7777778 1.0000000
# Proportion of missing values per PIV among the true links, in each file.
colSums(is.na(df2016[TrueDelta[,1],PIVs])) / Nlinks
#> ANASCI  SESSO STACIV STUDIO   IREG 
#>      0      0      0      0      0
colSums(is.na(df2020[TrueDelta[,2],PIVs])) / Nlinks
#> ANASCI  SESSO STACIV STUDIO   IREG 
#>      0      0      0      0      0

FlexRL needs a specific encoding of the data to run properly. The categorical observed values in the data should be mapped to the set of natural numbers; missing values should be encoded as 0. The algorithm requires a “source” column, and it denotes the biggest data set by B.

# Tag each record with its file of origin, as required by the algorithm.
df2016$source = "df2016"
df2020$source = "df2020"

# FlexRL denotes the biggest data set by B; assign A/B accordingly
# (ties go to df2016 as B).
if(nrow(df2020)>nrow(df2016)){
  encodedA = df2016
  encodedB = df2020
  cat("df2020 is the largest file, denoted encodedB")
}else{
  encodedB = df2016
  encodedA = df2020
  cat("df2016 is the largest file, denoted encodedB")
}
#> df2016 is the largest file, denoted encodedB

# Pool both files to build one common set of levels per PIV, so the same
# observed value maps to the same integer code in A and in B.
levels_PIVs = lapply(PIVs, function(x) levels(factor(as.character(c(encodedA[,x], encodedB[,x])))))

# Map each PIV to the natural numbers 1..nvalues via the pooled levels
# (seq_along is safe even if PIVs were empty, unlike 1:length).
for(i in seq_along(PIVs))
{
  encodedA[,PIVs[i]] = as.numeric(factor(as.character(encodedA[,PIVs[i]]), levels=levels_PIVs[[i]]))
  encodedB[,PIVs[i]] = as.numeric(factor(as.character(encodedB[,PIVs[i]]), levels=levels_PIVs[[i]]))
}
# Number of observed categories per PIV (vapply is type-stable, unlike sapply).
nvalues = vapply(levels_PIVs, length, integer(1))
names(nvalues) = PIVs

# FlexRL encodes missing values as 0 (valid category codes start at 1).
encodedA[,PIVs][ is.na(encodedA[,PIVs]) ] = 0
encodedB[,PIVs][ is.na(encodedB[,PIVs]) ] = 0

The number of unique values per PIV gives information on their discriminative power:

# A PIV with a single observed category (IREG here) cannot discriminate
# between records: every comparison on it agrees.
nvalues
#> ANASCI  SESSO STACIV STUDIO   IREG 
#>     28      2      4      5      1

We can now launch FlexRL:

We elaborate on all the parameters below.

# Input bundle for FlexRL::stEM. The control* / phi* entries appear to take
# one value per PIV (5 here) — see the FlexRL documentation for their exact
# semantics; NOTE(review): presumably NA means "estimate, do not fix".
data = list( A                    = encodedA,
             B                    = encodedB, 
             Nvalues              = nvalues,
             PIVs_config          = PIVs_config,
             controlOnMistakes    = c(TRUE, TRUE, FALSE, FALSE, FALSE),
             sameMistakes         = TRUE,
             phiMistakesAFixed    = FALSE,
             phiMistakesBFixed    = FALSE,
             phiForMistakesA      = c(NA, NA, NA, NA, NA),
             phiForMistakesB      = c(NA, NA, NA, NA, NA)
             )

# Stochastic EM fit: StEM iterations with a burn-in, each running a Gibbs
# sampler with its own iterations/burn-in. saveInfoIter=FALSE keeps no
# per-iteration files; newDirectory=NULL writes nothing to disk.
fit = FlexRL::stEM(  data                 = data,
             StEMIter             = 50,
             StEMBurnin           = 30,
             GibbsIter            = 100, 
             GibbsBurnin          = 70,
             musicOn              = TRUE,
             newDirectory         = NULL,
             saveInfoIter         = FALSE
             )

The algorithm returns:

# Estimated linkage structure: one row per candidate pair with its posterior
# link probability. First column indexes encodedA (df2020 here), second
# indexes encodedB (df2016 here) — cf. the A/B assignment above.
DeltaResult = fit$Delta
colnames(DeltaResult) = c("idx20","idx16","probaLink")
# Keep only the pairs linked with probability above 0.5.
DeltaResult = DeltaResult[DeltaResult$probaLink>0.5,]
DeltaResult
#>    idx20 idx16 probaLink
#> 1      1     1     0.970
#> 3     21     3     0.616
#> 4      4     4     0.970
#> 6     38     7     0.789
#> 10    31    11     0.503
#> 11     6    14     0.991
#> 12     7    15     0.898
#> 13     8    16     0.962
#> 14     9    17     0.970
#> 15    10    18     0.974
#> 16    11    19     0.992
#> 17    12    20     0.980
#> 18    36    21     0.587
#> 19    13    22     0.937
#> 20    14    23     0.957
#> 21    15    24     0.991
#> 26    42    29     0.672
#> 27    24    30     0.776
#> 29    19    34     0.951
#> 30    30    36     0.984
#> 31    20    37     0.860
#> 32     3    38     0.612
#> 34     5    39     0.524
#> 37     2    41     0.730
#> 39    26    43     0.879
#> 45    28    46     0.815
#> 46    29    48     0.515

We can then compare the linked records with the true links:

# Compare the linked pairs with the true pairs and compute standard record
# linkage performance metrics.
results = data.frame( Results=matrix(NA, nrow=6, ncol=1) )
rownames(results) = c("tp","fp","fn","f1score","fdr","sens.")
if(nrow(DeltaResult)>0){   # was >1, which silently skipped a single linked pair
  # "idx16_idx20" keys, built in the same column order as true_pairs above.
  linked_pairs    = do.call(paste, c(DeltaResult[,c("idx16","idx20")], list(sep="_")))
  truepositive    = length( intersect(linked_pairs, true_pairs) ) 
  falsepositive   = length( setdiff(linked_pairs, true_pairs) ) 
  falsenegative   = length( setdiff(true_pairs, linked_pairs) ) 
  precision       = truepositive / (truepositive + falsepositive) 
  fdr             = 1 - precision
  sensitivity     = truepositive / (truepositive + falsenegative) 
  # F1 is the harmonic mean of precision and sensitivity
  # (NaN if both are 0, i.e. no true positive at all).
  f1score         = 2 * (precision * sensitivity) / (precision + sensitivity)
  results[,"FlexRL"] = c(truepositive,falsepositive,falsenegative,f1score,fdr,sensitivity)
}
results
#>         Results     FlexRL
#> tp           NA 17.0000000
#> fp           NA 10.0000000
#> fn           NA 10.0000000
#> f1score      NA  0.6296296
#> fdr          NA  0.3703704
#> sens.        NA  0.6296296

The level of agreement among the linked records differs from the one in the true links:

# Agreement rates of the PIVs among the linked pairs; idx20 indexes encodedA
# (df2020) and idx16 indexes encodedB (df2016). Compare with the true-pairs
# agreement rates computed earlier.
colSums(encodedA[DeltaResult[,"idx20"],PIVs] == encodedB[DeltaResult[,"idx16"],PIVs]) / nrow(DeltaResult)
#>    ANASCI     SESSO    STACIV    STUDIO      IREG 
#> 1.0000000 0.8888889 0.9629630 1.0000000 1.0000000

The missing values and potential mistakes in the registration, the size of the overlapping set of records between the files, the instability of some PIVs, their distribution, and dependencies among them, are obstacles to the linkage.

Several other algorithms for Record Linkage have been developed so far in the literature. We cite some of them in our paper but more can be found. Each has its own qualities and flaws. FlexRL usually outperforms in scenarios where the PIVs have low discriminative power with few expected registration errors; it is particularly efficient when there is enough information to model instability of some PIV changing across time (such as the postal code). More can be found on this topic in the simulation setting of the paper, which is provided in FlexRL-experiments.