Use a variety of methods to compare fingerprints

sfutils contains a variety of distance and similarity metrics to compare fingerprints and collections of fingerprints. This vignette explores these methods and how to use them.

# Clean wd
rm(list=ls())
# Load data
data("fps_train")
data("fps_test")
# To list
fps_train <- as.list(fps_train)
fps_test <- as.list(fps_test)

# Unpack
train_lbls_b <- fps_train$label_binomial
train_lbls_m <- fps_train$label_multinomial
fps_train <- fps_train$fingerprints

test_lbls_b <- fps_test$label_binomial
test_lbls_m <- fps_test$label_multinomial
fps_test <- fps_test$fingerprints

We can turn a list of Fingerprint objects to a Collection:

# Turn fingerprints into Collection
col_train <- Collection(fps_train)
col_test <- Collection(fps_test)

We can now make a filter to compare each article to. This is one way to classify the articles.

# Let's see if we can pick out the oil & gas companies
# Create a filter. Currently I've not implemented that either can be empty.
filt <- Filter(name = "crude",
               positive = c("oil", "gas", "saudi-arabia", "middle east",
                            "oil platform", "oil reserves", "fuel", "gasoline", "crude oil",
                            "oil pipeline", "petroleum", 
                            "opec", "oil well", "fracking", "oil field", "energy"))
# We can compare the fingerprint plot of the filter to the intensity plot of articles about crude oil
crude_index <- which(train_lbls_b == "crude")
crude_intense <- Collection(fps_train[crude_index])
plot(crude_intense)
plot(filt)

We can use the do_compare() function to compare fingerprints in R:

# Compare two fingerprints
methods <- c("cosine", "jaccard", "dice", "gilbertwells",
  "dennis", "sorgenfrei", "lancewilliams", "euclid", "hamming")
type <- c(rep("similarity", 6), rep("distance", 3))
# Compute for all methods
mm <- mapply(function(x, y) {
    data.frame(
    "metric" = x,
    "type" = y,
    "value" = round(do_compare(fps_train[[1]], 
                               fps_train[[2]], 
                               method = x)[1,1],
                    digits = 3)
      )
    },
    methods,
    type,
    SIMPLIFY = FALSE)

# To data frame
mm <- do.call(rbind.data.frame, mm)
row.names(mm) <- 1:9

knitr::kable(mm)

metric	type	value
cosine	similarity	0.333
jaccard	similarity	0.200
dice	similarity	0.333
gilbertwells	similarity	1.714
dennis	similarity	34.979
sorgenfrei	similarity	0.111
lancewilliams	distance	0.667
euclid	distance	36.222
hamming	distance	1312.000

It is possible to compare a fingerprint to an entire collection by transforming the collection into a sparse binary matrix. This we can achieve using the canonical as.matrix() command in R:

# Turn test and train into a sparse binary matrix
mtrain <- as.matrix(col_train)
mtest <- as.matrix(col_test)

mtrain[1:5, 1:8]
#> 5 x 8 sparse Matrix of class "dgCMatrix"
#>                                      p1 p2 p3 p4 p5 p6 p7 p8
#> 125fb979-9f92-47ae-bf96-06b408169edf  .  .  .  1  .  .  1  1
#> 9abd155e-1d30-4315-a872-200d1a95cb11  .  .  .  .  .  .  .  1
#> 6e2fbf10-57af-4a28-8dc8-aa3d49c21291  .  .  .  .  .  .  1  1
#> 63d4b328-4518-4e20-9431-70bb3c4c4e34  .  .  .  .  .  .  .  1
#> 487e2756-8f2e-4318-b308-00b16b190a37  .  .  .  .  .  .  .  1

# Compare documents to another fingerprinted document
comp <- do_compare(mtrain, fps_train[[1]], method = "cosine")

knitr::kable(as.data.frame(comp[1:10,]))

	comp[1:10, ]
125fb979-9f92-47ae-bf96-06b408169edf	1.0000000
9abd155e-1d30-4315-a872-200d1a95cb11	0.3333333
6e2fbf10-57af-4a28-8dc8-aa3d49c21291	0.4420732
63d4b328-4518-4e20-9431-70bb3c4c4e34	0.3932927
487e2756-8f2e-4318-b308-00b16b190a37	0.5060976
94e41525-5797-42a6-8f8e-7ea2abee47d1	0.3363821
325282a6-a66c-41cd-b0f7-1865d8e6efc5	0.4400407
cf5d3db4-89a0-4992-8220-198f7e8c5bdf	0.4359756
cd3a4cdc-9c87-4bd3-8158-3271a08a51ad	0.3953252
dfc62156-9a58-48fd-b5ec-6d9976fd7b77	0.4654472

For example, if we’d like to display the two most similar articles to some document, we can do this as follows:

# Quick convenience function
get_most_similar_articles <- function(doc) {
  # Compare documents to another fingerprinted document
  comp <- do_compare(mtrain, doc, method = "cosine")
  # To data frame
  comp_df <- data.frame("sim" = comp[,1],
                        "uuid" = row.names(comp))
  # Order
  comp_df <- comp_df[order(comp_df$sim, decreasing = TRUE), ]
  # Get docs number 2 and 3
  comp_df <- comp_df[2:3, "uuid"]
  # Get text
  texts <- lapply(fps_train, function(x) if(uuid(x) %in% comp_df) text(x))
  # Remove null
  texts <- texts[sapply(texts, function(x) !is.null(x))]
  # Cat
  prnt <- paste0("INPUT DOC: \n\n", 
                 text(doc), "\n\n", 
                 paste0(paste0("RECOMMENDATION: \n\n",
                               unlist(texts)), collapse = "\n\n"))
  cat(prnt)
}
# Test function
get_most_similar_articles(fps_train[[45]])
#> INPUT DOC: 
#> 
#> Lac Minerals Ltd and <Cambior Inc> said
#> they completed a second hole at their jointly owned Doyon mine
#> in Quebec, which showed two significant gold intersections.
#>     One intersection graded 0.33 ounce gold a short ton over 44
#> feet at depth of 1,411 feet, while the other graded 0.22 ounce
#> gold a ton over 23 feet at 2,064 feet, the companies said. The
#> hole is 460 feet east of the previously reported first hole.
#>     They said they were now drilling another hole 460 feet to
#> the west of the first drill hole and expected to report results
#> in late March or early April.
#>  Reuter
#>  
#> 
#> RECOMMENDATION: 
#> 
#> D'Or Val Mines Ltd said a recent
#> drill hole from the surfrace has intersected high-grade ore in
#> a downdip extension of the Discovery Vein in its D'Or Val Mine
#> in northern Quebec.
#>     The company said 42.3 feet of the hole graded 0.92 ounce
#> per short ton of gold, including a 17.5 foot section grading
#> 2.17 ounces.
#>     It said the zone is just below the projection of the
#> seventh level of the mine about 1,450 feet below the surface
#> and 820 feet west of the shaft.
#>     D'Or Val said this find and other recent ones will make
#> substantial contributions to the mine's ore reserves and grade.
#>  Reuter
#>  
#> 
#> RECOMMENDATION: 
#> 
#> Golden North
#> Resource Corp said said surface and underground drilling on the
#> Canty project and Mascot fraction at its Nickel Plate Mountain
#> property in British Columbia returned encouraging gold assays.
#>     It said one Canty hole encountered several mineralized
#> intervals including 11 feet grading 0.342 ounce gold a short
#> ton from 86 to 97 feet and 17 feet grading 0.756 ounce gold ton
#> from 170.5 feet to 187.5 feet.
#>     A Mascot fraction hole returned assays including 0.190
#> ounce gold ton over seven feet between 57 and 64 feet, it said.
#>  Reuter
#>

We can use several methods to perform classification of the documents. Cortical claims that a cosine similarity of 0.3 between two documents or a document and a filter is sufficient to classify two texts. This is a transparent and computationally efficient way to classify documents since the cosine similarity metric is easy to understand. It does, however, take some tinkering to get the filter ‘right’. One could also create multiple filters and compare the documents against those to improve the accuracy of predictions.

Jasper Ginn

2018-05-28