Implement BM25+ ranking algorithm

2024-06-02 23:07:58 +03:00 · 2024-06-02 23:07:58 +03:00 · ab0dea9404
commit ab0dea9404
parent 25a47d62d6
3 changed files with 262 additions and 0 deletions
--- a/.idea/.name
+++ b/.idea/.name
@ -0,0 +1 @@
+DSA
--- a/src/main/kotlin/ranking/bm25/Bm25Plus.kt
+++ b/src/main/kotlin/ranking/bm25/Bm25Plus.kt
@ -0,0 +1,187 @@
+package dev.nuculabs.dsa.ranking.bm25
+
+import java.lang.Double.isFinite
+import java.util.HashMap
+import kotlin.math.log10
+
+/**
+ * Document models a simple document which contains a numeric id and text.
+ */
+data class Document(val id: Int, val text: String)
+
+/**
+ * TokenizedDocument models a document which is tokenized using a simple strategy tokenization strategy.
+ */
+data class TokenizedDocument(val document: Document, private val text: String) {
+    private var tokens: List<String> = document.text.split(" ").map { token ->
+        token.filter { it.isLetterOrDigit() }.lowercase()
+    }.filter {
+        it.isNotEmpty()
+    }
+
+    fun getTokens(): List<String> {
+        return tokens
+    }
+
+    override fun equals(other: Any?): Boolean {
+        if (this === other) return true
+        if (javaClass != other?.javaClass) return false
+
+        other as TokenizedDocument
+
+        return document == other.document
+    }
+
+    override fun hashCode(): Int {
+        return document.hashCode()
+    }
+
+}
+
+/**
+ * BM25+ is a variation of the BM25 ranking algorithm.
+ *
+ * The algorithm is implemented using the following paper as a reference.
+ * http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf
+ */
+class Bm25Plus {
+    /**
+     * The storage holds a mapping of document id -> document.
+     */
+    private var storage: MutableMap<Int, TokenizedDocument> = HashMap()
+
+    /**
+     * The term frequency index holds a mapping of term -> list of documents in which the term occurs.
+     */
+    private var termFrequencyIndex: MutableMap<String, HashSet<Int>> = HashMap()
+
+    /**
+     * The tuning parameters are used to tune the result of the algorithm.
+     *
+     * These values were taken directly from the paper.
+     */
+    private var tuningParameterB: Double = 0.3
+    private var tuningParameterK1: Double = 1.6
+    private var tuningParameterDelta: Double = 0.7
+
+    private var totalTokens: Int = 0
+    private var meanDocumentLengths: Double = 0.0
+
+    /**
+     * Returns the number of indexed documents.
+     */
+    fun indexSize(): Int {
+        return storage.size
+    }
+
+    /**
+     * Indexes a document.
+     */
+    fun index(document: Document) {
+        // Tokenize the document, for educational purposes and simplicity we will consider tokens only
+        // the words delimited by a space and transform them into lowercase.
+        val tokenizedDocument = TokenizedDocument(document, document.text)
+
+        // Document does not exist in index
+        if (!storage.containsKey(document.id)) {
+            storage[document.id] = tokenizedDocument
+
+            totalTokens += tokenizedDocument.getTokens().size
+            meanDocumentLengths = (totalTokens / storage.size).toDouble()
+
+            // Index all tokens
+            tokenizedDocument.getTokens().forEach {
+                if (termFrequencyIndex.containsKey(it)) {
+                    termFrequencyIndex[it]?.add(document.id)
+                } else {
+                    termFrequencyIndex[it] = HashSet()
+                    termFrequencyIndex[it]?.add(document.id)
+                }
+            }
+        }
+    }
+
+    /**
+     * Indexes all the documents.
+     */
+    fun indexAll(vararg documents: Document) {
+        documents.forEach {
+            index(it)
+        }
+    }
+
+    /**
+     * Queries documents using the given term and returns a list of documents which contain the term ordered by
+     * relevance.
+     *
+     */
+    fun termQuery(term: String): List<Pair<Double, Document>> {
+        val documentIds = termFrequencyIndex[term.lowercase()] ?: return emptyList()
+        // Compute the RSV for each document.
+        return documentIds.map {
+            val document = storage[it] ?: return@map null
+            val documentRsv = computeRsv(term.lowercase(), document)
+            return@map documentRsv to document.document
+            // Sort results by highest score and filter out Infinity scores, which mean that the term does not exist.
+        }.filterNotNull().filter { isFinite(it.first) }.sortedByDescending { it.first }
+    }
+
+    /**
+     * Queries documents using the given terms and returns a list of documents which contain the terms ordered by
+     * relevance.
+     *
+     */
+    fun termsQuery(vararg terms: String): List<Pair<Double, Document>> {
+
+        val documentIds = terms.map { term ->
+            Pair(term, termFrequencyIndex[term.lowercase()] ?: mutableSetOf())
+        }.reduce { acc, pair ->
+            // add all documents which contain them terms to the documents set.
+            acc.second.addAll(pair.second)
+            // return
+            acc
+        }.second
+
+        // Compute the terms RSV sum for each document.
+        return documentIds.map {
+            val document = storage[it] ?: return@map null
+            val documentRsv: Double = terms.sumOf { term -> computeRsv(term.lowercase(), document) }
+            return@map documentRsv to document.document
+            // Sort results by highest score and filter out Infinity scores, which mean that the term does not exist.
+        }.filterNotNull().filter { isFinite(it.first) }.sortedByDescending { it.first }
+    }
+
+    /**
+     * Computes the inverse document frequency for a given term.
+     *
+     * THe IDF is defined as the total number of documents (N) divided by the documents that contain the term (dft).
+     * In the BM25+ version the IDF is the (N+1)/(dft)
+     */
+    private fun computeInverseDocumentFrequency(term: String): Double {
+        val numberOfDocumentsContainingTheTerm = termFrequencyIndex[term]?.size ?: 0
+        return (storage.size + 1) / numberOfDocumentsContainingTheTerm.toDouble()
+    }
+
+    /**
+     * Computes the RSV for the given term and document.
+     * The RSV (Retrieval Status Value) is computed for every document using the BM25+ formula from the paper.
+     */
+    private fun computeRsv(
+        term: String,
+        document: TokenizedDocument
+    ): Double {
+        val inverseDocumentFrequencyLog: Double = log10(computeInverseDocumentFrequency(term.lowercase()))
+        val termOccurringInDocumentFrequency: Double =
+            document.getTokens().filter { token -> token == term.lowercase() }.size.toDouble()
+        val documentLength: Double = document.getTokens().size.toDouble()
+
+        val score =
+            inverseDocumentFrequencyLog *
+                    (
+                            ((tuningParameterK1 + 1) * termOccurringInDocumentFrequency) /
+                                    ((tuningParameterK1 * ((1 - tuningParameterB) + tuningParameterB * (documentLength / meanDocumentLengths))) + termOccurringInDocumentFrequency)
+                                    + tuningParameterDelta
+                            )
+        return score
+    }
+}
--- a/src/test/kotlin/ranking/bm25/BM25PlusTest.kt
+++ b/src/test/kotlin/ranking/bm25/BM25PlusTest.kt
@ -0,0 +1,74 @@
+package ranking.bm25
+
+import dev.nuculabs.dsa.ranking.bm25.Bm25Plus
+import dev.nuculabs.dsa.ranking.bm25.Document
+import kotlin.test.Test
+import kotlin.test.assertEquals
+
+class BM25PlusTest {
+    @Test
+    fun test_index_and_indexSize() {
+        // Setup
+        val bm25Plus = Bm25Plus()
+
+        val document1 = Document(1, "Ana are mere")
+        val document2 = Document(2, "Ana Ana Ana Ana Ana Ana Ana Ana")
+
+        // Test
+        bm25Plus.indexAll(document1, document2)
+
+        // Assert
+        assertEquals(2, bm25Plus.indexSize())
+    }
+
+    @Test
+    fun test_termQuery() {
+        // Given
+        val bm25Plus = Bm25Plus()
+
+        val document1 = Document(1, "Ana are mere")
+        val document2 = Document(2, "Ana Ana Ana Ana Ana Ana Ana Ana")
+
+        // Then
+        bm25Plus.index(document1)
+        bm25Plus.index(document2)
+
+        assertEquals(
+            listOf(0.4936823874431607 to document2, 0.3133956394555762 to document1),
+            bm25Plus.termQuery("Ana")
+        )
+        assertEquals(listOf(0.8491490237651933 to document1), bm25Plus.termQuery("mere"))
+        assertEquals(listOf(), bm25Plus.termQuery("batman"))
+        assertEquals(
+            listOf(0.4936823874431607 to document2, 0.3133956394555762 to document1),
+            bm25Plus.termQuery("ana")
+        )
+    }
+
+    @Test
+    fun test_termsQuery() {
+        // Setup
+        val bm25Plus = Bm25Plus()
+
+        val document1 = Document(
+            1,
+            "A linked list is a fundamental data structure which consists of Nodes that are connected to each other."
+        )
+        val document2 =
+            Document(2, "The Linked List data structure permits the storage of data in an efficient manner.")
+        val document3 =
+            Document(3, "The space and time complexity of the linked list operations depends on the implementation.")
+        val document4 = Document(
+            4,
+            "The operations that take O(N) time takes this much because you have to traverse the list’s for at least N nodes in order to perform it successfully. On the other hand, operations that take O(1) time do not require any traversals because the list holds pointers to the head first Node and tail last Node."
+        )
+
+        bm25Plus.indexAll(document1, document2, document3, document4)
+
+        // Test
+        val results = bm25Plus.termsQuery("linked", "list", "complexity")
+
+        // Assert
+        assertEquals(1.5966769323799244 to document3, results.first())
+    }
+}