diff --git a/Partner-Core/pom.xml b/Partner-Core/pom.xml index d6eafe5b..4e875887 100644 --- a/Partner-Core/pom.xml +++ b/Partner-Core/pom.xml @@ -65,6 +65,11 @@ cron-utils 9.2.1 + + com.huaban + jieba-analysis + 1.0.2 + 
diff --git a/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/ImpressionTokenizer.kt b/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/ImpressionTokenizer.kt
new file mode 100644
--- /dev/null
+++ b/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/ImpressionTokenizer.kt
@@ -0,0 +1,9 @@
+package work.slhaf.partner.core.cognition.impression.search
+
+/**
+ * Splits text into the distinct terms used to index and query impression documents.
+ */
+interface ImpressionTokenizer {
+    /** Returns the distinct terms found in [text]. */
+    fun tokenize(text: String): Set<String>
+}
diff --git a/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/JiebaImpressionTokenizer.kt b/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/JiebaImpressionTokenizer.kt
new file mode 100644
--- /dev/null
+++ b/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/JiebaImpressionTokenizer.kt
@@ -0,0 +1,51 @@
+package work.slhaf.partner.core.cognition.impression.search
+
+import com.huaban.analysis.jieba.JiebaSegmenter
+
+/**
+ * [ImpressionTokenizer] backed by jieba word segmentation.
+ *
+ * Text is lower-cased and whitespace-collapsed, segmented by [segmenter] in [mode], and the
+ * segments are unioned with the ASCII alphanumeric runs (for example `simpletextsearch` or
+ * `v1.2`) found in the normalized text.
+ */
+class JiebaImpressionTokenizer(
+    private val segmenter: JiebaSegmenter = JiebaSegmenter(),
+    private val mode: JiebaSegmenter.SegMode = JiebaSegmenter.SegMode.SEARCH,
+) : ImpressionTokenizer {
+
+    /** Returns the distinct searchable terms of [text]; empty when [text] is blank. */
+    override fun tokenize(text: String): Set<String> {
+        val normalized = normalize(text)
+        if (normalized.isBlank()) {
+            return emptySet()
+        }
+
+        val jiebaTerms = segmenter.process(normalized, mode)
+            .asSequence()
+            .map { normalize(it.word) }
+            // Segments made only of punctuation or symbols are dropped: indexing them would let
+            // texts that merely share a comma or a full stop match each other.
+            .filter(::isSearchable)
+
+        return (jiebaTerms + alphaNumericTerms(normalized)).toSet()
+    }
+
+    /** Extracts ASCII alphanumeric runs, optionally joined by `-`, `_`, `.` or `/`. */
+    private fun alphaNumericTerms(text: String): Sequence<String> =
+        ALPHA_NUMERIC_REGEX.findAll(text).map { it.value }
+
+    /** True when [term] holds at least one letter or digit; CJK ideographs count as letters. */
+    private fun isSearchable(term: String): Boolean = term.any(Char::isLetterOrDigit)
+
+    /** Lower-cases [text], collapses whitespace runs to a single space and trims the ends. */
+    private fun normalize(text: String): String =
+        text.lowercase()
+            .replace(WHITESPACE_REGEX, " ")
+            .trim()
+
+    companion object {
+        private val WHITESPACE_REGEX = Regex("\\s+")
+        private val ALPHA_NUMERIC_REGEX = Regex("[a-z0-9]+(?:[-_./][a-z0-9]+)*")
+    }
+}
diff --git a/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/SimpleTextSearch.kt b/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/SimpleTextSearch.kt
new file mode 100644
--- /dev/null
+++ b/Partner-Core/src/main/java/work/slhaf/partner/core/cognition/impression/search/SimpleTextSearch.kt
@@ -0,0 +1,170 @@
+package work.slhaf.partner.core.cognition.impression.search
+
+/**
+ * In-memory [ImpressionTextSearch] backed by an inverted index of tokenizer terms.
+ *
+ * Every public operation is `@Synchronized` on this instance, so index mutations and searches
+ * do not interleave.
+ */
+class SimpleTextSearch(
+    private val tokenizer: ImpressionTokenizer = JiebaImpressionTokenizer(),
+) : ImpressionTextSearch {
+
+    /** Indexed documents keyed by document id. */
+    private val documents = linkedMapOf<String, IndexedDocument>()
+
+    /** Maps each term to the ids of the documents whose text produced it. */
+    private val invertedIndex = linkedMapOf<String, MutableSet<String>>()
+
+    /** Drops the current index and indexes [documents]; when an id repeats, the last one wins. */
+    @Synchronized
+    override fun rebuild(documents: Collection<ImpressionSearchDocument>) {
+        this.documents.clear()
+        invertedIndex.clear()
+        documents.forEach(::upsertInternal)
+    }
+
+    /** Indexes [document], replacing any document already stored under the same id. */
+    @Synchronized
+    override fun upsert(document: ImpressionSearchDocument) {
+        upsertInternal(document)
+    }
+
+    /** Removes every indexed document whose target equals [target]. */
+    @Synchronized
+    override fun removeByTarget(target: ImpressionSearchTarget) {
+        documents.values
+            .asSequence()
+            .filter { it.document.target == target }
+            .map { it.document.id }
+            // Collect the ids before removing: removal mutates the map being iterated.
+            .toList()
+            .forEach(::removeByDocumentId)
+    }
+
+    /**
+     * Returns at most [limit] hits for [query], highest score first and ties ordered by id.
+     *
+     * Candidates come from the inverted index; only when the query yields no terms at all is
+     * every document scored, so that a plain substring match can still be found. A blank
+     * [query] or a non-positive [limit] returns an empty list.
+     */
+    @Synchronized
+    override fun search(query: String, limit: Int): List<ImpressionSearchHit> {
+        if (limit <= 0) {
+            return emptyList()
+        }
+
+        val normalizedQuery = normalize(query)
+        if (normalizedQuery.isBlank()) {
+            return emptyList()
+        }
+
+        val queryTerms = tokenizer.tokenize(normalizedQuery)
+        val candidateIds = if (queryTerms.isEmpty()) {
+            documents.keys.toSet()
+        } else {
+            queryTerms
+                .asSequence()
+                .flatMap { invertedIndex[it].orEmpty().asSequence() }
+                .toSet()
+        }
+
+        return candidateIds
+            .asSequence()
+            .mapNotNull { documentId -> documents[documentId] }
+            .mapNotNull { scoreDocument(it, normalizedQuery, queryTerms) }
+            .filter { it.score > 0.0 }
+            .sortedWith(compareByDescending<ImpressionSearchHit> { it.score }.thenBy { it.document.id })
+            .take(limit)
+            .toList()
+    }
+
+    private fun upsertInternal(document: ImpressionSearchDocument) {
+        // Remove any earlier copy first. Without this, a repeated id inside rebuild() would leave
+        // postings for terms the final text no longer contains, and removeByDocumentId() could
+        // never clean them up because it only walks the terms of the stored copy.
+        removeByDocumentId(document.id)
+
+        val normalizedText = normalize(document.text)
+        val terms = tokenizer.tokenize(normalizedText)
+        documents[document.id] = IndexedDocument(document, normalizedText, terms)
+        terms.forEach { term ->
+            invertedIndex.getOrPut(term) { linkedSetOf() }.add(document.id)
+        }
+    }
+
+    /** Removes the document and its postings; terms left without documents are dropped. */
+    private fun removeByDocumentId(documentId: String) {
+        val indexedDocument = documents.remove(documentId) ?: return
+        indexedDocument.terms.forEach { term ->
+            val ids = invertedIndex[term] ?: return@forEach
+            ids.remove(documentId)
+            if (ids.isEmpty()) {
+                invertedIndex.remove(term)
+            }
+        }
+    }
+
+    /**
+     * Scores [indexedDocument] against the query, or returns `null` when it neither shares a
+     * term with the query nor contains the normalized query as a substring.
+     *
+     * score = (matched term count + query coverage + exact-phrase bonus + field bonus) * weight
+     */
+    private fun scoreDocument(
+        indexedDocument: IndexedDocument,
+        normalizedQuery: String,
+        queryTerms: Set<String>,
+    ): ImpressionSearchHit? {
+        val matchedTerms = if (queryTerms.isEmpty()) {
+            emptySet<String>()
+        } else {
+            queryTerms.intersect(indexedDocument.terms)
+        }
+        val exactPhraseMatched = indexedDocument.normalizedText.contains(normalizedQuery)
+
+        if (matchedTerms.isEmpty() && !exactPhraseMatched) {
+            return null
+        }
+
+        val coverage = if (queryTerms.isEmpty()) 0.0 else matchedTerms.size.toDouble() / queryTerms.size.toDouble()
+        val termScore = matchedTerms.size.toDouble()
+        val exactPhraseBonus = if (exactPhraseMatched) EXACT_PHRASE_BONUS else 0.0
+        val fieldBonus = fieldBonus(indexedDocument.document.field)
+        val score = (termScore + coverage + exactPhraseBonus + fieldBonus) * indexedDocument.document.weight
+
+        return ImpressionSearchHit(
+            document = indexedDocument.document,
+            score = score,
+            matchedTerms = matchedTerms,
+        )
+    }
+
+    /** Additive bonus by field; subject text counts most, evidence text gets none. */
+    private fun fieldBonus(field: ImpressionSearchField): Double = when (field) {
+        ImpressionSearchField.SUBJECT -> 0.8
+        ImpressionSearchField.FEATURE -> 0.35
+        ImpressionSearchField.IMPRESSION -> 0.25
+        ImpressionSearchField.RELATION -> 0.15
+        ImpressionSearchField.EVIDENCE -> 0.0
+    }
+
+    /** Lower-cases [text], collapses whitespace runs to a single space and trims the ends. */
+    private fun normalize(text: String): String =
+        text.lowercase()
+            .replace(WHITESPACE_REGEX, " ")
+            .trim()
+
+    /** A document together with its normalized text and the terms it is indexed under. */
+    private data class IndexedDocument(
+        val document: ImpressionSearchDocument,
+        val normalizedText: String,
+        val terms: Set<String>,
+    )
+
+    companion object {
+        private const val EXACT_PHRASE_BONUS = 1.5
+        private val WHITESPACE_REGEX = Regex("\\s+")
+    }
+}
diff --git a/Partner-Core/src/test/java/work/slhaf/partner/core/cognition/impression/search/SimpleTextSearchTest.kt b/Partner-Core/src/test/java/work/slhaf/partner/core/cognition/impression/search/SimpleTextSearchTest.kt
new file mode 100644
index 00000000..3dcf73d7
--- /dev/null
+++ b/Partner-Core/src/test/java/work/slhaf/partner/core/cognition/impression/search/SimpleTextSearchTest.kt
@@ -0,0 +1,226 @@
+package work.slhaf.partner.core.cognition.impression.search
+
+import org.junit.jupiter.api.Assertions.assertEquals
+import org.junit.jupiter.api.Assertions.assertFalse
+import org.junit.jupiter.api.Assertions.assertTrue
+import org.junit.jupiter.api.Test
+import work.slhaf.partner.core.cognition.impression.ActiveEntity
+
+class SimpleTextSearchTest {
+
+    @Test
+    fun `search ranks subject hit before evidence hit when both match similar terms`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val targetA = activeTarget("a")
+        val targetB = activeTarget("b")
+
+        search.rebuild(
+            listOf(
+                document("a-subject", targetA, ImpressionSearchField.SUBJECT, "城南旧书店老板", 1.0),
+                document("b-evidence", targetB, ImpressionSearchField.EVIDENCE, "用户提到城南旧书店附近有一家打印店", 0.8),
+            )
+        )
+
+        val hits = search.search("城南旧书店", limit = 10)
+
+        assertEquals(listOf("a-subject", "b-evidence"), hits.map { it.document.id })
+        assertTrue(hits.first().score > hits[1].score)
+        assertTrue(hits.first().matchedTerms.containsAll(setOf("城南", "旧书店")))
+    }
+
+    @Test
+    fun `exact phrase match can beat partial subject match`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val partialSubject = activeTarget("partial")
+        val exactEvidence = activeTarget("exact")
+
+        search.rebuild(
+            listOf(
+                document("partial-subject", partialSubject, ImpressionSearchField.SUBJECT, "工程教材", 1.0),
+                document("exact-evidence", exactEvidence, ImpressionSearchField.EVIDENCE, "旧书店老板推荐过工程教材", 0.8),
+            )
+        )
+
+        val hits = search.search("旧书店老板推荐过工程教材", limit = 10)
+
+        assertEquals("exact-evidence", hits.first().document.id)
+        assertTrue(hits.first().matchedTerms.containsAll(setOf("旧书店", "老板", "推荐", "工程", "教材")))
+    }
+
+    @Test
+    fun `search recalls bookstore owner from generated active entity documents`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val bookstoreOwner = activeEntity("bookstore", "城南旧书店老板") {
+            addEvidence("用户上周提到城南旧书店老板推荐过一本水利工程教材")
+            addProjectedFeatures("熟悉工程类旧书" to 0.9)
+        }
+        val technicalPartner = activeEntity("technical", "Java 技术搭子") {
+            addEvidence("用户正在讨论 Jieba 分词、SimpleTextSearch 和倒排索引")
+            addProjectedFeatures("熟悉 Kotlin 与检索实现" to 0.9)
+        }
+        val reportRoommate = activeEntity("report", "实验报告室友") {
+            addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
+        }
+
+        search.rebuild(
+            listOf(bookstoreOwner, technicalPartner, reportRoommate)
+                .flatMap(ImpressionSearchDocuments::fromActiveEntity)
+        )
+
+        val hits = search.search("旧书店老板推荐的工程教材", limit = 10)
+
+        assertFalse(hits.isEmpty())
+        assertEquals("bookstore", hits.first().document.target.id)
+    }
+
+    @Test
+    fun `search recalls technical active entity from implementation terms`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val technicalPartner = activeEntity("technical", "Java 技术搭子") {
+            addEvidence("用户正在讨论 Jieba 分词、SimpleTextSearch 和倒排索引")
+            addProjectedImpressions("需要补充搜索召回测试" to 0.8)
+        }
+        val reportRoommate = activeEntity("report", "实验报告室友") {
+            addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
+        }
+
+        search.rebuild(
+            listOf(technicalPartner, reportRoommate)
+                .flatMap(ImpressionSearchDocuments::fromActiveEntity)
+        )
+
+        val hits = search.search("jieba 分词 SimpleTextSearch 倒排索引", limit = 10)
+
+        assertFalse(hits.isEmpty())
+        assertEquals("technical", hits.first().document.target.id)
+    }
+
+    @Test
+    fun `search recalls report active entity from document task terms`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val technicalPartner = activeEntity("technical", "Java 技术搭子") {
+            addEvidence("用户正在讨论 Kotlin、Jieba 分词和 SimpleTextSearch")
+        }
+        val reportRoommate = activeEntity("report", "实验报告室友") {
+            addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
+        }
+
+        search.rebuild(
+            listOf(technicalPartner, reportRoommate)
+                .flatMap(ImpressionSearchDocuments::fromActiveEntity)
+        )
+
+        val hits = search.search("Vivado 实验报告模板", limit = 10)
+
+        assertFalse(hits.isEmpty())
+        assertEquals("report", hits.first().document.target.id)
+    }
+
+    @Test
+    fun `upsert replaces previous index terms for the same document id`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val target = activeTarget("entity")
+
+        search.upsert(document("doc", target, ImpressionSearchField.EVIDENCE, "旧书店老板", 1.0))
+        assertEquals(listOf("doc"), search.search("老板", limit = 10).map { it.document.id })
+
+        search.upsert(document("doc", target, ImpressionSearchField.EVIDENCE, "实验报告模板", 1.0))
+
+        assertTrue(search.search("老板", limit = 10).isEmpty())
+        assertEquals(listOf("doc"), search.search("实验报告", limit = 10).map { it.document.id })
+    }
+
+    @Test
+    fun `removeByTarget removes all documents belonging to that target`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val removed = activeTarget("removed")
+        val kept = activeTarget("kept")
+
+        search.rebuild(
+            listOf(
+                document("removed-subject", removed, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0),
+                document("removed-evidence", removed, ImpressionSearchField.EVIDENCE, "工程教材", 0.8),
+                document("kept-evidence", kept, ImpressionSearchField.EVIDENCE, "实验报告模板", 0.8),
+            )
+        )
+
+        search.removeByTarget(removed)
+
+        val hits = search.search("实验报告", limit = 10)
+        assertEquals(listOf("kept-evidence"), hits.map { it.document.id })
+        assertFalse(hits.any { it.document.target == removed })
+        assertTrue(search.search("旧书店", limit = 10).isEmpty())
+    }
+
+    @Test
+    fun `rebuild clears previous documents and index terms`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val target = activeTarget("entity")
+
+        search.rebuild(listOf(document("old", target, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0)))
+        assertEquals(listOf("old"), search.search("老板", limit = 10).map { it.document.id })
+
+        search.rebuild(listOf(document("new", target, ImpressionSearchField.SUBJECT, "实验报告模板", 1.0)))
+
+        assertTrue(search.search("老板", limit = 10).isEmpty())
+        assertEquals(listOf("new"), search.search("实验报告", limit = 10).map { it.document.id })
+    }
+
+    @Test
+    fun `blank unmatched and zero limit queries return empty hits`() {
+        val search = SimpleTextSearch(TestTokenizer())
+        val target = activeTarget("entity")
+        search.rebuild(listOf(document("doc", target, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0)))
+
+        assertTrue(search.search(" ", limit = 10).isEmpty())
+        assertTrue(search.search("完全不存在", limit = 10).isEmpty())
+        assertTrue(search.search("旧书店", limit = 0).isEmpty())
+    }
+
+    private fun activeTarget(id: String) =
+        ImpressionSearchTarget(ImpressionSearchTarget.Type.ACTIVE_ENTITY, id)
+
+    private fun activeEntity(
+        runtimeId: String,
+        subject: String,
+        configure: ActiveEntity.() -> Unit,
+    ): ActiveEntity = ActiveEntity(runtimeId = runtimeId).apply {
+        updateSubject(subject)
+        configure()
+    }
+
+    private fun document(
+        id: String,
+        target: ImpressionSearchTarget,
+        field: ImpressionSearchField,
+        text: String,
+        weight: Double,
+    ) = ImpressionSearchDocument(
+        id = id,
+        target = target,
+        field = field,
+        text = text,
+        weight = weight,
+    )
+
+    private class TestTokenizer : ImpressionTokenizer {
+        private val dictionary = listOf(
+            "城南", "旧书店", "老板", "推荐", "工程", "教材", "水利", "熟悉", "旧书",
+            "java", "kotlin", "jieba", "分词", "simpletextsearch", "倒排", "索引", "检索", "测试", "召回",
+            "vivado", "实验报告", "实验", "报告", "模板", "docx", "室友", "整理", "文件"
+        )
+        private val alphaNumericRegex = Regex("[a-z0-9]+(?:[-_./][a-z0-9]+)*")
+
+        override fun tokenize(text: String): Set<String> {
+            val normalized = text.lowercase().trim()
+            if (normalized.isBlank()) {
+                return emptySet()
+            }
+
+            return buildSet {
+                dictionary.filterTo(this) { normalized.contains(it) }
+                alphaNumericRegex.findAll(normalized).mapTo(this) { it.value }
+            }
+        }
+    }
+}