mirror of
https://github.com/slhaf/Partner.git
synced 2026-06-28 01:59:17 +08:00
feat(impression): Add impression text search
This commit is contained in:
@@ -65,6 +65,11 @@
|
|||||||
<artifactId>cron-utils</artifactId>
|
<artifactId>cron-utils</artifactId>
|
||||||
<version>9.2.1</version>
|
<version>9.2.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.huaban</groupId>
|
||||||
|
<artifactId>jieba-analysis</artifactId>
|
||||||
|
<version>1.0.2</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
package work.slhaf.partner.core.cognition.impression.search
|
||||||
|
|
||||||
|
/**
 * Splits a piece of text into the distinct terms it contains.
 *
 * Declared as a functional interface so that a tokenizer can also be supplied as a lambda,
 * e.g. `ImpressionTokenizer { text -> text.split(' ').toSet() }`. Class-based implementations
 * keep working unchanged.
 */
fun interface ImpressionTokenizer {
    /**
     * Extracts the terms of [text].
     *
     * @param text the text to split into terms.
     * @return the extracted terms; repeated terms collapse because the result is a [Set].
     */
    fun tokenize(text: String): Set<String>
}
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
package work.slhaf.partner.core.cognition.impression.search
|
||||||
|
|
||||||
|
import com.huaban.analysis.jieba.JiebaSegmenter
|
||||||
|
|
||||||
|
/**
 * [ImpressionTokenizer] that combines Jieba word segmentation with a regex pass for
 * ASCII alphanumeric runs.
 *
 * The input is lower-cased and whitespace-collapsed before segmentation, so every returned
 * term is lower-case. The result is the union of:
 *  - the segmenter's tokens that contain at least one letter or digit, and
 *  - every match of `[a-z0-9]+(?:[-_./][a-z0-9]+)*`, which keeps compound identifiers such as
 *    `v1.2` or `foo_bar` available as a single term regardless of how the segmenter splits them.
 *
 * @param segmenter the Jieba segmenter to delegate to.
 * @param mode the segmentation mode passed to [JiebaSegmenter.process].
 */
class JiebaImpressionTokenizer(
    private val segmenter: JiebaSegmenter = JiebaSegmenter(),
    private val mode: JiebaSegmenter.SegMode = JiebaSegmenter.SegMode.SEARCH,
) : ImpressionTokenizer {

    /**
     * Tokenizes [text] into lower-case search terms.
     *
     * @param text raw text; may be blank.
     * @return the distinct terms, or an empty set when [text] is blank after normalization.
     */
    override fun tokenize(text: String): Set<String> {
        val normalized = normalize(text)
        if (normalized.isBlank()) {
            return emptySet()
        }

        val jiebaTerms = segmenter.process(normalized, mode)
            .asSequence()
            .map { normalize(it.word) }
            // The segmenter also hands back separators (whitespace, "、", "," ...) as tokens.
            // Indexing those would let two texts "match" merely because they share punctuation,
            // which inflates term-count based scores, so only real words are kept.
            .filter(::isSearchableTerm)

        return (jiebaTerms + alphaNumericTerms(normalized)).toSet()
    }

    /**
     * Returns `true` when [term] carries searchable content, i.e. at least one letter or digit.
     *
     * Surrogate halves are accepted as well so that supplementary-plane characters, which reach
     * this method as UTF-16 code units, are not silently discarded.
     */
    private fun isSearchableTerm(term: String): Boolean =
        term.any { it.isLetterOrDigit() || it.isSurrogate() }

    /** Finds every ASCII alphanumeric run (optionally joined by `-`, `_`, `.` or `/`) in [text]. */
    private fun alphaNumericTerms(text: String): Sequence<String> =
        ALPHA_NUMERIC_REGEX.findAll(text).map { it.value }

    /** Lower-cases [text], collapses each whitespace run to one space and trims the ends. */
    private fun normalize(text: String): String =
        text.lowercase()
            .replace(WHITESPACE_REGEX, " ")
            .trim()

    companion object {
        private val WHITESPACE_REGEX = Regex("\\s+")
        private val ALPHA_NUMERIC_REGEX = Regex("[a-z0-9]+(?:[-_./][a-z0-9]+)*")
    }
}
|
||||||
@@ -0,0 +1,136 @@
|
|||||||
|
package work.slhaf.partner.core.cognition.impression.search
|
||||||
|
|
||||||
|
/**
 * In-memory [ImpressionTextSearch] built on an inverted index.
 *
 * State consists of the indexed documents keyed by document id and a map from each term to the
 * ids of the documents containing it. All public operations are `@Synchronized` on this instance.
 *
 * A document's score is
 * `(matchedTermCount + coverage + exactPhraseBonus + fieldBonus) * document.weight`, where
 * `coverage` is the fraction of query terms found in the document and `exactPhraseBonus` applies
 * when the normalized document text contains the whole normalized query.
 *
 * @param tokenizer splits both documents and queries into terms.
 */
class SimpleTextSearch(
    private val tokenizer: ImpressionTokenizer = JiebaImpressionTokenizer(),
) : ImpressionTextSearch {

    private val documents = linkedMapOf<String, IndexedDocument>()
    private val invertedIndex = linkedMapOf<String, MutableSet<String>>()

    /**
     * Discards the current index and indexes [documents] from scratch.
     *
     * When [documents] contains several entries with the same id, the last one wins.
     */
    @Synchronized
    override fun rebuild(documents: Collection<ImpressionSearchDocument>) {
        this.documents.clear()
        invertedIndex.clear()
        // Go through the replacing path rather than a plain insert: if the input repeats an id,
        // a plain insert would overwrite the stored document but leave the earlier entry's
        // postings in the inverted index, where nothing would ever clean them up.
        documents.forEach(::replaceDocument)
    }

    /** Indexes [document], replacing any document previously stored under the same id. */
    @Synchronized
    override fun upsert(document: ImpressionSearchDocument) {
        replaceDocument(document)
    }

    /** Removes every indexed document whose target equals [target]. */
    @Synchronized
    override fun removeByTarget(target: ImpressionSearchTarget) {
        documents.values
            .asSequence()
            .filter { it.document.target == target }
            .map { it.document.id }
            // Materialize the ids first: removal mutates the map being iterated.
            .toList()
            .forEach(::removeByDocumentId)
    }

    /**
     * Returns the best matches for [query], highest score first; ties are ordered by document id.
     *
     * Candidates are the documents sharing at least one term with the query. Only when the
     * tokenizer yields no terms at all is every document considered, in which case a document
     * can match solely through the exact-phrase check. Hits whose score is not positive are
     * dropped.
     *
     * @param query free-text query.
     * @param limit maximum number of hits to return.
     * @return at most [limit] hits; empty when [limit] is not positive or [query] is blank.
     */
    @Synchronized
    override fun search(query: String, limit: Int): List<ImpressionSearchHit> {
        if (limit <= 0) {
            return emptyList()
        }

        val normalizedQuery = normalize(query)
        if (normalizedQuery.isBlank()) {
            return emptyList()
        }

        val queryTerms = tokenizer.tokenize(normalizedQuery)
        val candidateIds = if (queryTerms.isEmpty()) {
            documents.keys.toSet()
        } else {
            queryTerms
                .asSequence()
                .flatMap { invertedIndex[it].orEmpty().asSequence() }
                .toSet()
        }

        return candidateIds
            .asSequence()
            .mapNotNull { documentId ->
                documents[documentId]?.let { scoreDocument(it, normalizedQuery, queryTerms) }
            }
            .filter { it.score > 0.0 }
            .sortedWith(compareByDescending<ImpressionSearchHit> { it.score }.thenBy { it.document.id })
            .take(limit)
            .toList()
    }

    /** Drops whatever is stored under the document's id, then indexes [document]. */
    private fun replaceDocument(document: ImpressionSearchDocument) {
        removeByDocumentId(document.id)
        upsertInternal(document)
    }

    /**
     * Stores [document] and adds its id to the posting set of each of its terms.
     *
     * Does not remove an existing entry with the same id; callers go through [replaceDocument].
     */
    private fun upsertInternal(document: ImpressionSearchDocument) {
        val normalizedText = normalize(document.text)
        val terms = tokenizer.tokenize(normalizedText)
        val indexedDocument = IndexedDocument(document, normalizedText, terms)
        documents[document.id] = indexedDocument
        terms.forEach { term ->
            invertedIndex.getOrPut(term) { linkedSetOf() }.add(document.id)
        }
    }

    /** Removes the document stored under [documentId], if any, together with its postings. */
    private fun removeByDocumentId(documentId: String) {
        val indexedDocument = documents.remove(documentId) ?: return
        indexedDocument.terms.forEach { term ->
            val ids = invertedIndex[term] ?: return@forEach
            ids.remove(documentId)
            // Drop posting sets that became empty so the index does not accumulate dead terms.
            if (ids.isEmpty()) {
                invertedIndex.remove(term)
            }
        }
    }

    /**
     * Scores one document against the query.
     *
     * @param indexedDocument the candidate document.
     * @param normalizedQuery the normalized query text, used for the exact-phrase check.
     * @param queryTerms the query's terms.
     * @return the hit, or `null` when the document shares no term with the query and does not
     * contain the query as a phrase.
     */
    private fun scoreDocument(
        indexedDocument: IndexedDocument,
        normalizedQuery: String,
        queryTerms: Set<String>,
    ): ImpressionSearchHit? {
        val matchedTerms = if (queryTerms.isEmpty()) {
            emptySet()
        } else {
            queryTerms.intersect(indexedDocument.terms)
        }
        val exactPhraseMatched = indexedDocument.normalizedText.contains(normalizedQuery)

        if (matchedTerms.isEmpty() && !exactPhraseMatched) {
            return null
        }

        val coverage = if (queryTerms.isEmpty()) 0.0 else matchedTerms.size.toDouble() / queryTerms.size.toDouble()
        val termScore = matchedTerms.size.toDouble()
        val exactPhraseBonus = if (exactPhraseMatched) EXACT_PHRASE_BONUS else 0.0
        val fieldBonus = fieldBonus(indexedDocument.document.field)
        val score = (termScore + coverage + exactPhraseBonus + fieldBonus) * indexedDocument.document.weight

        return ImpressionSearchHit(
            document = indexedDocument.document,
            score = score,
            matchedTerms = matchedTerms,
        )
    }

    /** Additive score bonus for the field a document was generated from. */
    private fun fieldBonus(field: ImpressionSearchField): Double = when (field) {
        ImpressionSearchField.SUBJECT -> 0.8
        ImpressionSearchField.FEATURE -> 0.35
        ImpressionSearchField.IMPRESSION -> 0.25
        ImpressionSearchField.RELATION -> 0.15
        ImpressionSearchField.EVIDENCE -> 0.0
    }

    /** Lower-cases [text], collapses each whitespace run to one space and trims the ends. */
    private fun normalize(text: String): String =
        text.lowercase()
            .replace(WHITESPACE_REGEX, " ")
            .trim()

    /** A stored document together with its normalized text and the terms it was indexed under. */
    private data class IndexedDocument(
        val document: ImpressionSearchDocument,
        val normalizedText: String,
        val terms: Set<String>,
    )

    companion object {
        private const val EXACT_PHRASE_BONUS = 1.5
        private val WHITESPACE_REGEX = Regex("\\s+")
    }
}
|
||||||
@@ -0,0 +1,226 @@
|
|||||||
|
package work.slhaf.partner.core.cognition.impression.search
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions.assertEquals
|
||||||
|
import org.junit.jupiter.api.Assertions.assertFalse
|
||||||
|
import org.junit.jupiter.api.Assertions.assertTrue
|
||||||
|
import org.junit.jupiter.api.Test
|
||||||
|
import work.slhaf.partner.core.cognition.impression.ActiveEntity
|
||||||
|
|
||||||
|
/**
 * Behavioural tests for [SimpleTextSearch], driven by a small dictionary-based tokenizer so the
 * expected terms are fully determined by the test itself.
 */
class SimpleTextSearchTest {

    @Test
    fun `search ranks subject hit before evidence hit when both match similar terms`() {
        val index = indexed(
            listOf(
                document("a-subject", activeTarget("a"), ImpressionSearchField.SUBJECT, "城南旧书店老板", 1.0),
                document("b-evidence", activeTarget("b"), ImpressionSearchField.EVIDENCE, "用户提到城南旧书店附近有一家打印店", 0.8),
            )
        )

        val results = index.search("城南旧书店", limit = 10)

        assertEquals(listOf("a-subject", "b-evidence"), results.map { it.document.id })
        val (best, runnerUp) = results
        assertTrue(best.score > runnerUp.score)
        assertTrue(best.matchedTerms.containsAll(setOf("城南", "旧书店")))
    }

    @Test
    fun `exact phrase match can beat partial subject match`() {
        val index = indexed(
            listOf(
                document("partial-subject", activeTarget("partial"), ImpressionSearchField.SUBJECT, "工程教材", 1.0),
                document("exact-evidence", activeTarget("exact"), ImpressionSearchField.EVIDENCE, "旧书店老板推荐过工程教材", 0.8),
            )
        )

        val best = index.search("旧书店老板推荐过工程教材", limit = 10).first()

        assertEquals("exact-evidence", best.document.id)
        assertTrue(best.matchedTerms.containsAll(setOf("旧书店", "老板", "推荐", "工程", "教材")))
    }

    @Test
    fun `search recalls bookstore owner from generated active entity documents`() {
        val bookstoreOwner = activeEntity("bookstore", "城南旧书店老板") {
            addEvidence("用户上周提到城南旧书店老板推荐过一本水利工程教材")
            addProjectedFeatures("熟悉工程类旧书" to 0.9)
        }
        val technicalPartner = activeEntity("technical", "Java 技术搭子") {
            addEvidence("用户正在讨论 Jieba 分词、SimpleTextSearch 和倒排索引")
            addProjectedFeatures("熟悉 Kotlin 与检索实现" to 0.9)
        }
        val index = indexed(entityDocuments(bookstoreOwner, technicalPartner, reportRoommate()))

        val results = index.search("旧书店老板推荐的工程教材", limit = 10)

        assertFalse(results.isEmpty())
        assertEquals("bookstore", results.first().document.target.id)
    }

    @Test
    fun `search recalls technical active entity from implementation terms`() {
        val technicalPartner = activeEntity("technical", "Java 技术搭子") {
            addEvidence("用户正在讨论 Jieba 分词、SimpleTextSearch 和倒排索引")
            addProjectedImpressions("需要补充搜索召回测试" to 0.8)
        }
        val index = indexed(entityDocuments(technicalPartner, reportRoommate()))

        val results = index.search("jieba 分词 SimpleTextSearch 倒排索引", limit = 10)

        assertFalse(results.isEmpty())
        assertEquals("technical", results.first().document.target.id)
    }

    @Test
    fun `search recalls report active entity from document task terms`() {
        val technicalPartner = activeEntity("technical", "Java 技术搭子") {
            addEvidence("用户正在讨论 Kotlin、Jieba 分词和 SimpleTextSearch")
        }
        val index = indexed(entityDocuments(technicalPartner, reportRoommate()))

        val results = index.search("Vivado 实验报告模板", limit = 10)

        assertFalse(results.isEmpty())
        assertEquals("report", results.first().document.target.id)
    }

    @Test
    fun `upsert replaces previous index terms for the same document id`() {
        val index = newSearch()
        val owner = activeTarget("entity")

        index.upsert(document("doc", owner, ImpressionSearchField.EVIDENCE, "旧书店老板", 1.0))
        assertEquals(listOf("doc"), index.hitIds("老板"))

        index.upsert(document("doc", owner, ImpressionSearchField.EVIDENCE, "实验报告模板", 1.0))

        assertTrue(index.search("老板", limit = 10).isEmpty())
        assertEquals(listOf("doc"), index.hitIds("实验报告"))
    }

    @Test
    fun `removeByTarget removes all documents belonging to that target`() {
        val removed = activeTarget("removed")
        val kept = activeTarget("kept")
        val index = indexed(
            listOf(
                document("removed-subject", removed, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0),
                document("removed-evidence", removed, ImpressionSearchField.EVIDENCE, "工程教材", 0.8),
                document("kept-evidence", kept, ImpressionSearchField.EVIDENCE, "实验报告模板", 0.8),
            )
        )

        index.removeByTarget(removed)

        val remaining = index.search("实验报告", limit = 10)
        assertEquals(listOf("kept-evidence"), remaining.map { it.document.id })
        assertTrue(remaining.none { it.document.target == removed })
        assertTrue(index.search("旧书店", limit = 10).isEmpty())
    }

    @Test
    fun `rebuild clears previous documents and index terms`() {
        val index = newSearch()
        val owner = activeTarget("entity")

        index.rebuild(listOf(document("old", owner, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0)))
        assertEquals(listOf("old"), index.hitIds("老板"))

        index.rebuild(listOf(document("new", owner, ImpressionSearchField.SUBJECT, "实验报告模板", 1.0)))

        assertTrue(index.search("老板", limit = 10).isEmpty())
        assertEquals(listOf("new"), index.hitIds("实验报告"))
    }

    @Test
    fun `blank unmatched and zero limit queries return empty hits`() {
        val index = indexed(
            listOf(document("doc", activeTarget("entity"), ImpressionSearchField.SUBJECT, "旧书店老板", 1.0))
        )

        assertTrue(index.search(" ", limit = 10).isEmpty())
        assertTrue(index.search("完全不存在", limit = 10).isEmpty())
        assertTrue(index.search("旧书店", limit = 0).isEmpty())
    }

    /** Creates an empty search instance wired to the dictionary tokenizer. */
    private fun newSearch(): SimpleTextSearch = SimpleTextSearch(TestTokenizer())

    /** Creates a search instance whose index was rebuilt from [documents]. */
    private fun indexed(documents: List<ImpressionSearchDocument>): SimpleTextSearch {
        val index = newSearch()
        index.rebuild(documents)
        return index
    }

    /** Expands each entity, in order, into its generated search documents. */
    private fun entityDocuments(vararg entities: ActiveEntity): List<ImpressionSearchDocument> =
        entities.asList().flatMap(ImpressionSearchDocuments::fromActiveEntity)

    /** Ids of the hits for [query], in ranking order, with a limit of 10. */
    private fun SimpleTextSearch.hitIds(query: String) =
        search(query, limit = 10).map { it.document.id }

    /** The "report roommate" fixture entity shared by the recall tests. */
    private fun reportRoommate(): ActiveEntity = activeEntity("report", "实验报告室友") {
        addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
    }

    private fun activeTarget(id: String) =
        ImpressionSearchTarget(ImpressionSearchTarget.Type.ACTIVE_ENTITY, id)

    /** Builds an [ActiveEntity] with the given subject, then applies [configure] to it. */
    private fun activeEntity(
        runtimeId: String,
        subject: String,
        configure: ActiveEntity.() -> Unit,
    ): ActiveEntity {
        val entity = ActiveEntity(runtimeId = runtimeId)
        entity.updateSubject(subject)
        entity.configure()
        return entity
    }

    private fun document(
        id: String,
        target: ImpressionSearchTarget,
        field: ImpressionSearchField,
        text: String,
        weight: Double,
    ) = ImpressionSearchDocument(
        id = id,
        target = target,
        field = field,
        text = text,
        weight = weight,
    )

    /**
     * Deterministic tokenizer for tests: a term is every dictionary word that occurs as a
     * substring of the lower-cased input, plus every ASCII alphanumeric run.
     */
    private class TestTokenizer : ImpressionTokenizer {
        private val dictionary = listOf(
            "城南", "旧书店", "老板", "推荐", "工程", "教材", "水利", "熟悉", "旧书",
            "java", "kotlin", "jieba", "分词", "simpletextsearch", "倒排", "索引", "检索", "测试", "召回",
            "vivado", "实验报告", "实验", "报告", "模板", "docx", "室友", "整理", "文件"
        )
        private val alphaNumericRegex = Regex("[a-z0-9]+(?:[-_./][a-z0-9]+)*")

        override fun tokenize(text: String): Set<String> {
            val normalized = text.lowercase().trim()
            if (normalized.isBlank()) {
                return emptySet()
            }

            val dictionaryHits = dictionary.asSequence().filter { word -> normalized.contains(word) }
            val asciiRuns = alphaNumericRegex.findAll(normalized).map { match -> match.value }
            return (dictionaryHits + asciiRuns).toSet()
        }
    }
}
|
||||||
Reference in New Issue
Block a user