9 Commits

12 changed files with 1151 additions and 11 deletions

16
.codegraph/.gitignore vendored Normal file
View File

@@ -0,0 +1,16 @@
# CodeGraph data files
# These are local to each machine and should not be committed
# Database
*.db
*.db-wal
*.db-shm
# Cache
cache/
# Logs
*.log
# Hook markers
.dirty

View File

@@ -65,6 +65,11 @@
<artifactId>cron-utils</artifactId>
<version>9.2.1</version>
</dependency>
<dependency>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
<version>1.0.2</version>
</dependency>
</dependencies>
<properties>

View File

@@ -2,10 +2,12 @@ package work.slhaf.partner.core.cognition;
import org.w3c.dom.Element;
import work.slhaf.partner.core.cognition.context.ContextWorkspace;
import work.slhaf.partner.core.cognition.impression.ActiveEntity;
import work.slhaf.partner.framework.agent.factory.capability.annotation.Capability;
import work.slhaf.partner.framework.agent.model.pojo.Message;
import java.util.List;
import java.util.Set;
import java.util.concurrent.locks.Lock;
@Capability("cognition")
@@ -27,4 +29,6 @@ public interface CognitionCapability {
Lock getMessageLock();
Set<ActiveEntity> projectEntity(String input);
}

View File

@@ -112,6 +112,16 @@ class ActiveEntity @JvmOverloads constructor(
/**
 * Renders [time] as a zoned date-time string in the JVM's default time zone.
 */
private fun modelTime(time: Instant): String {
    val zone = ZoneId.systemDefault()
    return time.atZone(zone).toString()
}
/**
 * Two active entities are equal when they are the same instance or share the same `runtimeId`.
 */
override fun equals(other: Any?): Boolean =
    this === other || (other is ActiveEntity && runtimeId == other.runtimeId)
/** Hash derived only from `runtimeId`, matching the identity used by `equals`. */
override fun hashCode(): Int = runtimeId.hashCode()
}
private fun newActiveEntityRuntimeId(): String =

View File

@@ -3,6 +3,7 @@ package work.slhaf.partner.core.cognition.impression;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import org.jetbrains.annotations.NotNull;
import work.slhaf.partner.core.cognition.impression.search.*;
import work.slhaf.partner.framework.agent.factory.capability.annotation.CapabilityCore;
import work.slhaf.partner.framework.agent.factory.capability.annotation.CapabilityMethod;
import work.slhaf.partner.framework.agent.state.State;
@@ -10,10 +11,9 @@ import work.slhaf.partner.framework.agent.state.StateSerializable;
import work.slhaf.partner.framework.agent.state.StateValue;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
@CapabilityCore(value = "cognition")
public class ImpressionCore implements StateSerializable {
@@ -23,21 +23,177 @@ public class ImpressionCore implements StateSerializable {
*/
private final ConcurrentHashMap<String, Entity> knownEntitiesByUuid = new ConcurrentHashMap<>();
private final ImpressionVectorIndex vectorIndex = new ImpressionVectorIndex();
private final Set<ActiveEntity> activeEntities = new HashSet<>();
private final ImpressionTextSearch textSearch = new SimpleTextSearch();
private static final int TEXT_SEARCH_LIMIT = 20;
private static final int ASSOCIATION_MATCH_LIMIT = 8;
private static final double SUPPORTING_HIT_FACTOR = 0.3;
private static final double ASSOCIATION_CONFIDENCE_DIVISOR = 5.0;
/**
* 根据新的 Input 召回相关的实体,如果实体已重复,则将输入追加到 ActiveEntity 的证据中。
*
* @param input 本次输入内容
* @return 本次被召回的活跃实体(包括重复的实体)
*/
@CapabilityMethod
public void updateRelation() {
public Set<ActiveEntity> projectEntity(String input) {
    // Recalls the entities associated with `input` through text search, appends the input
    // as USER_INPUT evidence to each recalled active entity, and returns those entities.
    // Blank or null input, or an input without any match, yields an empty set.
    if (input == null || input.isBlank()) {
        return Set.of();
    }
    List<ImpressionSearchHit> textSearchHits = textSearch.search(input, TEXT_SEARCH_LIMIT);
    List<EntityAssociationMatch> associationMatches = aggregateMatches(textSearchHits, ASSOCIATION_MATCH_LIMIT);
    if (associationMatches.isEmpty()) {
        return Set.of();
    }
    Set<ActiveEntity> projected = new HashSet<>();
    for (EntityAssociationMatch match : associationMatches) {
        Optional<ActiveEntity> activeEntity = resolveActiveEntity(match.getTarget());
        if (activeEntity.isEmpty()) {
            continue;
        }
        ActiveEntity entity = activeEntity.get();
        // An ENTITY target and the ACTIVE_ENTITY bound to it resolve to the same active
        // entity. Matches arrive ordered by descending score, so the first one seen is the
        // strongest; later duplicates are skipped so the input is recorded as evidence
        // only once per entity.
        if (!projected.add(entity)) {
            continue;
        }
        entity.addEvidence(
                input,
                associationConfidence(match),
                EntityEvidence.Source.USER_INPUT
        );
        refreshActiveEntityTextSearch(entity);
    }
    return projected;
}
@CapabilityMethod
public void updateImpression() {
/**
 * Groups search hits by their document target and turns every group into one
 * {@link EntityAssociationMatch}, keeping at most {@code limit} matches.
 *
 * @param hits  raw search hits; may be null or empty
 * @param limit maximum number of matches to return; non-positive yields an empty list
 * @return matches ordered by descending score, then target type name, then target id
 */
private List<EntityAssociationMatch> aggregateMatches(
        List<ImpressionSearchHit> hits,
        int limit
) {
    boolean nothingToAggregate = hits == null || hits.isEmpty() || limit <= 0;
    if (nothingToAggregate) {
        return List.of();
    }

    // Best hit first inside a group; document id breaks score ties.
    Comparator<ImpressionSearchHit> hitOrder = Comparator
            .comparingDouble(ImpressionSearchHit::getScore)
            .reversed()
            .thenComparing(hit -> hit.getDocument().getId());
    // Strongest match first; type name and id break score ties.
    Comparator<EntityAssociationMatch> matchOrder = Comparator
            .comparingDouble(EntityAssociationMatch::getScore)
            .reversed()
            .thenComparing(match -> match.getTarget().getType().name())
            .thenComparing(match -> match.getTarget().getId());

    // LinkedHashMap keeps targets in first-seen order before the final sort.
    var hitsByTarget = hits.stream()
            .collect(Collectors.groupingBy(
                    hit -> hit.getDocument().getTarget(),
                    LinkedHashMap::new,
                    Collectors.toList()
            ));

    return hitsByTarget.entrySet()
            .stream()
            .map(group -> {
                List<ImpressionSearchHit> rankedHits = group.getValue()
                        .stream()
                        .sorted(hitOrder)
                        .toList();
                return new EntityAssociationMatch(
                        group.getKey(),
                        aggregateScore(rankedHits),
                        rankedHits
                );
            })
            .sorted(matchOrder)
            .limit(limit)
            .toList();
}
@CapabilityMethod
public void showImpressions() {
/**
 * Scores a group of hits: the first hit counts in full, and the next two hits (if present)
 * each contribute their score scaled by {@code SUPPORTING_HIT_FACTOR}.
 *
 * @param sortedHits hits of one target, expected best-first
 * @return the aggregated score, or 0.0 for an empty list
 */
private double aggregateScore(List<ImpressionSearchHit> sortedHits) {
    if (sortedHits.isEmpty()) {
        return 0.0;
    }
    // Positions 1 and 2 are the supporting hits; the window shrinks for short lists.
    int supportingEnd = Math.min(3, sortedHits.size());
    double supportingScore = sortedHits.subList(1, supportingEnd)
            .stream()
            .mapToDouble(hit -> hit.getScore() * SUPPORTING_HIT_FACTOR)
            .sum();
    return sortedHits.getFirst().getScore() + supportingScore;
}
@CapabilityMethod
public void projectEntity(Set<ActiveEntity> activeEntities) {
/**
 * Maps a search target to an active entity: ACTIVE_ENTITY targets are looked up by runtime
 * id, ENTITY targets are activated from the known-entity store.
 */
private Optional<ActiveEntity> resolveActiveEntity(ImpressionSearchTarget target) {
    ImpressionSearchTarget.Type targetType = target.getType();
    String targetId = target.getId();
    return switch (targetType) {
        case ENTITY -> activateKnownEntity(targetId);
        case ACTIVE_ENTITY -> findActiveEntityByRuntimeId(targetId);
    };
}
/**
 * Returns the first active entity whose runtime id equals {@code runtimeId}, scanning the
 * set while holding its monitor.
 */
private Optional<ActiveEntity> findActiveEntityByRuntimeId(String runtimeId) {
    synchronized (activeEntities) {
        for (ActiveEntity candidate : activeEntities) {
            if (candidate.getRuntimeId().equals(runtimeId)) {
                return Optional.of(candidate);
            }
        }
        return Optional.empty();
    }
}
/**
 * Returns the first active entity whose bound entity uuid equals {@code uuid}, scanning the
 * set while holding its monitor.
 */
private Optional<ActiveEntity> findActiveEntityByBoundEntityUuid(String uuid) {
    synchronized (activeEntities) {
        for (ActiveEntity candidate : activeEntities) {
            if (uuid.equals(candidate.getBoundEntityUuid())) {
                return Optional.of(candidate);
            }
        }
        return Optional.empty();
    }
}
/**
 * Returns the active entity bound to the known entity {@code uuid}, creating and
 * registering one when none exists yet.
 *
 * @param uuid uuid of a known entity
 * @return the existing or newly created active entity, or empty when {@code uuid} is not a
 *         known entity
 */
private Optional<ActiveEntity> activateKnownEntity(String uuid) {
    Entity knownEntity = knownEntitiesByUuid.get(uuid);
    if (knownEntity == null) {
        return Optional.empty();
    }
    ActiveEntity activeEntity;
    // Lookup and insertion share one critical section; otherwise two concurrent callers
    // could both miss the lookup and register two active entities for the same uuid.
    // The nested lookup re-enters the same monitor, which intrinsic locks allow.
    synchronized (activeEntities) {
        Optional<ActiveEntity> existing = findActiveEntityByBoundEntityUuid(uuid);
        if (existing.isPresent()) {
            return existing;
        }
        activeEntity = new ActiveEntity();
        activeEntity.updateSubject(knownEntity.getSubject());
        activeEntity.bindEntity(uuid);
        activeEntities.add(activeEntity);
    }
    // Index refresh stays outside the monitor so the search index lock is never taken
    // while holding the activeEntities lock.
    refreshActiveEntityTextSearch(activeEntity);
    return Optional.of(activeEntity);
}
/**
 * Converts a match score into a confidence by dividing it by
 * {@code ASSOCIATION_CONFIDENCE_DIVISOR} and clamping the result to [0.05, 1.0].
 */
private double associationConfidence(EntityAssociationMatch match) {
    return Math.clamp(match.getScore() / ASSOCIATION_CONFIDENCE_DIVISOR, 0.05, 1.0);
}
/**
 * Replaces the text-search documents of {@code activeEntity}: every document indexed under
 * its ACTIVE_ENTITY target is removed, then the freshly generated documents are upserted.
 */
private void refreshActiveEntityTextSearch(ActiveEntity activeEntity) {
    String runtimeId = activeEntity.getRuntimeId();
    textSearch.removeByTarget(
            new ImpressionSearchTarget(ImpressionSearchTarget.Type.ACTIVE_ENTITY, runtimeId)
    );
    ImpressionSearchDocuments.INSTANCE
            .fromActiveEntity(activeEntity)
            .forEach(textSearch::upsert);
}
/**
 * Rebuilds the whole text-search index from every known entity and every active entity.
 * Active entities are read while holding the set's monitor.
 */
private void rebuildTextSearch() {
    List<ImpressionSearchDocument> snapshot = new ArrayList<>();
    for (Entity known : knownEntitiesByUuid.values()) {
        snapshot.addAll(ImpressionSearchDocuments.INSTANCE.fromEntity(known));
    }
    synchronized (activeEntities) {
        for (ActiveEntity active : activeEntities) {
            snapshot.addAll(ImpressionSearchDocuments.INSTANCE.fromActiveEntity(active));
        }
    }
    textSearch.rebuild(snapshot);
}
@Override
@@ -70,9 +226,9 @@ public class ImpressionCore implements StateSerializable {
vectorIndex.sync(entity);
knownEntitiesByUuid.put(uuid, entity);
}
rebuildTextSearch();
}
@Override
public @NotNull State convert() {
State state = new State();

View File

@@ -0,0 +1,5 @@
package work.slhaf.partner.core.cognition.impression.search
/**
 * Splits text into the set of terms used for indexing and querying impression documents.
 *
 * Declared as a functional interface so an implementation can be supplied as a lambda.
 */
fun interface ImpressionTokenizer {
    /**
     * @param text the text to split
     * @return the distinct terms found in [text]
     */
    fun tokenize(text: String): Set<String>
}

View File

@@ -0,0 +1,37 @@
package work.slhaf.partner.core.cognition.impression.search
import com.huaban.analysis.jieba.JiebaSegmenter
/**
 * [ImpressionTokenizer] that combines Jieba segmentation with a regex pass for
 * alphanumeric terms.
 *
 * @param segmenter the Jieba segmenter to run
 * @param mode the segmentation mode passed to the segmenter
 */
class JiebaImpressionTokenizer(
    private val segmenter: JiebaSegmenter = JiebaSegmenter(),
    private val mode: JiebaSegmenter.SegMode = JiebaSegmenter.SegMode.SEARCH,
) : ImpressionTokenizer {

    /**
     * Lowercases [text] and collapses whitespace, then returns the union of the non-blank
     * Jieba words and the alphanumeric terms matched by the regex. Blank input yields an
     * empty set.
     */
    override fun tokenize(text: String): Set<String> {
        val cleaned = normalize(text)
        if (cleaned.isBlank()) {
            return emptySet()
        }

        val terms = linkedSetOf<String>()
        // Jieba words first, each normalized again before being kept.
        for (token in segmenter.process(cleaned, mode)) {
            val word = normalize(token.word)
            if (word.isNotBlank()) {
                terms += word
            }
        }
        // Then whole alphanumeric terms, including ones joined by - _ . or /.
        for (match in ALPHA_NUMERIC_REGEX.findAll(cleaned)) {
            terms += match.value
        }
        return terms.toSet()
    }

    /** Lowercases, collapses every whitespace run to one space, and trims the ends. */
    private fun normalize(text: String): String {
        val lowered = text.lowercase()
        val collapsed = lowered.replace(WHITESPACE_REGEX, " ")
        return collapsed.trim()
    }

    companion object {
        private val WHITESPACE_REGEX = Regex("\\s+")
        private val ALPHA_NUMERIC_REGEX = Regex("[a-z0-9]+(?:[-_./][a-z0-9]+)*")
    }
}

View File

@@ -0,0 +1,136 @@
package work.slhaf.partner.core.cognition.impression.search
/**
 * In-memory [ImpressionTextSearch] built on an inverted index from term to document ids.
 *
 * Every public operation is `@Synchronized` on this instance. Documents are keyed by their
 * id; indexing a document whose id is already present replaces the previous one, including
 * its postings in the inverted index.
 *
 * @param tokenizer splits document text and queries into terms
 */
class SimpleTextSearch(
    private val tokenizer: ImpressionTokenizer = JiebaImpressionTokenizer(),
) : ImpressionTextSearch {

    private val documents = linkedMapOf<String, IndexedDocument>()
    private val invertedIndex = linkedMapOf<String, MutableSet<String>>()

    /** Drops the whole index and re-indexes [documents]; a later duplicate id wins. */
    @Synchronized
    override fun rebuild(documents: Collection<ImpressionSearchDocument>) {
        this.documents.clear()
        invertedIndex.clear()
        documents.forEach(::upsertInternal)
    }

    /** Indexes [document], replacing any document that has the same id. */
    @Synchronized
    override fun upsert(document: ImpressionSearchDocument) {
        upsertInternal(document)
    }

    /** Removes every indexed document whose target equals [target]. */
    @Synchronized
    override fun removeByTarget(target: ImpressionSearchTarget) {
        documents.values
            .asSequence()
            .filter { it.document.target == target }
            .map { it.document.id }
            // Materialize the ids first: removal mutates the map being iterated.
            .toList()
            .forEach(::removeByDocumentId)
    }

    /**
     * Returns up to [limit] hits for [query], ordered by descending score and then by
     * document id.
     *
     * A non-positive [limit] or a blank query yields an empty list. When the tokenizer
     * produces no terms for the query, every document is considered a candidate so that an
     * exact phrase match can still be found.
     */
    @Synchronized
    override fun search(query: String, limit: Int): List<ImpressionSearchHit> {
        if (limit <= 0) {
            return emptyList()
        }

        val normalizedQuery = normalize(query)
        if (normalizedQuery.isBlank()) {
            return emptyList()
        }

        val queryTerms = tokenizer.tokenize(normalizedQuery)
        val candidateIds = if (queryTerms.isEmpty()) {
            documents.keys.toSet()
        } else {
            queryTerms
                .asSequence()
                .flatMap { invertedIndex[it].orEmpty().asSequence() }
                .toSet()
        }

        return candidateIds
            .asSequence()
            .mapNotNull { documentId ->
                documents[documentId]?.let { scoreDocument(it, normalizedQuery, queryTerms) }
            }
            .filter { it.score > 0.0 }
            .sortedWith(compareByDescending<ImpressionSearchHit> { it.score }.thenBy { it.document.id })
            .take(limit)
            .toList()
    }

    /**
     * Stores [document] and registers its terms in the inverted index.
     *
     * Any previous document with the same id is removed first; without that step a
     * duplicate id (for example inside one `rebuild` batch) would leave the old document's
     * terms pointing at the id with no way to clean them up later.
     */
    private fun upsertInternal(document: ImpressionSearchDocument) {
        removeByDocumentId(document.id)

        val normalizedText = normalize(document.text)
        val terms = tokenizer.tokenize(normalizedText)
        documents[document.id] = IndexedDocument(document, normalizedText, terms)
        terms.forEach { term ->
            invertedIndex.getOrPut(term) { linkedSetOf() }.add(document.id)
        }
    }

    /** Removes the document and its postings; empty posting sets are dropped. */
    private fun removeByDocumentId(documentId: String) {
        val indexedDocument = documents.remove(documentId) ?: return
        indexedDocument.terms.forEach { term ->
            val ids = invertedIndex[term] ?: return@forEach
            ids.remove(documentId)
            if (ids.isEmpty()) {
                invertedIndex.remove(term)
            }
        }
    }

    /**
     * Scores one document against the query, or returns null when neither a term nor the
     * whole normalized query matches.
     *
     * score = (matched term count + query coverage + exact phrase bonus + field bonus)
     * multiplied by the document weight.
     */
    private fun scoreDocument(
        indexedDocument: IndexedDocument,
        normalizedQuery: String,
        queryTerms: Set<String>,
    ): ImpressionSearchHit? {
        val matchedTerms = if (queryTerms.isEmpty()) {
            emptySet()
        } else {
            queryTerms.intersect(indexedDocument.terms)
        }
        val exactPhraseMatched = indexedDocument.normalizedText.contains(normalizedQuery)
        if (matchedTerms.isEmpty() && !exactPhraseMatched) {
            return null
        }

        val coverage = if (queryTerms.isEmpty()) 0.0 else matchedTerms.size.toDouble() / queryTerms.size.toDouble()
        val termScore = matchedTerms.size.toDouble()
        val exactPhraseBonus = if (exactPhraseMatched) EXACT_PHRASE_BONUS else 0.0
        val fieldBonus = fieldBonus(indexedDocument.document.field)
        val score = (termScore + coverage + exactPhraseBonus + fieldBonus) * indexedDocument.document.weight

        return ImpressionSearchHit(
            document = indexedDocument.document,
            score = score,
            matchedTerms = matchedTerms,
        )
    }

    /** Additive bonus per document field; subject text ranks highest, evidence gets none. */
    private fun fieldBonus(field: ImpressionSearchField): Double = when (field) {
        ImpressionSearchField.SUBJECT -> 0.8
        ImpressionSearchField.FEATURE -> 0.35
        ImpressionSearchField.IMPRESSION -> 0.25
        ImpressionSearchField.RELATION -> 0.15
        ImpressionSearchField.EVIDENCE -> 0.0
    }

    /** Lowercases, collapses every whitespace run to one space, and trims the ends. */
    private fun normalize(text: String): String =
        text.lowercase()
            .replace(WHITESPACE_REGEX, " ")
            .trim()

    /** A stored document together with its normalized text and its index terms. */
    private data class IndexedDocument(
        val document: ImpressionSearchDocument,
        val normalizedText: String,
        val terms: Set<String>,
    )

    companion object {
        private const val EXACT_PHRASE_BONUS = 1.5
        private val WHITESPACE_REGEX = Regex("\\s+")
    }
}

View File

@@ -0,0 +1,54 @@
package work.slhaf.partner.module.impression;
import lombok.val;
import org.jetbrains.annotations.NotNull;
import work.slhaf.partner.core.cognition.CognitionCapability;
import work.slhaf.partner.core.cognition.context.ContextBlock;
import work.slhaf.partner.framework.agent.factory.capability.annotation.InjectCapability;
import work.slhaf.partner.framework.agent.factory.component.abstracts.AbstractAgentModule;
import work.slhaf.partner.runtime.PartnerRunningFlowContext;
import java.util.Collection;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Running module that recalls active entities for the current inputs and registers them
 * with the context workspace.
 */
public class ImpressionRecaller extends AbstractAgentModule.Running<PartnerRunningFlowContext> {

    @InjectCapability
    private CognitionCapability cognitionCapability;

    /**
     * Projects every input's content through {@link CognitionCapability#projectEntity},
     * merges the recalled active entities into one set, and registers a context block for
     * each of them in the context workspace.
     */
    @Override
    protected void doExecute(@NotNull PartnerRunningFlowContext context) {
        val workspace = cognitionCapability.contextWorkspace();
        context.getInputs()
                .stream()
                .flatMap(entry -> cognitionCapability.projectEntity(entry.getContent()).stream())
                // Collapse entities recalled by more than one input.
                .collect(Collectors.toSet())
                .forEach(recalled -> workspace.register(
                        new ContextBlock(
                                recalled,
                                recalled,
                                recalled,
                                Set.of(
                                        ContextBlock.FocusedDomain.COGNITION,
                                        ContextBlock.FocusedDomain.MEMORY
                                ),
                                100,
                                0.5,
                                20
                        )
                ));
    }

    /** Position of this module in the running flow. */
    @Override
    public int order() {
        return 2;
    }
}

View File

@@ -0,0 +1,226 @@
package work.slhaf.partner.core.cognition.impression.search
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Assertions.assertFalse
import org.junit.jupiter.api.Assertions.assertTrue
import org.junit.jupiter.api.Test
import work.slhaf.partner.core.cognition.impression.ActiveEntity
/**
 * Tests for [SimpleTextSearch] covering ranking, recall from generated active-entity
 * documents, index maintenance (upsert / removeByTarget / rebuild) and empty-result cases.
 * Every test uses the dictionary-based [TestTokenizer] declared at the bottom of the class.
 */
class SimpleTextSearchTest {
// A SUBJECT document must outrank an EVIDENCE document that matches the same terms.
@Test
fun `search ranks subject hit before evidence hit when both match similar terms`() {
val search = SimpleTextSearch(TestTokenizer())
val targetA = activeTarget("a")
val targetB = activeTarget("b")
search.rebuild(
listOf(
document("a-subject", targetA, ImpressionSearchField.SUBJECT, "城南旧书店老板", 1.0),
document("b-evidence", targetB, ImpressionSearchField.EVIDENCE, "用户提到城南旧书店附近有一家打印店", 0.8),
)
)
val hits = search.search("城南旧书店", limit = 10)
assertEquals(listOf("a-subject", "b-evidence"), hits.map { it.document.id })
assertTrue(hits.first().score > hits[1].score)
assertTrue(hits.first().matchedTerms.containsAll(setOf("城南", "旧书店")))
}
// An evidence document containing the whole query must rank above a subject document
// that matches only part of it.
@Test
fun `exact phrase match can beat partial subject match`() {
val search = SimpleTextSearch(TestTokenizer())
val partialSubject = activeTarget("partial")
val exactEvidence = activeTarget("exact")
search.rebuild(
listOf(
document("partial-subject", partialSubject, ImpressionSearchField.SUBJECT, "工程教材", 1.0),
document("exact-evidence", exactEvidence, ImpressionSearchField.EVIDENCE, "旧书店老板推荐过工程教材", 0.8),
)
)
val hits = search.search("旧书店老板推荐过工程教材", limit = 10)
assertEquals("exact-evidence", hits.first().document.id)
assertTrue(hits.first().matchedTerms.containsAll(setOf("旧书店", "老板", "推荐", "工程", "教材")))
}
// Documents are generated from three active entities; the bookstore entity must be the
// top hit for a bookstore-related query.
@Test
fun `search recalls bookstore owner from generated active entity documents`() {
val search = SimpleTextSearch(TestTokenizer())
val bookstoreOwner = activeEntity("bookstore", "城南旧书店老板") {
addEvidence("用户上周提到城南旧书店老板推荐过一本水利工程教材")
addProjectedFeatures("熟悉工程类旧书" to 0.9)
}
val technicalPartner = activeEntity("technical", "Java 技术搭子") {
addEvidence("用户正在讨论 Jieba 分词、SimpleTextSearch 和倒排索引")
addProjectedFeatures("熟悉 Kotlin 与检索实现" to 0.9)
}
val reportRoommate = activeEntity("report", "实验报告室友") {
addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
}
search.rebuild(
listOf(bookstoreOwner, technicalPartner, reportRoommate)
.flatMap(ImpressionSearchDocuments::fromActiveEntity)
)
val hits = search.search("旧书店老板推荐的工程教材", limit = 10)
assertFalse(hits.isEmpty())
assertEquals("bookstore", hits.first().document.target.id)
}
// A query made of implementation terms must recall the technical entity first.
@Test
fun `search recalls technical active entity from implementation terms`() {
val search = SimpleTextSearch(TestTokenizer())
val technicalPartner = activeEntity("technical", "Java 技术搭子") {
addEvidence("用户正在讨论 Jieba 分词、SimpleTextSearch 和倒排索引")
addProjectedImpressions("需要补充搜索召回测试" to 0.8)
}
val reportRoommate = activeEntity("report", "实验报告室友") {
addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
}
search.rebuild(
listOf(technicalPartner, reportRoommate)
.flatMap(ImpressionSearchDocuments::fromActiveEntity)
)
val hits = search.search("jieba 分词 SimpleTextSearch 倒排索引", limit = 10)
assertFalse(hits.isEmpty())
assertEquals("technical", hits.first().document.target.id)
}
// A query about the report task must recall the report entity first.
@Test
fun `search recalls report active entity from document task terms`() {
val search = SimpleTextSearch(TestTokenizer())
val technicalPartner = activeEntity("technical", "Java 技术搭子") {
addEvidence("用户正在讨论 Kotlin、Jieba 分词和 SimpleTextSearch")
}
val reportRoommate = activeEntity("report", "实验报告室友") {
addEvidence("用户帮室友整理 Vivado 进阶仿真实验报告模板和 docx 文件")
}
search.rebuild(
listOf(technicalPartner, reportRoommate)
.flatMap(ImpressionSearchDocuments::fromActiveEntity)
)
val hits = search.search("Vivado 实验报告模板", limit = 10)
assertFalse(hits.isEmpty())
assertEquals("report", hits.first().document.target.id)
}
// Upserting the same document id with new text must drop the old terms from the index.
@Test
fun `upsert replaces previous index terms for the same document id`() {
val search = SimpleTextSearch(TestTokenizer())
val target = activeTarget("entity")
search.upsert(document("doc", target, ImpressionSearchField.EVIDENCE, "旧书店老板", 1.0))
assertEquals(listOf("doc"), search.search("老板", limit = 10).map { it.document.id })
search.upsert(document("doc", target, ImpressionSearchField.EVIDENCE, "实验报告模板", 1.0))
assertTrue(search.search("老板", limit = 10).isEmpty())
assertEquals(listOf("doc"), search.search("实验报告", limit = 10).map { it.document.id })
}
// Removing a target must delete all of its documents and leave other targets untouched.
@Test
fun `removeByTarget removes all documents belonging to that target`() {
val search = SimpleTextSearch(TestTokenizer())
val removed = activeTarget("removed")
val kept = activeTarget("kept")
search.rebuild(
listOf(
document("removed-subject", removed, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0),
document("removed-evidence", removed, ImpressionSearchField.EVIDENCE, "工程教材", 0.8),
document("kept-evidence", kept, ImpressionSearchField.EVIDENCE, "实验报告模板", 0.8),
)
)
search.removeByTarget(removed)
val hits = search.search("实验报告", limit = 10)
assertEquals(listOf("kept-evidence"), hits.map { it.document.id })
assertFalse(hits.any { it.document.target == removed })
assertTrue(search.search("旧书店", limit = 10).isEmpty())
}
// A second rebuild must fully replace what the first rebuild indexed.
@Test
fun `rebuild clears previous documents and index terms`() {
val search = SimpleTextSearch(TestTokenizer())
val target = activeTarget("entity")
search.rebuild(listOf(document("old", target, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0)))
assertEquals(listOf("old"), search.search("老板", limit = 10).map { it.document.id })
search.rebuild(listOf(document("new", target, ImpressionSearchField.SUBJECT, "实验报告模板", 1.0)))
assertTrue(search.search("老板", limit = 10).isEmpty())
assertEquals(listOf("new"), search.search("实验报告", limit = 10).map { it.document.id })
}
// Blank queries, queries without any match, and a zero limit must all return no hits.
@Test
fun `blank unmatched and zero limit queries return empty hits`() {
val search = SimpleTextSearch(TestTokenizer())
val target = activeTarget("entity")
search.rebuild(listOf(document("doc", target, ImpressionSearchField.SUBJECT, "旧书店老板", 1.0)))
assertTrue(search.search(" ", limit = 10).isEmpty())
assertTrue(search.search("完全不存在", limit = 10).isEmpty())
assertTrue(search.search("旧书店", limit = 0).isEmpty())
}
// Builds an ACTIVE_ENTITY search target with the given id.
private fun activeTarget(id: String) =
ImpressionSearchTarget(ImpressionSearchTarget.Type.ACTIVE_ENTITY, id)
// Creates an ActiveEntity with a fixed runtime id, sets its subject, then applies the
// caller's configuration block to it.
private fun activeEntity(
runtimeId: String,
subject: String,
configure: ActiveEntity.() -> Unit,
): ActiveEntity = ActiveEntity(runtimeId = runtimeId).apply {
updateSubject(subject)
configure()
}
// Shorthand for constructing an ImpressionSearchDocument with positional arguments.
private fun document(
id: String,
target: ImpressionSearchTarget,
field: ImpressionSearchField,
text: String,
weight: Double,
) = ImpressionSearchDocument(
id = id,
target = target,
field = field,
text = text,
weight = weight,
)
// Dictionary-based tokenizer for tests: a term is emitted when the lowercased, trimmed
// text contains that dictionary word, plus every alphanumeric run matched by the regex.
private class TestTokenizer : ImpressionTokenizer {
private val dictionary = listOf(
"城南", "旧书店", "老板", "推荐", "工程", "教材", "水利", "熟悉", "旧书",
"java", "kotlin", "jieba", "分词", "simpletextsearch", "倒排", "索引", "检索", "测试", "召回",
"vivado", "实验报告", "实验", "报告", "模板", "docx", "室友", "整理", "文件"
)
private val alphaNumericRegex = Regex("[a-z0-9]+(?:[-_./][a-z0-9]+)*")
override fun tokenize(text: String): Set<String> {
val normalized = text.lowercase().trim()
// Blank input produces no terms.
if (normalized.isBlank()) {
return emptySet()
}
return buildSet {
dictionary.filterTo(this) { normalized.contains(it) }
alphaNumericRegex.findAll(normalized).mapTo(this) { it.value }
}
}
}
}

View File

@@ -0,0 +1,281 @@
# First Encounter Module / 初见模块设计草案
## 背景
Partner 当前已经不是“不能跑”的项目,但用户面对一个新的 agent 时,仍然会有明显的启动成本。
这个启动成本不完全来自工程状态,而来自互动预期的不确定:
- 不知道该怎么和它说话;
- 不知道它知道什么、不知道什么;
- 不知道它会不会误解用户;
- 不知道它能不能被纠正;
- 不知道纠正之后会不会真正改变后续行为。
因此,Partner 需要一个“初见模块”。
它解决的不是程序启动问题,而是关系和预期建立问题。
## 定位
初见模块不应该只是 `InitModule`。
`InitModule` 更像加载配置、初始化资源、检查运行状态;而初见模块面对的是用户第一次或重新面对 Partner 时的交互问题。
因此,代码层可以命名为:
```text
FirstEncounterModule
```
产品/概念层称为:
```text
初见模块
```
它的职责是:
> 在新用户、长时间未使用、上下文断裂、版本升级,或用户主动询问“你现在知道我什么”时,组织一次清醒、温和、可校准的开场。
## 与 Impression 模块的关系
初见模块应当依托 Impression,但不属于 ImpressionCore。
边界如下:
```text
ImpressionCore
负责存储、召回、更新关于用户、agent 自身、关系契约、项目上下文等印象。
FirstEncounterModule
负责判断是否进入初见/重逢模式,并将召回的印象组织成本轮对话可用的 EncounterFrame。
EncounterState
负责记录初见流程是否已经完成,以及哪些环节已经向用户公开。
```
也就是说:
> Impression 负责“我对你有什么印象”。
> FirstEncounterModule 负责“第一次见面时,我该如何使用这些印象”。
不应把开场策略、纠错协议、对话引导逻辑直接塞进 ImpressionCore,否则记忆模块会被迫承担表达和流程控制职责。
## 触发场景
初见模块可以在以下场景触发:
- 新用户第一次进入;
- 当前 session 没有足够上下文;
- 长时间未使用后重新进入;
- Partner 发生较大版本升级;
- Impression 召回结果置信度较低;
- 用户主动询问:
- “你知道我什么?”
- “你现在能做什么?”
- “我该怎么和你说话?”
- “你是不是还记得之前的事?”
- 系统检测到当前对话存在明显预期不稳定,例如用户多次纠正 agent 的语气、事实或任务边界。
## 核心流程
推荐流程:
```text
User Input
InteractionHub
EncounterDetector
ImpressionRecaller
FirstEncounterModule
EncounterFrame
PromptContributor / AppendPrompt
CoreModel Reply
ImpressionUpdater
```
其中:
1. `EncounterDetector` 判断是否需要进入初见/重逢模式;
2. `ImpressionRecaller` 召回相关印象;
3. `FirstEncounterModule` 将召回结果整理成 EncounterFrame;
4. `PromptContributor` 将 EncounterFrame 注入模型上下文;
5. 对话结束后,`ImpressionUpdater` 根据用户反馈更新印象。
## EncounterFrame
`EncounterFrame` 是初见模块的核心输出。它不是长期记忆,而是本轮对话使用的临时认知框架。
示例结构:
```kotlin
data class EncounterFrame(
val mode: EncounterMode,
val knownAboutUser: List<ImpressionProjection>,
val knownAboutSelf: List<ImpressionProjection>,
val knownAboutRelationship: List<ImpressionProjection>,
val uncertainty: List<String>,
val correctionProtocol: CorrectionProtocol,
val openingStrategy: OpeningStrategy
)
```
其中:
- `mode`:当前是初见、重逢、版本升级后再介绍,还是用户主动询问;
- `knownAboutUser`:关于用户的可靠印象;
- `knownAboutSelf`:Partner 对自身能力和边界的描述;
- `knownAboutRelationship`:关于互动方式、纠错方式、语气偏好等印象;
- `uncertainty`:当前不能确定的部分;
- `correctionProtocol`:用户如何纠正 Partner;
- `openingStrategy`:本次开场应采用的表达策略。
## Impression Subject 建议
为了支持初见模块,Impression 可以支持一些特殊 subject:
```text
user
agent_self
relationship_contract
interaction_preference
project_context
```
例如:
```text
user:
- 用户偏好技术回答直接,不喜欢客服腔。
- 用户面对陌生 agent 时会在意互动预期是否稳定。
- 用户更容易接受从一个小切口开始推进。
agent_self:
- Partner 当前不是完全成熟的 agent。
- Partner 应公开自己的已知、未知和不确定。
- Partner 不应该在缺少依据时假装熟悉用户。
relationship_contract:
- 用户可以直接纠正 Partner。
- Partner 需要区分事实错误、语气偏差、理解偏差和任务边界偏差。
- 纠正应作为后续 impression 更新的重要信号。
```
## 初见开场策略
初见模块不应一上来问很多问题,也不应假装已经充分了解用户。
更合适的开场结构是:
```text
我现在对你还没有足够稳定的了解。
我会先说明:
- 我目前知道什么;
- 我不知道什么;
- 你可以怎么纠正我;
- 我会如何处理这些纠正。
接下来我们可以从一个很小的任务开始。
```
在 prompt 中可组织为:
```text
你正在与用户进行初见/重逢式对话。
你目前可靠知道:
- 用户希望技术讨论直接、少废话;
- 用户对陌生 agent 的互动预期尚未建立;
- 用户不喜欢 agent 在缺少依据时假装熟悉。
你应该主动说明:
- 你知道什么;
- 你不知道什么;
- 用户可以如何纠正你;
- 你会如何处理纠正。
不要一次性问很多问题。
不要假装亲近。
先从一个很小的任务或对话入口开始。
```
## EncounterState
初见模块需要少量流程状态,但这些状态不一定属于 Impression。
示例:
```kotlin
data class EncounterState(
val hasIntroducedSelf: Boolean,
val hasShownKnownUnknown: Boolean,
val hasExplainedCorrectionProtocol: Boolean,
val firstEncounterCompleted: Boolean,
val lastEncounterVersion: String?
)
```
这些状态表示流程是否完成,而不是关于用户的长期印象。
真正应该进入 Impression 的,是对用户、关系、互动方式的理解,例如:
```text
用户面对新的 agent 时,会担心互动预期不稳定。
用户希望 agent 明确边界,而不是一上来装熟。
用户能接受通过纠正来校准 agent。
```
## 最小实现方案
第一版可以很轻,不需要完整工程化。
建议步骤:
1. 新增 `FirstEncounterPromptContributor`;
2. 新增 `EncounterDetector`,先用简单规则判断是否触发;
3. 通过 `ImpressionRecaller` 召回 `user`、`agent_self`、`relationship_contract`、`interaction_preference`、`project_context` 相关印象;
4. 生成一个简化版 `EncounterFrame`;
5. 将 EncounterFrame 注入 AppendPrompt;
6. 用户纠正后,将纠正内容作为 evidence 交给 ImpressionUpdater。
第一版不需要复杂策略模型,规则足够:
```text
新 session + 低熟悉度 → 初见模式
长时间未使用 + 有历史 impression → 重逢模式
用户主动询问已知/未知 → 自我公开模式
多次纠正 → 关系校准模式
```
## 不做什么
初见模块第一版不做以下内容:
- 不做完整 onboarding 表单;
- 不一次性询问大量偏好;
- 不把用户画像写死;
- 不假装已经理解用户;
- 不替代 ImpressionCore;
- 不直接负责长期记忆写入;
- 不在每轮对话中重复自我介绍。
它只负责在关系尚未稳定时,提供一个清醒、可纠正、可继续的开场。
## 价值
初见模块的价值不只是“第一次使用体验更好”。
它实际上补上了 Partner 作为 agent 的一个关键能力:
> 在上下文断裂、长期未见、版本变化或记忆不确定时,仍然能让用户知道该如何继续与它相处。
这使 Partner 不只是一个能运行的程序,而是一个能够建立互动预期、暴露不确定性、接受校准,并逐步形成稳定关系的 agent。

View File

@@ -0,0 +1,210 @@
# Impression Vector Fusion Plan
## Context
Current `ImpressionCore.projectEntity` already connects text recall to active entity projection:
```text
input
-> SimpleTextSearch.search(input)
-> group document hits by ImpressionSearchTarget
-> aggregate into EntityAssociationMatch
-> resolve ACTIVE_ENTITY or ENTITY target
-> append EntityEvidence
-> refresh active entity text-search documents
```
This gives the Impression module a first explainable recall path. Vector recall should not replace this path. It should become another recall signal that is fused with text recall before projection.
## Why not implement vector fusion immediately
Vector fusion is a recall-source enhancement, not the next foundation step.
Before adding more recall sources, the module still needs a clearer organization pipeline:
- how an unmatched input becomes a new `ActiveEntity`;
- how runtime evidence is accumulated, merged, or decayed;
- how an `ActiveEntity` is rolled into a long-term `Entity`;
- how extracted features and impressions update known entities;
- when `textSearch` and `vectorIndex` are refreshed after entity updates.
Unmatched entity creation and `ActiveEntity` rolling are closely related: both decide how temporary evidence becomes a stable entity-level impression. They should be considered as one organization chain rather than two unrelated features.
## Target shape
Future `projectEntity` should have this shape:
```text
input
-> text recall signals
-> vector recall signals
-> normalize scores
-> fuse signals by ImpressionSearchTarget
-> resolve or create ActiveEntity
-> append evidence
-> refresh runtime indexes
```
The later half should stay shared. Text recall, vector recall, relation recall, and recency recall should all produce association signals. Projection should not depend on which recall source produced a match.
## First vector scope
The first vector implementation should only recall long-term `ENTITY` targets.
Reason:
- `ImpressionVectorIndex` already syncs known `Entity` data.
- Known entities have relatively stable features and impressions.
- Active entity evidence changes frequently; embedding every new evidence item would add update cost and lifecycle complexity too early.
So the first vector target should be:
```text
Entity feature / impression vector
-> ImpressionSearchTarget(Type.ENTITY, entityUuid)
```
Later, after the active entity organization chain is stable, active evidence vectors can be added as:
```text
ActiveEntity evidence / projected feature / projected impression vector
-> ImpressionSearchTarget(Type.ACTIVE_ENTITY, runtimeId)
```
## Signal model
`EntityAssociationMatch` is currently text-oriented because it stores `List<ImpressionSearchHit>`.
For fusion, introduce a source-neutral signal model:
```kotlin
data class EntityAssociationSignal(
val target: ImpressionSearchTarget,
val source: Source,
val score: Double,
val reason: String,
val textHit: ImpressionSearchHit? = null,
val vectorHit: ImpressionVectorHit? = null,
) {
enum class Source {
TEXT,
VECTOR,
RELATION,
RECENCY
}
}
```
Then change or extend `EntityAssociationMatch` toward:
```kotlin
data class EntityAssociationMatch(
val target: ImpressionSearchTarget,
val score: Double,
val signals: List<EntityAssociationSignal> = emptyList(),
)
```
This keeps fusion explainable. A match can still tell the model or logs why an entity was recalled.
## Score normalization
Text search score and vector similarity should not be added directly.
Text search currently produces an internal score based on token hits, coverage, exact phrase bonus, field bonus, and document weight. Vector search is usually cosine-like similarity. Normalize both into association-strength-like values before fusion.
Possible first normalization:
```text
textScore01 = clamp(textScore / 5.0, 0.0, 1.0)
vectorScore01 =
similarity < 0.55 -> 0.0
otherwise -> clamp((similarity - 0.55) / 0.35, 0.0, 1.0)
```
The constants are placeholders. They should be tuned with tests and logs.
## Fusion rule
Use strong-hit priority with multi-source support, not simple averaging.
A first rule can be:
```text
targetScore =
max(bestTextScore, bestVectorScore * 0.9)
+ sameTargetCrossSourceBonus
+ supportingSignalBonus
```
Suggested behavior:
- direct subject or phrase text match should beat vague vector similarity;
- vector recall should recover semantically related entities when text recall is weak or empty;
- if text and vector both hit the same target, the target should receive a small confidence boost;
- long documents or many weak signals should not dominate a single strong subject/evidence hit.
## Execution strategy
First implementation can be conservative:
```text
always run TextSearch
run VectorSearch only when:
- text recall is empty; or
- top text match confidence is low; or
- input is long and semantic rather than name-like
```
If the embedding model is local and cheap enough, this can later become parallel text + vector recall.
## Implementation phases
### Phase 1: organization chain first
Implement before vector fusion:
- unmatched input -> new `ActiveEntity` candidate;
- active evidence update and dedup/merge rules;
- active entity rolling into known `Entity`;
- known entity feature/impression update;
- index refresh after entity updates.
### Phase 2: signal abstraction
Introduce `EntityAssociationSignal` and make text hits convert into signals.
Keep current behavior equivalent after refactor.
### Phase 3: long-term entity vector recall
Add vector recall only for known `Entity` targets:
```text
input embedding
-> ImpressionVectorIndex.search(...)
-> vector hits
-> EntityAssociationSignal(source = VECTOR)
-> fuse with text signals
```
### Phase 4: active entity vector recall
Only after active entity lifecycle is stable:
- vectorize active evidence or projected features;
- update active vector index when evidence changes;
- fuse `ACTIVE_ENTITY` vector hits with text hits.
## Non-goals for first vector pass
Do not start with:
- vectorizing every raw evidence item immediately;
- replacing text search ranking;
- using vector score as direct `associationConfidence` without normalization;
- adding opaque fusion that cannot explain why an entity was recalled;
- expanding `projectEntity` into a large source-specific method.
The intended direction is: multiple recall sources produce explainable signals, then `ImpressionCore` performs one shared entity projection flow.