feat(eval): add eval configs example (#19)

Duguce · web-flow · commit 3378fc0f914b · 2025-07-08T18:03:06.000+08:00
* feat(eval): add eval dependencies

* feat(eval): add configs example

* docs(eval): update README.md

* feat(eval): remove the dependency (pydantic)
diff --git a/docs/modules/mem_reader.md b/docs/modules/mem_reader.md
@@ -147,20 +147,20 @@ Documents are chunked and summarized to create searchable knowledge items.
 
 We use [`markitdown`](https://github.com/microsoft/markitdown) to convert files to Markdown format texts.
 
-**MarkItDown currently supports the conversion from:**  
+**MarkItDown currently supports the conversion from:**
 
 ```
-PDF  
-PowerPoint  
-Word  
-Excel  
-Images (EXIF metadata and OCR)  
-Audio (EXIF metadata and speech transcription)  
-HTML  
-Text-based formats (CSV, JSON, XML)  
-ZIP files (iterates over contents)  
-YouTube URLs  
-EPUBs  
+PDF
+PowerPoint
+Word
+Excel
+Images (EXIF metadata and OCR)
+Audio (EXIF metadata and speech transcription)
+HTML
+Text-based formats (CSV, JSON, XML)
+ZIP files (iterates over contents)
+YouTube URLs
+EPUBs
 ... and more!
 ```
 *(Content sourced from [MarkItDown GitHub repository](https://github.com/microsoft/markitdown))*
diff --git a/evaluation/.env-example b/evaluation/.env-example
@@ -0,0 +1,11 @@
+MODEL="gpt-4o-mini"
+OPENAI_API_KEY="sk-***REDACTED***"
+OPENAI_BASE_URL="http://***.***.***.***:3000/v1"
+
+MEM0_API_KEY="m0-***REDACTED***"
+
+ZEP_API_KEY="z_***REDACTED***"
+
+CHAT_MODEL="gpt-4o-mini"
+CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
+CHAT_MODEL_API_KEY="sk-***REDACTED***"
diff --git a/evaluation/README.md b/evaluation/README.md
@@ -1,6 +1,6 @@
 # Evaluation Memory Framework
 
-This repository provides tools and scripts for evaluating the LoCoMo and LongMemEval dataset using various models and APIs.
+This repository provides tools and scripts for evaluating the LoCoMo dataset using various models and APIs.
 
 ## Installation
 
@@ -17,67 +17,18 @@ This repository provides tools and scripts for evaluating the LoCoMo and LongMem
 
 ## Configuration
 
-Create an `.env` file in the `evaluation/` directory and include the following environment variables:
+1. Copy the `.env-example` file to `.env` and fill in the environment variables with your own API keys, base URLs, and model names.
 
-```plaintext
-OPENAI_API_KEY="sk-xxx"
-OPENAI_BASE_URL="your_base_url"
+2. Copy the `configs-example/` directory to a new directory named `configs/`, and modify the configuration files inside it as needed. This directory contains model- and API-specific settings.
 
-MEM0_API_KEY="your_mem0_api_key"
-MEM0_PROJECT_ID="your_mem0_proj_id"
-MEM0_ORGANIZATION_ID="your_mem0_org_id"
 
-MODEL="gpt-4o-mini"  # or your preferred model
-EMBEDDING_MODEL="text-embedding-3-small"  # or your preferred embedding model
-ZEP_API_KEY="your_zep_api_key"
-```
+## Evaluation Scripts
 
-## Dataset
-The smaller dataset "LoCoMo" has already been included in the repo to facilitate reproducing.
+### LoCoMo Evaluation
+To evaluate the **LoCoMo** dataset using one of the supported memory frameworks — `memos`, `mem0`, or `zep` — run the following command:
 
-To download the "LongMemEval" dataset, run the following command:
 ```bash
-huggingface-cli download --repo-type dataset --resume-download xiaowu0162/longmemeval --local-dir data/longmemeval
+# Edit the configuration in ./scripts/run_locomo_eval.sh
+# Specify the model and memory backend you want to use (e.g., memos, mem0, or zep)
+./scripts/run_locomo_eval.sh
 ```
-
-After downloading, rename the files as follows:
-- `longmemeval_m.json`
-- `longmemeval_s.json`
-- `longmemeval_oracle.json`
-
-## Evaluation Scripts
-
-To evaluate the `locomo` dataset, execute the following scripts in order:
-
-1. **Ingest locomo history into MemOS:**
-   ```bash
-   python scripts/locomo/locomo_ingestion.py --lib memos
-   ```
-
-2. **Search Memory for each QA pair in locomo:**
-   ```bash
-   python scripts/locomo/locomo_search.py --lib memos
-   ```
-
-3. **Generate responses from OpenAI with provided context:**
-   ```bash
-   python scripts/locomo/locomo_responses.py --lib memos
-   ```
-
-4. **Evaluate the generated answers:**
-   ```bash
-   python scripts/locomo/locomo_eval.py --lib memos
-   ```
-
-5. **Calculate fine-grained scores for each category:**
-   ```bash
-   python scripts/locomo/locomo_metric.py --lib memos
-   ```
-
-## Contributing Guidelines
-
-1. **Add New Metrics**
-When incorporating the evaluation of reflection duration, ensure to record related data in `{lib}_locomo_judged.json`. For additional NLP metrics like BLEU and ROUGE-L score, make adjustments to the `locomo_grader` function in `scripts/locomo/locomo_eval.py`.
-
-2. **Intermediate Results**
-While I have provided intermediate results like `{lib}_locomo_search_results.json`, `{lib}_locomo_responses.json`, and `{lib}_locomo_judged.json` for reproducibility, contributors are encouraged to report final results in the PR description rather than editing these files directly. Any valuable modifications will be combined into an updated version of the evaluation code containing revised intermediate results (at specified intervals).
diff --git a/evaluation/scripts/locomo/locomo_eval.py b/evaluation/scripts/locomo/locomo_eval.py
@@ -363,8 +363,8 @@ async def limited_task(task):
     parser.add_argument(
         "--lib",
         type=str,
-        choices=["zep", "memos", "mem0", "mem0_graph", "memos_mos", "langmem", "openai"],
-        help="Specify the memory framework (zep or memos or mem0 or mem0_graph or memos_mos)",
+        choices=["zep", "memos", "mem0", "mem0_graph", "langmem", "openai"],
+        help="Specify the memory framework (zep or memos or mem0 or mem0_graph)",
     )
     parser.add_argument(
         "--version",
diff --git a/evaluation/scripts/locomo/locomo_ingestion.py b/evaluation/scripts/locomo/locomo_ingestion.py
@@ -15,10 +15,8 @@
 
 from memos.configs.mem_cube import GeneralMemCubeConfig
 from memos.configs.mem_os import MOSConfig
-from memos.configs.memory import MemoryConfigFactory
 from memos.mem_cube.general import GeneralMemCube
 from memos.mem_os.main import MOS
-from memos.memories.factory import MemoryFactory
 
 
 custom_instructions = """
@@ -61,29 +59,10 @@ def get_client(frame: str, user_id: str | None = None, version: str = "default")
         return mem0
 
     elif frame == "memos":
-        config_path = "configs/text_memos_config.json"
-        with open(config_path) as f:
-            config_data = json.load(f)
-        config_data["config"]["extractor_llm"]["config"]["model_name_or_path"] = os.getenv("MODEL")
-        config_data["config"]["extractor_llm"]["config"]["api_key"] = os.getenv("OPENAI_API_KEY")
-        config_data["config"]["extractor_llm"]["config"]["api_base"] = os.getenv("OPENAI_BASE_URL")
-        config_data["config"]["vector_db"]["config"]["path"] = (
-            f"results/locomo/memos-{version}/storages/{user_id}/qdrant"
-        )
-        config_data["config"]["embedder"]["config"]["model_name_or_path"] = os.getenv(
-            "EMBEDDING_MODEL"
-        )
-
-        config = MemoryConfigFactory.model_validate(config_data)
-
-        m = MemoryFactory.from_config(config)
-        m.load(f"results/locomo/memos-{version}/storages/{user_id}")
-        return m
-
-    elif frame == "memos_mos":
         mos_config_path = "configs/mos_memos_config.json"
         with open(mos_config_path) as f:
             mos_config_data = json.load(f)
+        mos_config_data["top_k"] = 20
         mos_config = MOSConfig(**mos_config_data)
         mos = MOS(mos_config)
         mos.create_user(user_id=user_id)
@@ -147,20 +126,6 @@ def ingest_session(client, session, frame, metadata, revised_client=None):
             )
 
     elif frame == "memos":
-        for chat in tqdm(session, desc=f"{metadata['session_key']}"):
-            data = chat.get("speaker") + ": " + chat.get("text")
-            print({"context": data, "conv_id": conv_id, "created_at": iso_date})
-            msg = [{"role": "user", "content": data}]
-
-            try:
-                memories = client.extract(msg)
-            except Exception as ex:
-                print(f"Error extracting message {msg}: {ex}")
-                memories = []
-            print(memories)
-            client.add(memories)
-
-    elif frame == "memos_mos":
         messages = []
         messages_reverse = []
 
@@ -276,14 +241,11 @@ def process_user(conv_idx, frame, locomo_df, version, num_workers=1):
             client.delete_all(user_id=f"{conversation.get('speaker_a')}_{conv_idx}")
             client.delete_all(user_id=f"{conversation.get('speaker_b')}_{conv_idx}")
         elif frame == "memos":
-            conv_id = "locomo_exp_user_" + str(conv_idx)
-            client = get_client("memos", conv_id, version)
-        elif frame == "memos_mos":
             conv_id = "locomo_exp_user_" + str(conv_idx)
             speaker_a_user_id = conv_id + "_speaker_a"
             speaker_b_user_id = conv_id + "_speaker_b"
-            client = get_client("memos_mos", speaker_a_user_id, version)
-            revised_client = get_client("memos_mos", speaker_b_user_id, version)
+            client = get_client("memos", speaker_a_user_id, version)
+            revised_client = get_client("memos", speaker_b_user_id, version)
 
         sessions_to_process = []
         for session_idx in range(max_session_count):
@@ -324,11 +286,6 @@ def process_user(conv_idx, frame, locomo_df, version, num_workers=1):
                 except Exception as e:
                     print(f"Error processing user {conv_idx}, session {session_key}: {e!s}")
 
-        if frame == "memos":
-            conv_id = "locomo_exp_user_" + str(conv_idx)
-            client.dump(f"results/locomo/memos-{version}/storages/{conv_id}")
-            del client
-
         end_time = time.time()
         elapsed_time = round(end_time - start_time, 2)
         print(f"User {conv_idx} processed successfully in {elapsed_time} seconds")
@@ -383,8 +340,8 @@ def main(frame, version="default", num_workers=4):
     parser.add_argument(
         "--lib",
         type=str,
-        choices=["zep", "memos", "mem0", "mem0_graph", "memos_mos"],
-        help="Specify the memory framework (zep or memos or mem0 or mem0_graph or memos_mos)",
+        choices=["zep", "memos", "mem0", "mem0_graph"],
+        help="Specify the memory framework (zep or memos or mem0 or mem0_graph)",
     )
     parser.add_argument(
         "--version",
diff --git a/evaluation/scripts/locomo/locomo_metric.py b/evaluation/scripts/locomo/locomo_metric.py
@@ -9,8 +9,8 @@
 parser.add_argument(
     "--lib",
     type=str,
-    choices=["zep", "memos", "mem0", "mem0_graph", "memos_mos", "langmem", "openai"],
-    help="Specify the memory framework (zep or memos or mem0 or mem0_graph or memos_mos)",
+    choices=["zep", "memos", "mem0", "mem0_graph", "langmem", "openai"],
+    help="Specify the memory framework (zep or memos or mem0 or mem0_graph)",
 )
 parser.add_argument(
     "--version",
diff --git a/evaluation/scripts/locomo/locomo_responses.py b/evaluation/scripts/locomo/locomo_responses.py
@@ -24,7 +24,7 @@ async def locomo_response(frame, llm_client, context: str, question: str) -> str
             context=context,
             question=question,
         )
-    elif frame == "memos" or frame == "memos_mos":
+    elif frame == "memos":
         prompt = ANSWER_PROMPT_MEMOS.format(
             context=context,
             question=question,
@@ -124,8 +124,8 @@ async def main(frame, version="default"):
     parser.add_argument(
         "--lib",
         type=str,
-        choices=["zep", "memos", "mem0", "mem0_graph", "memos_mos", "openai"],
-        help="Specify the memory framework (zep or memos or mem0 or mem0_graph or memos_mos)",
+        choices=["zep", "memos", "mem0", "mem0_graph", "openai"],
+        help="Specify the memory framework (zep or memos or mem0 or mem0_graph)",
     )
     parser.add_argument(
         "--version",
diff --git a/evaluation/scripts/locomo/locomo_search.py b/evaluation/scripts/locomo/locomo_search.py
@@ -15,12 +15,10 @@
 from zep_cloud.client import Zep
 
 from memos.configs.mem_os import MOSConfig
-from memos.configs.memory import MemoryConfigFactory
 from memos.mem_os.main import MOS
-from memos.memories.factory import MemoryFactory
 
 
-def get_client(frame: str, user_id: str | None = None, version: str = "default"):
+def get_client(frame: str, user_id: str | None = None, version: str = "default", top_k: int = 20):
     if frame == "zep":
         zep = Zep(api_key=os.getenv("ZEP_API_KEY"), base_url="https://api.getzep.com/api/v2")
         return zep
@@ -30,29 +28,10 @@ def get_client(frame: str, user_id: str | None = None, version: str = "default")
         return mem0
 
     elif frame == "memos":
-        config_path = "configs/text_memos_config.json"
-        with open(config_path) as f:
-            config_data = json.load(f)
-        config_data["config"]["extractor_llm"]["config"]["model_name_or_path"] = os.getenv("MODEL")
-        config_data["config"]["extractor_llm"]["config"]["api_key"] = os.getenv("OPENAI_API_KEY")
-        config_data["config"]["extractor_llm"]["config"]["api_base"] = os.getenv("OPENAI_BASE_URL")
-        config_data["config"]["vector_db"]["config"]["path"] = (
-            f"results/locomo/memos-{version}/storages/{user_id}/qdrant"
-        )
-        config_data["config"]["embedder"]["config"]["model_name_or_path"] = os.getenv(
-            "EMBEDDING_MODEL"
-        )
-
-        config = MemoryConfigFactory.model_validate(config_data)
-
-        m = MemoryFactory.from_config(config)
-        m.load(f"results/locomo/memos-{version}/storages/{user_id}")
-        return m
-
-    elif frame == "memos_mos":
         mos_config_path = "configs/mos_memos_config.json"
         with open(mos_config_path) as f:
             mos_config_data = json.load(f)
+        mos_config_data["top_k"] = top_k
         mos_config = MOSConfig(**mos_config_data)
         mos = MOS(mos_config)
         mos.create_user(user_id=user_id)
@@ -123,18 +102,6 @@ def get_client(frame: str, user_id: str | None = None, version: str = "default")
 """
 
 
-def memos_search(client, query):
-    start = time()
-    search_results = client.search(query, top_k=20)
-    context = ""
-    for item in search_results:
-        item = item.to_dict()
-        context += f"{item['memory']}\n"
-    print(query, context)
-    duration_ms = (time() - start) * 1000
-    return context, duration_ms
-
-
 def mem0_search(client, query, speaker_a_user_id, speaker_b_user_id, top_k=20):
     start = time()
     search_speaker_a_results = client.search(
@@ -192,7 +159,7 @@ def mem0_search(client, query, speaker_a_user_id, speaker_b_user_id, top_k=20):
     return context, duration_ms
 
 
-def memos_mos_search(client, query, conv_id, speaker_a, speaker_b, reversed_client=None):
+def memos_search(client, query, conv_id, speaker_a, speaker_b, reversed_client=None):
     start = time()
     search_a_results = client.search(
         query=query,
@@ -349,8 +316,8 @@ def search_query(client, query, metadata, frame, reversed_client=None, top_k=20)
         context, duration_ms = mem0_graph_search(
             client, query, speaker_a_user_id, speaker_b_user_id, top_k
         )
-    elif frame == "memos_mos":
-        context, duration_ms = memos_mos_search(
+    elif frame == "memos":
+        context, duration_ms = memos_search(
             client, query, conv_id, speaker_a, speaker_b, reversed_client
         )
     return context, duration_ms
@@ -394,11 +361,11 @@ def process_user(group_idx, locomo_df, frame, version, top_k=20, num_workers=1):
     }
 
     reversed_client = None
-    if frame == "memos_mos":
+    if frame == "memos":
         speaker_a_user_id = conv_id + "_speaker_a"
         speaker_b_user_id = conv_id + "_speaker_b"
-        client = get_client(frame, speaker_a_user_id, version)
-        reversed_client = get_client(frame, speaker_b_user_id, version)
+        client = get_client(frame, speaker_a_user_id, version, top_k=top_k)
+        reversed_client = get_client(frame, speaker_b_user_id, version, top_k=top_k)
     else:
         client = get_client(frame, conv_id, version)
 
@@ -474,8 +441,8 @@ def main(frame, version="default", num_workers=1, top_k=20):
     parser.add_argument(
         "--lib",
         type=str,
-        choices=["zep", "memos", "mem0", "mem0_graph", "memos_mos", "langmem"],
-        help="Specify the memory framework (zep or memos or mem0 or mem0_graph or memos_mos)",
+        choices=["zep", "memos", "mem0", "mem0_graph", "langmem"],
+        help="Specify the memory framework (zep or memos or mem0 or mem0_graph)",
     )
     parser.add_argument(
         "--version",
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml