enginoid
diff --git a/‎.gitignore‎
Lines changed: 6 additions & 0 deletions b/‎.gitignore‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 24 additions & 0 deletions b/‎README.md‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎__init__.py‎ b/‎__init__.py‎
diff --git a/‎cli.py‎
Lines changed: 12 additions & 0 deletions b/‎cli.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docker-compose.yml‎
Lines changed: 23 additions & 0 deletions b/‎docker-compose.yml‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎ingest_to_meilisearch.py‎
Lines changed: 27 additions & 0 deletions b/‎ingest_to_meilisearch.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎ingest_to_qdrant.py‎
Lines changed: 63 additions & 0 deletions b/‎ingest_to_qdrant.py‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎preprocess_dataset.py‎
Lines changed: 62 additions & 0 deletions b/‎preprocess_dataset.py‎
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,6 @@
+documents.json
+arxiv-metadata-oai-snapshot.json
+secrets.json
+
+meili_data/
+qdrant_storage/
@@ -0,0 +1,24 @@
+# Arxiv QA
+
+Retrieval-augmented generation example that answers questions from Arxiv abstracts and titles.
+
+## Setup
+
+* Copy `secrets-example.json` to `secrets.json` and fill in your own key.
+* Fetch `arxiv-metadata-oai-snapshot.json`
+  * `kaggle datasets download -d Cornell-University/arxiv`
+* Run `preprocess_dataset.py`
+   * Input file: `arxiv-metadata-oai-snapshot.json`
+   * Output file: `documents.json` (a bit smaller)
+* `docker compose up -d` to run MeiliSearch and Qdrant
+* Then
+    * `ingest_to_meilisearch.py`
+    * `ingest_to_qdrant.py`
+        * You'll want a GPU 😁; use `nvitop` to check that the GPU is actually being used.
+        * Example performance: g5.xlarge (1x A10G), ~600k abstracts, ~12 minutes
+* Finally, run `query.py` to ask some questions.
+
+## Other tips
+
+* A simple search UI for trying out Meilisearch keyword lookup is served at `http://localhost:8080/`.
+* `cli.py` could be useful but at the moment only exposes `meilisearch_index` and `meilisearch_client`
@@ -0,0 +1,12 @@
+import subprocess
+
+# Python statements that the IPython session executes on startup, so that
+# `meilisearch_client` and `meilisearch_index` are already defined at the prompt.
+command = "".join(
+    statement + "\n"
+    for statement in (
+        "import meilisearch",
+        "meilisearch_client = meilisearch.Client('http://127.0.0.1:7700')",
+        'meilisearch_index = meilisearch_client.index("papers")',
+    )
+)
+
+# `-c` passes the snippet to IPython and `-i` is given alongside it so the
+# shell is interactive rather than a one-shot run.
+ipython_argv = ["ipython", "-i", "-c", command]
+
+try:
+    subprocess.run(ipython_argv)
+except FileNotFoundError:
+    # Raised when no `ipython` executable can be found to launch.
+    print("IPython is not installed. Please install it by running: pip install ipython")
@@ -0,0 +1,23 @@
+# Local development stack: the two search backends plus a static search UI.
+version: '3'
+services:
+  # Qdrant, used by ingest_to_qdrant.py to store the abstract vectors.
+  # Its storage directory is bind-mounted so data survives container restarts.
+  qdrant:
+    image: qdrant/qdrant
+    ports:
+      - 6333:6333
+    volumes:
+      - ./qdrant_storage:/qdrant/storage
+
+  # Meilisearch keyword index; the ingestion script and cli.py connect to
+  # it on port 7700.
+  meilisearch:
+    image: getmeili/meilisearch:v1.2
+    ports:
+      - 7700:7700
+    volumes:
+      - ./meili_data:/meili_data
+  
+  # nginx serving the static files in ./search_ui (mounted read-only),
+  # published on host port 8080.
+  search-ui:
+    image: nginx:latest
+    volumes:
+      - ./search_ui:/usr/share/nginx/html:ro
+    ports:
+      - 8080:80
+    restart: always
@@ -0,0 +1,27 @@
+"""
+This module reads all the docs and ingests them into MeiliSearch.
+"""
+import json
+import meilisearch
+from tqdm import tqdm
+
+def yield_docs():
+    """Lazily yield the documents stored in ``documents.json``.
+
+    The file is read as JSON Lines: every line is parsed on its own with
+    ``json.loads`` and yielded in file order. The file stays open only
+    while the generator is being consumed.
+    """
+    with open("documents.json", "r") as source:
+        yield from map(json.loads, source)
+
+# Materialize every document up front so the list can be sliced into batches.
+docs = list(yield_docs())
+
+client = meilisearch.Client('http://127.0.0.1:7700')
+
+index = client.index("papers")
+
+# Meilisearch document ids may only contain alphanumerics, hyphens and
+# underscores. New-style arXiv ids contain "." (e.g. "0704.0001") and
+# old-style ids contain "/" (e.g. "cs/0001001"), so map both to "-";
+# otherwise the batches holding old-style ids would be rejected.
+for doc in docs:
+    doc["id"] = doc["id"].replace(".", "-").replace("/", "-")
+
+batch_size = 100
+chunked_docs = [docs[i:i + batch_size] for i in range(0, len(docs), batch_size)]
+
+# NOTE(review): add_documents appears to only enqueue an indexing task, so a
+# finished progress bar may not mean every batch was accepted — confirm by
+# checking the task statuses in Meilisearch.
+for doc_chunk in tqdm(chunked_docs, desc="Indexing documents"):
+    index.add_documents(doc_chunk, primary_key="id")
@@ -0,0 +1,63 @@
+
+
+from qdrant_client import models, QdrantClient
+import hashlib
+from concurrent.futures import ProcessPoolExecutor
+import json
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+import os
+
+# Set before any encoding work starts.
+# NOTE(review): presumably this disables the tokenizer library's own
+# parallelism so it does not clash with the worker processes started by the
+# ProcessPoolExecutor further down — confirm.
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+def upload_records_process(documents_chunk):
+    """Upload one chunk of already-embedded documents to the "papers" collection.
+
+    This function is submitted to a ``ProcessPoolExecutor``, so it creates
+    its own ``QdrantClient`` instead of relying on one from the parent
+    process.
+
+    Args:
+        documents_chunk: Iterable of document dicts. Each must carry an
+            ``"id"`` string and a ``"vector"`` embedding in addition to
+            its other fields.
+    """
+    qdrant = QdrantClient()
+
+    records = []
+    for doc in documents_chunk:
+        # Keep the embedding out of the payload: it is already stored as the
+        # record's vector, and passing the whole dict would store it twice.
+        payload = {key: value for key, value in doc.items() if key != "vector"}
+        records.append(
+            models.Record(
+                # Record id derived from the document id via its md5 hex
+                # digest, so re-ingesting a document yields the same id.
+                id=hashlib.md5(doc["id"].encode()).hexdigest(),
+                vector=doc["vector"],
+                payload=payload,
+            )
+        )
+
+    qdrant.upload_records("papers", records)
+
+
+def main():
+    """Embed every abstract in ``documents.json`` and upload it to Qdrant.
+
+    The "papers" collection is recreated from scratch on every run. Encoding
+    happens in this process, while finished chunks are handed to a small pool
+    of worker processes for upload.
+
+    Raises:
+        Exception: whatever an upload worker raised; the error is re-raised
+            here instead of being silently dropped.
+    """
+    print("Loading encoder...")
+    encoder = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
+
+    print("Opening documents file...")
+
+    with open("documents.json", "r") as fp:
+        documents_list = [json.loads(line) for line in fp]
+
+    print(f"Indexing {len(documents_list)} documents...")
+
+    batch_size = 4096
+    documents_list_chunked = [
+        documents_list[i:i + batch_size]
+        for i in range(0, len(documents_list), batch_size)
+    ]
+
+    qdrant = QdrantClient()
+    qdrant.recreate_collection(
+        collection_name="papers",
+        vectors_config=models.VectorParams(
+            size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
+            distance=models.Distance.COSINE
+        )
+    )
+
+    # We want to upload the documents in parallel with continuing
+    # to encode the next batch of documents. If we don't do this,
+    # then we have a lot of GPU idle time while docs are being
+    # uploaded to Qdrant.
+    with ProcessPoolExecutor(max_workers=3) as upload_executor:
+        upload_futures = []
+        for documents_chunk in tqdm(documents_list_chunked, desc="Processing document chunks"):
+            abstracts = encoder.encode([doc["abstract"] for doc in documents_chunk])
+            for doc, embedding in zip(documents_chunk, abstracts):
+                doc["vector"] = embedding.tolist()
+
+            upload_futures.append(
+                upload_executor.submit(upload_records_process, documents_chunk)
+            )
+
+        # An exception raised in a worker is only stored on its future;
+        # result() re-raises it, so a failed upload cannot pass unnoticed.
+        for future in upload_futures:
+            future.result()
+
+
+# The guard keeps worker processes from re-running the ingestion (including
+# recreate_collection) when they re-import this script, which happens under
+# the "spawn" start method.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,62 @@
+import json
+from typing import Generator
+
+def get_dataset_generator(path: str) -> Generator:
+    """Yield one parsed JSON object per non-blank line of the file at *path*.
+
+    Args:
+        path: Location of a JSON Lines file (one JSON document per line).
+
+    Yields:
+        The value returned by ``json.loads`` for each line, in file order.
+
+    Raises:
+        FileNotFoundError: if *path* does not exist (raised on first iteration).
+        json.JSONDecodeError: if a non-blank line is not valid JSON.
+    """
+    # Pin the encoding so non-ASCII text decodes the same way on every
+    # platform instead of depending on the locale's default encoding.
+    with open(path, "r", encoding="utf-8") as fp:
+        for line in fp:
+            # Tolerate blank lines (e.g. a trailing newline at end of file)
+            # rather than failing on them in json.loads.
+            if not line.strip():
+                continue
+            yield json.loads(line)
+        
+
+def filter_generator(g: Generator, filter_fn):
+    """Lazily yield only the items of *g* for which ``filter_fn(item)`` is truthy.
+
+    Items are pulled from *g* one at a time and keep their original order.
+    """
+    yield from (candidate for candidate in g if filter_fn(candidate))
+
+def stop_after(g, num_items):
+    """Yield at most *num_items* items from *g*, then stop.
+
+    Exactly the yielded items are consumed from *g*: no extra element is
+    pulled from the underlying iterator once the limit is reached, so the
+    caller can keep reading from *g* afterwards without losing an item.
+
+    Args:
+        g: Any iterable or iterator.
+        num_items: Maximum number of items to yield. ``0`` yields nothing;
+            a negative value never matches the counter, so nothing is cut off.
+    """
+    if num_items == 0:
+        return
+    for count, item in enumerate(g, start=1):
+        yield item
+        # Stop right after the last wanted item instead of fetching the
+        # next one first and throwing it away.
+        if count == num_items:
+            return
+
+def clean_document(doc):
+    """Build the reduced document record from a raw metadata row.
+
+    Only ``id``, ``title``, ``abstract``, ``categories`` and ``update_date``
+    are kept. Newlines in the title become spaces, and the space-separated
+    category string is split into a list.
+    """
+    cleaned = {"id": doc["id"]}
+    cleaned["title"] = doc["title"].replace("\n", " ")
+    cleaned["abstract"] = doc["abstract"]
+    cleaned["categories"] = doc["categories"].split(" ")
+    cleaned["update_date"] = doc["update_date"]
+    return cleaned
+
+documents_list = []
+try:
+    # Reuse the output of a previous run when it exists.
+    with open("documents.json", "r", encoding="utf-8") as fp:
+        documents_list = [json.loads(line) for line in fp]
+except FileNotFoundError:
+    cache_missing = True
+else:
+    cache_missing = False
+
+# The regeneration runs outside the `except` block so that any error it
+# raises (e.g. a missing snapshot file) is reported on its own instead of
+# being chained to the unrelated "documents.json not found" error.
+if cache_missing:
+    import os  # local import: only needed for the atomic rename below
+
+    def filter_relevant(doc):
+        """Return True when any of the document's categories starts with ``cs.``."""
+        return any(category.startswith("cs.") for category in doc["categories"])
+
+    dataset_generator = get_dataset_generator(
+        path="arxiv-metadata-oai-snapshot.json"
+    )
+
+    # Clean first: filter_relevant iterates the category *list* that
+    # clean_document produces from the raw space-separated string.
+    documents = map(clean_document, dataset_generator)
+    documents = filter(filter_relevant, documents)
+
+    print("Generating in-memory documents structure")
+    documents_list = list(documents)
+
+    print(f"Writing {len(documents_list)} documents...")
+    # Write to a temporary name and rename once complete, so an interrupted
+    # run cannot leave a truncated documents.json that later runs would
+    # mistake for a complete cache.
+    partial_path = "documents.json.partial"
+    with open(partial_path, "w", encoding="utf-8") as fp:
+        for doc in documents_list:
+            fp.write(json.dumps(doc) + "\n")
+    os.replace(partial_path, "documents.json")
+
+print("Document examples:")
+for doc in documents_list[:3]:
+    print(f"[{doc['update_date']}] {doc['title']} ({doc['categories']})")
+