Skip to content

Commit 0c20349

Browse files
authored
feat/shallow index (#40)
* feat(index): add shallow index support (INDEX_FILE_SHALLOW); builder method for file list; manager load/build shallow with find_files fallback * feat(index): default to shallow index; add ShallowIndexManager; deep rebuild tool; remove build_shallow_index tool; watcher+refresh use shallow; fix server context type * feat(find_files): enforce true glob semantics (* no dir, ** recursive) and unify to shallow index; watcher verified for add/delete * feat(file_summary): return needs_deep_index hint when deep index not available
1 parent 8c87080 commit 0c20349

12 files changed

+491
-39
lines changed

src/code_index_mcp/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
# Directory and file names
66
SETTINGS_DIR = "code_indexer"
77
CONFIG_FILE = "config.json"
8-
INDEX_FILE = "index.json" # JSON index file
8+
INDEX_FILE = "index.json" # JSON index file (deep index)
9+
INDEX_FILE_SHALLOW = "index.shallow.json" # Minimal shallow index (file list)
910

1011
# Supported file extensions for code analysis
1112
# This is the authoritative list used by both old and new indexing systems

src/code_index_mcp/indexing/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# New JSON-based indexing system
1414
from .json_index_builder import JSONIndexBuilder, IndexMetadata
1515
from .json_index_manager import JSONIndexManager, get_index_manager
16+
from .shallow_index_manager import ShallowIndexManager, get_shallow_index_manager
17+
from .deep_index_manager import DeepIndexManager
1618
from .models import SymbolInfo, FileInfo
1719

1820
__all__ = [
@@ -21,6 +23,9 @@
2123
'JSONIndexBuilder',
2224
'JSONIndexManager',
2325
'get_index_manager',
26+
'ShallowIndexManager',
27+
'get_shallow_index_manager',
28+
'DeepIndexManager',
2429
'SymbolInfo',
2530
'FileInfo',
2631
'IndexMetadata'
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""
2+
Deep Index Manager - Wrapper around JSONIndexManager for deep indexing.
3+
4+
This class provides a clear semantic separation from the shallow manager.
5+
It delegates to the existing JSONIndexManager (symbols + files JSON index).
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from typing import Optional, Dict, Any, List
11+
12+
from .json_index_manager import JSONIndexManager
13+
14+
15+
class DeepIndexManager:
    """
    Facade for the deep (symbols + files) index.

    Every public method forwards to a privately owned ``JSONIndexManager``
    instance and returns whatever that manager returns; no extra logic is
    added here.
    """

    def __init__(self) -> None:
        """Create the wrapped ``JSONIndexManager`` instance."""
        self._mgr = JSONIndexManager()

    # Only a subset of the wrapped manager's API is exposed, to keep callers simple.

    def set_project_path(self, project_path: str) -> bool:
        """Forward ``project_path`` to the wrapped manager's ``set_project_path``."""
        outcome = self._mgr.set_project_path(project_path)
        return outcome

    def build_index(self, force_rebuild: bool = False) -> bool:
        """Forward to the wrapped manager's ``build_index`` with ``force_rebuild``."""
        outcome = self._mgr.build_index(force_rebuild=force_rebuild)
        return outcome

    def load_index(self) -> bool:
        """Forward to the wrapped manager's ``load_index``."""
        outcome = self._mgr.load_index()
        return outcome

    def refresh_index(self) -> bool:
        """Forward to the wrapped manager's ``refresh_index``."""
        outcome = self._mgr.refresh_index()
        return outcome

    def find_files(self, pattern: str = "*") -> List[str]:
        """Forward ``pattern`` to the wrapped manager's ``find_files``."""
        matches = self._mgr.find_files(pattern)
        return matches

    def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Forward ``file_path`` to the wrapped manager's ``get_file_summary``."""
        summary = self._mgr.get_file_summary(file_path)
        return summary

    def get_index_stats(self) -> Dict[str, Any]:
        """Forward to the wrapped manager's ``get_index_stats``."""
        stats = self._mgr.get_index_stats()
        return stats

    def cleanup(self) -> None:
        """Forward to the wrapped manager's ``cleanup``."""
        self._mgr.cleanup()
45+
46+

src/code_index_mcp/indexing/json_index_builder.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,31 @@ def _get_supported_files(self) -> List[str]:
274274
logger.debug(f"Found {len(supported_files)} supported files")
275275
return supported_files
276276

277+
def build_shallow_file_list(self) -> List[str]:
    """
    Produce the shallow index: supported files as project-relative paths.

    The absolute paths returned by ``self._get_supported_files()`` are each
    made relative to ``self.project_path``, converted to forward slashes and
    stripped of a leading './'.

    Returns:
        List of relative file paths (using '/'). If any step raises, the
        error is logged and an empty list is returned.
    """
    def _to_relative(abs_path: str) -> str:
        # Relative to the project root, with '/' separators on every platform.
        relative = os.path.relpath(abs_path, self.project_path).replace('\\', '/')
        # Normalize leading './'
        return relative[2:] if relative.startswith('./') else relative

    try:
        return [_to_relative(path) for path in self._get_supported_files()]
    except Exception as e:
        logger.error(f"Failed to build shallow file list: {e}")
        return []
301+
277302
def save_index(self, index: Dict[str, Any], index_path: str) -> bool:
278303
"""
279304
Save index to disk.

src/code_index_mcp/indexing/json_index_manager.py

Lines changed: 108 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@
99
import json
1010
import logging
1111
import os
12+
import re
1213
import tempfile
1314
import threading
1415
import fnmatch
1516
from pathlib import Path
1617
from typing import Dict, List, Optional, Any
1718

1819
from .json_index_builder import JSONIndexBuilder
19-
from ..constants import SETTINGS_DIR, INDEX_FILE
20+
from ..constants import SETTINGS_DIR, INDEX_FILE, INDEX_FILE_SHALLOW
2021

2122
logger = logging.getLogger(__name__)
2223

@@ -29,6 +30,8 @@ def __init__(self):
2930
self.index_builder: Optional[JSONIndexBuilder] = None
3031
self.temp_dir: Optional[str] = None
3132
self.index_path: Optional[str] = None
33+
self.shallow_index_path: Optional[str] = None
34+
self._shallow_file_list: Optional[List[str]] = None
3235
self._lock = threading.RLock()
3336
logger.info("Initialized JSON Index Manager")
3437

@@ -59,6 +62,7 @@ def set_project_path(self, project_path: str) -> bool:
5962
os.makedirs(self.temp_dir, exist_ok=True)
6063

6164
self.index_path = os.path.join(self.temp_dir, INDEX_FILE)
65+
self.shallow_index_path = os.path.join(self.temp_dir, INDEX_FILE_SHALLOW)
6266

6367
logger.info(f"Set project path: {project_path}")
6468
logger.info(f"Index storage: {self.index_path}")
@@ -114,6 +118,52 @@ def load_index(self) -> bool:
114118
logger.error(f"Failed to load index: {e}")
115119
return False
116120

121+
def build_shallow_index(self) -> bool:
    """
    Build the shallow index (file list), write it to disk and cache it.

    The list comes from ``self.index_builder.build_shallow_file_list()`` and
    is written as a JSON array to ``self.shallow_index_path``; on success the
    same list is kept in ``self._shallow_file_list``.

    Returns:
        True when the file was written; False when the builder, project path
        or shallow index path is unset, or when building/writing raises (the
        error is logged).
    """
    with self._lock:
        initialized = self.index_builder and self.project_path and self.shallow_index_path
        if not initialized:
            logger.error("Index builder not initialized for shallow index")
            return False

        try:
            paths = self.index_builder.build_shallow_file_list()
            # Persist as a JSON array for minimal overhead
            with open(self.shallow_index_path, 'w', encoding='utf-8') as handle:
                json.dump(paths, handle, ensure_ascii=False)
            self._shallow_file_list = paths
            logger.info(f"Saved shallow index with {len(paths)} files to {self.shallow_index_path}")
            return True
        except Exception as e:
            logger.error(f"Failed to build shallow index: {e}")
            return False
139+
140+
def load_shallow_index(self) -> bool:
    """
    Read the shallow index (file list) from disk into memory.

    The file at ``self.shallow_index_path`` must contain a JSON list. String
    entries are normalized (backslashes become '/', a leading './' is
    removed) and stored in ``self._shallow_file_list``; non-string entries
    are dropped.

    Returns:
        True when the list was loaded; False when the path is unset or
        missing, the JSON is not a list, or reading raises (the error is
        logged).
    """
    with self._lock:
        try:
            index_file = self.shallow_index_path
            if not (index_file and os.path.exists(index_file)):
                logger.warning("No existing shallow index found")
                return False

            with open(index_file, 'r', encoding='utf-8') as handle:
                entries = json.load(handle)

            if not isinstance(entries, list):
                logger.error("Shallow index format invalid (expected list)")
                return False

            # Normalize paths
            cleaned = []
            for entry in entries:
                if not isinstance(entry, str):
                    continue
                path = entry.replace('\\\\', '/').replace('\\', '/')
                cleaned.append(path[2:] if path.startswith('./') else path)

            self._shallow_file_list = cleaned
            logger.info(f"Loaded shallow index with {len(cleaned)} files")
            return True
        except Exception as e:
            logger.error(f"Failed to load shallow index: {e}")
            return False
166+
117167
def refresh_index(self) -> bool:
118168
"""Refresh the index (rebuild and reload)."""
119169
with self._lock:
@@ -123,7 +173,14 @@ def refresh_index(self) -> bool:
123173
return False
124174

125175
def find_files(self, pattern: str = "*") -> List[str]:
126-
"""Find files matching a pattern."""
176+
"""
177+
Find files matching a glob pattern using the SHALLOW file list only.
178+
179+
Notes:
180+
- '*' does not cross '/'
181+
- '**' matches across directories
182+
- Always sources from the shallow index for consistency and speed
183+
"""
127184
with self._lock:
128185
# Input validation
129186
if not isinstance(pattern, str):
@@ -134,18 +191,27 @@ def find_files(self, pattern: str = "*") -> List[str]:
134191
if not pattern:
135192
pattern = "*"
136193

137-
if not self.index_builder or not self.index_builder.in_memory_index:
138-
logger.warning("Index not loaded")
139-
return []
194+
# Normalize to forward slashes
195+
norm_pattern = pattern.replace('\\\\', '/').replace('\\', '/')
196+
197+
# Build glob regex: '*' does not cross '/', '**' crosses directories
198+
regex = self._compile_glob_regex(norm_pattern)
140199

200+
# Always use shallow index for file discovery
141201
try:
142-
files = list(self.index_builder.in_memory_index["files"].keys())
202+
if self._shallow_file_list is None:
203+
# Try load existing shallow index; if missing, build then load
204+
if not self.load_shallow_index():
205+
# If still not available, attempt to build
206+
if self.build_shallow_index():
207+
self.load_shallow_index()
143208

144-
if pattern == "*":
209+
files = list(self._shallow_file_list or [])
210+
211+
if norm_pattern == "*":
145212
return files
146213

147-
# Simple pattern matching
148-
return [f for f in files if fnmatch.fnmatch(f, pattern)]
214+
return [f for f in files if regex.match(f) is not None]
149215

150216
except Exception as e:
151217
logger.error(f"Error finding files: {e}")
@@ -356,6 +422,39 @@ def cleanup(self):
356422
self.index_path = None
357423
logger.info("Cleaned up JSON Index Manager")
358424

425+
@staticmethod
def _compile_glob_regex(pattern: str) -> re.Pattern:
    """
    Compile a glob pattern into an anchored regular expression.

    Semantics:
        - '*'   matches any run of characters except '/'
        - '?'   matches exactly one character except '/'
        - '**/' at the start of a path segment matches zero or more whole
                directories, so '**/*.py' also matches a top-level 'a.py'
        - '**'  anywhere else matches any run of characters, including '/'
        - every other character is matched literally (no character classes)

    Examples:
        src/*.py        -> direct children .py under src
        **/*.py         -> .py at any depth, including the root
        src/**/test.py  -> src/test.py and src/a/b/test.py

    Args:
        pattern: Glob pattern using '/' as the path separator.

    Returns:
        Compiled pattern anchored with '^' and '$'; use it with ``match``.
    """
    parts = []
    i = 0
    length = len(pattern)
    while i < length:
        char = pattern[i]
        if pattern.startswith('**', i):
            at_segment_start = i == 0 or pattern[i - 1] == '/'
            if at_segment_start and pattern.startswith('/', i + 2):
                # '**/' may stand for no directory at all, so the directory
                # prefix (including its trailing slash) is optional.
                parts.append('(?:.*/)?')
                i += 3
            else:
                # '**' -> match across directories
                parts.append('.*')
                i += 2
        elif char == '*':
            parts.append('[^/]*')
            i += 1
        elif char == '?':
            parts.append('[^/]')
            i += 1
        else:
            # re.escape covers every regex metacharacter, including the
            # backslash, so no literal can leak through as regex syntax.
            parts.append(re.escape(char))
            i += 1
    return re.compile('^' + ''.join(parts) + '$')
457+
359458

360459
# Global instance
361460
_index_manager = JSONIndexManager()
@@ -364,4 +463,3 @@ def cleanup(self):
364463
def get_index_manager() -> JSONIndexManager:
    """Return the module-level ``_index_manager`` instance."""
    return _index_manager
367-

0 commit comments

Comments
 (0)