diff --git a/.gitignore b/.gitignore
index 367a552..9539f72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,4 +48,4 @@ COMMIT_MESSAGE.txt
RELEASE_NOTE.txt
.llm-context/
-.kiro/
\ No newline at end of file
+AGENTS.md
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..886f335
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,25 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+Code Index MCP lives in `src/code_index_mcp/`, with `indexing/` managing builders, `services/` exposing MCP tool implementations, `search/` coordinating query utilities, and `utils/` housing cross-cutting helpers. The lightweight CLI bootstrapper is `run.py`, which adds `src/` to `PYTHONPATH` before invoking `code_index_mcp.server`. Sample corpora for language regression reside under `test/sample-projects/` (for example `python/user_management/`). Reserve `tests/` for runnable suites and avoid checking in generated `__pycache__` artifacts.
+
+## Build, Test, and Development Commands
+Install dependencies with `uv sync` after cloning. Use `uv run code-index-mcp` to launch the MCP server directly, or `uv run python run.py` when you need the local `sys.path` shim. During development, `uv run code-index-mcp --help` lists the available CLI flags, and `uv run python -m code_index_mcp.server` mirrors the published entry point for debugging.
+
+## Coding Style & Naming Conventions
+Target Python 3.10+ and follow the `.pylintrc` configuration: 4-space indentation, 100-character line limit, and restrained function signatures (<= 7 parameters). Modules and functions stay `snake_case`, classes use `PascalCase`, and constants remain uppercase with underscores. Prefer explicit imports from sibling packages (`from .services import ...`) and keep logging to stderr as implemented in `server.py`.
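+
+A minimal, hypothetical sketch of these conventions (the module and class names below are illustrative, not real project files):
+
+```python
+import logging
+import sys
+
+MAX_RESULTS = 100  # constants: UPPER_SNAKE_CASE
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stderr))  # keep log output on stderr
+
+
+class ExampleSearchService:  # classes: PascalCase
+    """One-line docstring describing the service's responsibility."""
+
+    # functions and methods stay snake_case with restrained signatures
+    def find_matches(self, pattern: str, limit: int = MAX_RESULTS) -> list[str]:
+        logger.debug("searching for %s", pattern)  # lazy %-formatting keeps pylint happy
+        return []
+```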
+
+## Testing Guidelines
+Automated tests should live under `tests/`, mirroring the package hierarchy (`tests/indexing/test_shallow_index.py`, etc.). Use `uv run pytest` (with optional `-k` selectors) for unit and integration coverage, and stage representative fixtures inside `test/sample-projects/` when exercising new language strategies. Document expected behaviors in fixtures' README files or inline comments, and fail fast if tree-sitter support is not available for a language you add.
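+
+A hedged sketch of what a new test module could look like (the assertion is illustrative; adapt it to the strategy under test):
+
+```python
+# tests/indexing/test_shallow_index.py
+from pathlib import Path
+
+import pytest
+
+SAMPLE_PROJECT = Path("test/sample-projects/python/user_management")
+
+
+@pytest.mark.skipif(not SAMPLE_PROJECT.exists(), reason="sample fixture not checked out")
+def test_sample_project_contains_python_sources():
+    assert any(SAMPLE_PROJECT.rglob("*.py"))
+```
+
+Run it with `uv run pytest tests/indexing`.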
+
+## Commit & Pull Request Guidelines
+Follow the Conventional Commits style seen in history (`feat`, `fix`, `refactor(scope): summary`). Reference issue numbers when relevant and keep subjects under 72 characters. Pull requests should include: 1) a concise problem statement, 2) before/after behavior or performance notes, 3) instructions for reproducing test runs (`uv run pytest`, `uv run code-index-mcp`). Attach updated screenshots or logs when touching developer experience flows, and confirm the file watcher still transitions to "active" in manual smoke tests.
+
+## Agent Workflow Tips
+Always call `set_project_path` before invoking other tools, and prefer `search_code_advanced` with targeted `file_pattern` filters to minimize noise. When editing indexing strategies, run `refresh_index` between changes to confirm the cache rebuilds. Clean up temporary directories via `clear_settings` if you notice stale metadata, and document any new tooling you introduce in this guide. A hedged sketch of that call ordering follows.
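+
+The sketch below assumes the current `mcp` Python SDK stdio client; the tool argument keys and the project path are assumptions, so check the published tool schemas before relying on them:
+
+```python
+import asyncio
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+
+async def main() -> None:
+    params = StdioServerParameters(command="uvx", args=["code-index-mcp"])
+    async with stdio_client(params) as (read_stream, write_stream):
+        async with ClientSession(read_stream, write_stream) as session:
+            await session.initialize()
+            # 1) point the server at a project before any other tool call
+            #    (the argument key "path" is an assumption, not a documented schema)
+            await session.call_tool("set_project_path", {"path": "/path/to/project"})
+            # 2) keep searches narrow with a file_pattern filter
+            #    (argument keys here are assumptions as well)
+            result = await session.call_tool(
+                "search_code_advanced",
+                {"pattern": "FileFilter", "file_pattern": "*.py"},
+            )
+            print(result)
+
+
+asyncio.run(main())
+```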
+
+## Release Preparation Checklist
+- Update the project version everywhere it lives: `pyproject.toml`, `src/code_index_mcp/__init__.py`, and `uv.lock`.
+- Add a release note entry to `RELEASE_NOTE.txt` for the new version.
+- Commit the version bump (plus any release artifacts) and push the branch to `origin`.
+- Create a git tag for the new version and push the tag to `origin`.
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
deleted file mode 100644
index f3b2d5b..0000000
--- a/ARCHITECTURE.md
+++ /dev/null
@@ -1,233 +0,0 @@
-# Code Index MCP System Architecture
-
-## Overview
-
-Code Index MCP is a Model Context Protocol (MCP) server that provides intelligent code indexing and analysis capabilities. The system follows SCIP (Source Code Intelligence Protocol) standards and uses a service-oriented architecture with clear separation of concerns.
-
-## High-Level Architecture
-
-```
-┌─────────────────────────────────────────────────────────────────┐
-│ MCP Interface Layer │
-├─────────────────────────────────────────────────────────────────┤
-│ Service Layer │
-├─────────────────────────────────────────────────────────────────┤
-│ SCIP Core Layer │
-├─────────────────────────────────────────────────────────────────┤
-│ Language Strategies │
-├─────────────────────────────────────────────────────────────────┤
-│ Technical Tools Layer │
-└─────────────────────────────────────────────────────────────────┘
-```
-
-## Layer Responsibilities
-
-### 1. MCP Interface Layer (`server.py`)
-**Purpose**: Exposes MCP tools and handles protocol communication
-
-**Key Components**:
-- MCP tool definitions (`@mcp.tool()`)
-- Error handling and response formatting
-- User interaction and guidance
-
-**MCP Tools**:
-- `set_project_path` - Initialize project indexing
-- `find_files` - File discovery with patterns
-- `get_file_summary` - File analysis and metadata
-- `search_code_advanced` - Content search across files
-- `refresh_index` - Manual index rebuilding
-- `get_file_watcher_status` - File monitoring status
-- `configure_file_watcher` - File watcher settings
-
-### 2. Service Layer (`services/`)
-**Purpose**: Business logic orchestration and workflow management
-
-**Key Services**:
-- `ProjectManagementService` - Project lifecycle and initialization
-- `FileWatcherService` - Real-time file monitoring and auto-refresh
-- `IndexManagementService` - Index rebuild operations
-- `CodeIntelligenceService` - File analysis and symbol intelligence
-- `FileDiscoveryService` - File pattern matching and discovery
-- `SearchService` - Advanced code search capabilities
-
-**Architecture Pattern**: Service delegation with clear business boundaries
-
-### 3. SCIP Core Layer (`scip/core/`)
-**Purpose**: Language-agnostic SCIP protocol implementation
-
-**Core Components**:
-- `SCIPSymbolManager` - Standard SCIP symbol ID generation
-- `LocalReferenceResolver` - Cross-file reference resolution
-- `PositionCalculator` - AST/Tree-sitter position conversion
-- `MonikerManager` - External package dependency handling
-
-**Standards Compliance**: Full SCIP protocol buffer implementation
-
-### 4. Language Strategies (`scip/strategies/`)
-**Purpose**: Language-specific code analysis using two-phase processing
-
-**Strategy Pattern Implementation**:
-- `BaseStrategy` - Abstract interface and common functionality
-- `PythonStrategy` - Python AST analysis
-- `JavaScriptStrategy` - JavaScript/TypeScript Tree-sitter analysis
-- `JavaStrategy` - Java Tree-sitter analysis
-- `ObjectiveCStrategy` - Objective-C Tree-sitter analysis
-- `FallbackStrategy` - Generic text-based analysis
-
-**Two-Phase Analysis**:
-1. **Phase 1**: Symbol definition collection
-2. **Phase 2**: Reference resolution and SCIP document generation
-
-### 5. Technical Tools Layer (`tools/`)
-**Purpose**: Low-level technical capabilities
-
-**Tool Categories**:
-- `filesystem/` - File system operations and pattern matching
-- `scip/` - SCIP index operations and symbol analysis
-- `config/` - Configuration and settings management
-- `monitoring/` - File watching and system monitoring
-
-## Data Flow Architecture
-
-### File Analysis Workflow
-```
-User Request → Service Layer → SCIP Strategy → Core Components → SCIP Documents
-```
-
-### Index Management Workflow
-```
-File Changes → File Watcher → Index Management Service → Strategy Factory → Updated Index
-```
-
-### Search Workflow
-```
-Search Query → Search Service → Advanced Search Tools → Filtered Results
-```
-
-## SCIP Implementation Details
-
-### Symbol ID Format
-```
-scip-{language} {manager} {package} [version] {descriptors}
-```
-
-**Examples**:
-- Local: `scip-python local myproject src/main.py/MyClass#method().`
-- External: `scip-python pip requests 2.31.0 sessions/Session#get().`
-
-### Language Support Strategy
-
-**Parsing Approaches**:
-- **Python**: Native AST module
-- **JavaScript/TypeScript**: Tree-sitter
-- **Java**: Tree-sitter
-- **Objective-C**: Tree-sitter
-- **Others**: Fallback text analysis
-
-**Supported Code Intelligence**:
-- Symbol definitions (functions, classes, variables)
-- Import/export tracking
-- Cross-file reference resolution
-- External dependency management
-- Position-accurate symbol ranges
-
-## Configuration and Extensibility
-
-### Package Manager Integration
-- **Python**: pip, conda, poetry detection
-- **JavaScript**: npm, yarn package.json parsing
-- **Java**: Maven pom.xml, Gradle build files
-- **Configuration-driven**: Easy addition of new package managers
-
-### File Watcher System
-- **Real-time monitoring**: Watchdog-based file system events
-- **Debounced rebuilds**: 4-6 second batching of rapid changes
-- **Configurable patterns**: Customizable include/exclude rules
-- **Thread-safe**: ThreadPoolExecutor for concurrent rebuilds
-
-## Performance Characteristics
-
-### Indexing Performance
-- **Incremental updates**: File-level granular rebuilds
-- **Parallel processing**: Concurrent file analysis
-- **Memory efficient**: Streaming SCIP document generation
-- **Cache optimization**: Symbol table reuse across phases
-
-### Search Performance
-- **Advanced tools**: ripgrep, ugrep, ag integration
-- **Pattern optimization**: Glob-based file filtering
-- **Result streaming**: Large result set handling
-
-## Error Handling and Reliability
-
-### Fault Tolerance
-- **Graceful degradation**: Continue indexing on individual file failures
-- **Error isolation**: Per-file error boundaries
-- **Recovery mechanisms**: Automatic retry on transient failures
-- **Comprehensive logging**: Debug and audit trail support
-
-### Validation
-- **Input sanitization**: Path traversal protection
-- **Range validation**: SCIP position boundary checking
-- **Schema validation**: Protocol buffer structure verification
-
-## Future Architecture Considerations
-
-### Planned Enhancements
-1. **Function Call Relationships**: Complete call graph analysis
-2. **Type Information**: Enhanced semantic analysis
-3. **Cross-repository Navigation**: Multi-project symbol resolution
-4. **Language Server Protocol**: LSP compatibility layer
-5. **Distributed Indexing**: Horizontal scaling support
-
-### Extension Points
-- **Custom strategies**: Plugin architecture for new languages
-- **Analysis plugins**: Custom symbol analyzers
-- **Export formats**: Multiple output format support
-- **Integration APIs**: External tool connectivity
-
-## Directory Structure
-
-```
-src/code_index_mcp/
-├── server.py # MCP interface layer
-├── services/ # Business logic services
-│ ├── project_management_service.py
-│ ├── file_watcher_service.py
-│ ├── index_management_service.py
-│ ├── code_intelligence_service.py
-│ └── ...
-├── scip/ # SCIP implementation
-│ ├── core/ # Language-agnostic core
-│ │ ├── symbol_manager.py
-│ │ ├── local_reference_resolver.py
-│ │ ├── position_calculator.py
-│ │ └── moniker_manager.py
-│ ├── strategies/ # Language-specific strategies
-│ │ ├── base_strategy.py
-│ │ ├── python_strategy.py
-│ │ ├── javascript_strategy.py
-│ │ └── ...
-│ └── factory.py # Strategy selection
-├── tools/ # Technical capabilities
-│ ├── filesystem/
-│ ├── scip/
-│ ├── config/
-│ └── monitoring/
-├── indexing/ # Index management
-└── utils/ # Shared utilities
-```
-
-## Key Design Principles
-
-1. **Standards Compliance**: Full SCIP protocol adherence
-2. **Language Agnostic**: Core components independent of specific languages
-3. **Extensible**: Easy addition of new languages and features
-4. **Performance**: Efficient indexing and search operations
-5. **Reliability**: Fault-tolerant with comprehensive error handling
-6. **Maintainability**: Clear separation of concerns and modular design
-
----
-
-*Last updated: 2025-01-14*
-*Architecture version: 2.1.0*
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index c3f9006..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,162 +0,0 @@
-# Changelog
-
-All notable changes to this project will be documented in this file.
-
-## [2.1.1] - 2025-01-15
-
-### Fixed
-- **SCIP Java Strategy**: Simplified Java symbol analysis implementation
- - Refactored JavaStrategy to use streamlined symbol registration methods
- - Removed complex JavaAnalyzer and JavaRelationshipExtractor classes
- - Fixed symbol creation with basic identifier extraction
- - Removed relationships summary calculation that was causing issues
- - Added back to_scip_relationships method for compatibility
- - Streamlined Java AST processing to focus on core symbol definitions
-
-### Improved
-- **Code Maintainability**: Significantly reduced complexity in Java SCIP processing
-- **Performance**: Faster Java file analysis with simplified approach
-- **Reliability**: More stable symbol extraction without complex relationship tracking
-
-## [2.1.0] - 2025-01-13
-
-### Major SCIP Architecture Enhancement
-
-This release completes the migration to SCIP-based code indexing with significant improvements to the core infrastructure and API simplification.
-
-#### Core SCIP Infrastructure
-- **Complete SCIP core components**: Added symbol_manager, position_calculator, reference_resolver, moniker_manager
-- **Two-phase SCIP analysis**: Implemented symbol collection → reference resolution workflow
-- **Unified index management**: New index_provider and unified_index_manager for seamless index operations
-- **SCIP-compliant symbol IDs**: Standard symbol ID generation with cross-file reference support
-
-#### Enhanced Strategy System
-- **All language strategies SCIP-compliant**: Refactored Python, Java, JavaScript, Objective-C strategies
-- **External symbol extraction**: Added dependency tracking and external symbol resolution
-- **Proper SCIP classifications**: Implemented symbol roles and syntax kind detection
-- **Robust file handling**: Enhanced encoding detection and error recovery
-
-#### API Improvements
-- **Simplified find_files response**: Returns clean file path lists instead of complex metadata objects
-- **Enhanced SCIPSymbolAnalyzer**: Replaced legacy query tools with accurate symbol analysis
-- **Improved logging**: Comprehensive logging throughout SCIP indexing pipeline
-
-#### Dependency Updates
-- **pathspec integration**: Better .gitignore parsing and file filtering
-- **Updated requirements**: Added comprehensive dependency list for cross-platform support
-
-#### Technical Improvements
-- **Symbol analysis tools**: New inspection scripts for debugging and development
-- **Enhanced error handling**: Better fallback strategies and error recovery
-- **Testing improvements**: Updated sample projects for multilingual testing
-
-#### Breaking Changes
-- **find_files API**: Now returns `List[str]` instead of complex metadata dictionary
-- **Internal architecture**: Significant refactoring of internal components (no user-facing impact)
-
-## [2.0.0] - 2025-08-11
-
-### 🚀 MAJOR RELEASE - SCIP Architecture Migration
-
-This release represents a **complete architectural overhaul** of the code indexing system, migrating from language-specific analyzers to a unified SCIP-based approach.
-
-#### ✨ New Architecture
-- **Three-layer service architecture**: Service → Tool → Technical Components
-- **Unified SCIP indexing**: Replace 8 language-specific analyzers with single SCIP protobuf system
-- **Service-oriented design**: Clear separation of business logic, technical tools, and low-level operations
-- **Composable components**: Modular design enabling easier testing and maintenance
-
-#### 🔧 Technical Improvements
-- **Tree-sitter AST parsing**: Replace regex-based analysis with proper AST parsing
-- **SCIP protobuf format**: Industry-standard code intelligence format
-- **Reduced complexity**: Simplified from 40K+ lines to ~1K lines of core logic
-- **Better error handling**: Improved exception handling and validation
-- **Enhanced logging**: Better debugging and monitoring capabilities
-
-#### 📦 Backward Compatibility
-- **MCP API unchanged**: All existing MCP tools work without modification
-- **Automatic migration**: Legacy indexes automatically migrated to SCIP format
-- **Same functionality**: All user-facing features preserved and enhanced
-- **No breaking changes**: Seamless upgrade experience
-
-#### 🗑️ Removed Components
-- Language-specific analyzers (C, C++, C#, Go, Java, JavaScript, Objective-C, Python)
-- Legacy indexing models and relationship management
-- Complex duplicate detection and qualified name systems
-- Obsolete builder and scanner components
-- Demo files and temporary utilities
-
-#### 🆕 New Services
-- **ProjectManagementService**: Project lifecycle and configuration management
-- **IndexManagementService**: Index building, rebuilding, and status monitoring
-- **FileDiscoveryService**: Intelligent file discovery with pattern matching
-- **CodeIntelligenceService**: Code analysis and summary generation
-- **SystemManagementService**: File watcher and system configuration
-
-#### 🛠️ New Tool Layer
-- **SCIPIndexTool & SCIPQueryTool**: SCIP operations and querying
-- **FileMatchingTool & FileSystemTool**: File system operations
-- **ProjectConfigTool & SettingsTool**: Configuration management
-- **FileWatcherTool**: Enhanced file monitoring capabilities
-
-#### 📊 Performance Benefits
-- **Faster indexing**: Tree-sitter parsing significantly faster than regex
-- **Lower memory usage**: Streamlined data structures and processing
-- **Better accuracy**: SCIP provides more precise code intelligence
-- **Improved scalability**: Cleaner architecture supports larger codebases
-
-#### 🔄 Migration Guide
-Existing users can upgrade seamlessly:
-1. System automatically detects legacy index format
-2. Migrates to new SCIP format on first run
-3. All existing functionality preserved
-4. No manual intervention required
-
-This release establishes a solid foundation for future enhancements while dramatically simplifying the codebase and improving performance.
-
-## [1.2.1] - 2024-08-06
-
-### Fixed
-- **File Watcher**: Enhanced move event handling for modern editors (VS Code, etc.)
- - Fixed issue where files created via temp-then-move pattern weren't being detected
- - Improved event processing logic to exclusively check destination path for move events
- - Eliminated ambiguous fallback behavior that could cause inconsistent results
-
-### Improved
-- **Code Quality**: Comprehensive Pylint compliance improvements
- - Fixed all f-string logging warnings using lazy % formatting
- - Added proper docstrings to fallback classes
- - Fixed multiple-statements warnings
- - Moved imports to top-level following PEP 8 conventions
- - Added appropriate pylint disables for stub methods
-
-### Technical Details
-- Unified path checking logic across all event types
-- Reduced code complexity in `should_process_event()` method
-- Better error handling with consistent exception management
-- Enhanced debugging capabilities with improved logging
-
-## [1.2.0] - Previous Release
-
-### Added
-- Enhanced find_files functionality with filename search
-- Performance improvements to file discovery
-- Auto-refresh troubleshooting documentation
-
-## [1.1.1] - Previous Release
-
-### Fixed
-- Various bug fixes and stability improvements
-
-## [1.1.0] - Previous Release
-
-### Added
-- Initial file watcher functionality
-- Cross-platform file system monitoring
-
-## [1.0.0] - Initial Release
-
-### Added
-- Core MCP server implementation
-- Code indexing and analysis capabilities
-- Multi-language support
\ No newline at end of file
diff --git a/README.md b/README.md
index f51ea87..5cabcbe 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ The easiest way to get started with any MCP-compatible application:
- **Direct Tree-sitter Integration**: No regex fallbacks for specialized languages - fail fast with clear errors
- **Advanced Search**: Auto-detects and uses the best available tool (ugrep, ripgrep, ag, or grep)
- **Universal File Support**: Comprehensive coverage from advanced AST parsing to basic file indexing
-- **File Analysis**: Deep insights into structure, imports, classes, methods, and complexity metrics
+- **File Analysis**: Deep insights into structure, imports, classes, methods, and complexity metrics after running `build_deep_index`
### 🗂️ **Multi-Language Support**
- **7 Languages with Tree-sitter AST Parsing**: Python, JavaScript, TypeScript, Java, Go, Objective-C, Zig
@@ -81,7 +81,7 @@ The easiest way to get started with any MCP-compatible application:
- **File Watcher**: Automatic index updates when files change
- **Cross-platform**: Native OS file system monitoring
- **Smart Processing**: Batches rapid changes to prevent excessive rebuilds
-- **Rich Metadata**: Captures symbols, references, definitions, and relationships
+- **Shallow Index Refresh**: Watches file changes and keeps the file list current; run a deep rebuild when you need symbol metadata
### ⚡ **Performance & Efficiency**
- **Tree-sitter AST Parsing**: Native syntax parsing for accurate symbol extraction
@@ -218,15 +218,18 @@ Then configure:
| Tool | Description |
|------|-------------|
| **`set_project_path`** | Initialize indexing for a project directory |
-| **`refresh_index`** | Rebuild the project index after file changes |
+| **`refresh_index`** | Rebuild the shallow file index after file changes |
+| **`build_deep_index`** | Generate the full symbol index used by deep analysis |
| **`get_settings_info`** | View current project configuration and status |
+*Run `build_deep_index` when you need symbol-level data; the default shallow index powers quick file discovery.*
+
### 🔍 **Search & Discovery**
| Tool | Description |
|------|-------------|
| **`search_code_advanced`** | Smart search with regex, fuzzy matching, and file filtering |
| **`find_files`** | Locate files using glob patterns (e.g., `**/*.py`) |
-| **`get_file_summary`** | Analyze file structure, functions, imports, and complexity |
+| **`get_file_summary`** | Analyze file structure, functions, imports, and complexity (requires deep index) |
### 🔄 **Monitoring & Auto-refresh**
| Tool | Description |
@@ -263,6 +266,7 @@ Find all TypeScript component files in src/components
Give me a summary of src/api/userService.ts
```
*Uses: `get_file_summary` to show functions, imports, and complexity*
+*Tip: run `build_deep_index` first if you get a `needs_deep_index` response.*
### 🔍 **Advanced Search Examples**
diff --git a/README_ja.md b/README_ja.md
index 76c419a..79059b1 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -66,7 +66,7 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす
- **直接Tree-sitter統合**:特化言語で正規表現フォールバックなし - 明確なエラーメッセージで高速フェイル
- **高度な検索**:最適なツール(ugrep、ripgrep、ag、grep)を自動検出・使用
- **汎用ファイルサポート**:高度なAST解析から基本ファイルインデックスまでの包括的カバレッジ
-- **ファイル解析**:構造、インポート、クラス、メソッド、複雑度メトリクスへの深い洞察
+- **ファイル解析**:`build_deep_index` 実行後に構造、インポート、クラス、メソッド、複雑度メトリクスを深く把握
### 🗂️ **多言語サポート**
- **7言語でTree-sitter AST解析**:Python、JavaScript、TypeScript、Java、Go、Objective-C、Zig
@@ -81,7 +81,7 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす
- **ファイルウォッチャー**:ファイル変更時の自動インデックス更新
- **クロスプラットフォーム**:ネイティブOSファイルシステム監視
- **スマート処理**:急速な変更をバッチ処理して過度な再構築を防止
-- **豊富なメタデータ**:シンボル、参照、定義、関連性をキャプチャ
+- **浅いインデックス更新**:ファイル変更を監視して最新のファイル一覧を維持し、シンボルが必要な場合は `build_deep_index` を実行
### ⚡ **パフォーマンス・効率性**
- **Tree-sitter AST解析**:正確なシンボル抽出のためのネイティブ構文解析
@@ -240,15 +240,18 @@ pip install code-index-mcp
| ツール | 説明 |
|--------|------|
| **`set_project_path`** | プロジェクトディレクトリのインデックス作成を初期化 |
-| **`refresh_index`** | ファイル変更後にプロジェクトインデックスを再構築 |
+| **`refresh_index`** | ファイル変更後に浅いファイルインデックスを再構築 |
+| **`build_deep_index`** | 深い解析で使う完全なシンボルインデックスを生成 |
| **`get_settings_info`** | 現在のプロジェクト設定と状態を表示 |
+*シンボルレベルのデータが必要な場合は `build_deep_index` を実行してください。デフォルトの浅いインデックスは高速なファイル探索を担います。*
+
### 🔍 **検索・発見**
| ツール | 説明 |
|--------|------|
| **`search_code_advanced`** | 正規表現、ファジーマッチング、ファイルフィルタリング対応のスマート検索 |
| **`find_files`** | globパターンを使用したファイル検索(例:`**/*.py`) |
-| **`get_file_summary`** | ファイル構造、関数、インポート、複雑度の解析 |
+| **`get_file_summary`** | ファイル構造、関数、インポート、複雑度の解析(深いインデックスが必要) |
### 🔄 **監視・自動更新**
| ツール | 説明 |
@@ -285,6 +288,7 @@ src/components で全てのTypeScriptコンポーネントファイルを見つ
src/api/userService.ts の要約を教えてください
```
*使用ツール:`get_file_summary` で関数、インポート、複雑度を表示*
+*ヒント:`needs_deep_index` が返った場合は `build_deep_index` を先に実行してください。*
### 🔍 **高度な検索例**
diff --git a/README_ko.md b/README_ko.md
new file mode 100644
index 0000000..6995b6a
--- /dev/null
+++ b/README_ko.md
@@ -0,0 +1,284 @@
+# 코드 인덱스 MCP
+
+
+
+[](https://modelcontextprotocol.io)
+[](https://www.python.org/)
+[](LICENSE)
+
+**대규모 언어 모델을 위한 지능형 코드 인덱싱과 분석**
+
+고급 검색, 정밀 분석, 유연한 탐색 기능으로 AI가 코드베이스를 이해하고 활용하는 방식을 혁신하세요.
+
+
+
+
+
+
+
+## 개요
+
+Code Index MCP는 [Model Context Protocol](https://modelcontextprotocol.io) 기반 MCP 서버로, AI 어시스턴트와 복잡한 코드베이스 사이를 연결합니다. 빠른 인덱싱, 강력한 검색, 정밀한 코드 분석을 제공하여 AI가 프로젝트 구조를 정확히 파악하고 효과적으로 지원하도록 돕습니다.
+
+**이럴 때 안성맞춤:** 코드 리뷰, 리팩터링, 문서화, 디버깅 지원, 아키텍처 분석
+
+## 빠른 시작
+
+### 🚀 **권장 설정 (대부분의 사용자)**
+
+어떤 MCP 호환 애플리케이션에서도 몇 단계만으로 시작할 수 있습니다.
+
+**사전 준비:** Python 3.10+ 및 [uv](https://github.com/astral-sh/uv)
+
+1. **MCP 설정에 서버 추가** (예: `claude_desktop_config.json` 또는 `~/.claude.json`)
+ ```json
+ {
+ "mcpServers": {
+ "code-index": {
+ "command": "uvx",
+ "args": ["code-index-mcp"]
+ }
+ }
+ }
+ ```
+
+2. **애플리케이션 재시작** – `uvx`가 설치와 실행을 자동으로 처리합니다.
+
+3. **사용 시작** (AI 어시스턴트에게 아래 프롬프트를 전달)
+ ```
+ 프로젝트 경로를 /Users/dev/my-react-app 으로 설정해줘
+ 이 프로젝트에서 모든 TypeScript 파일을 찾아줘
+ "authentication" 관련 함수를 검색해줘
+ src/App.tsx 파일을 분석해줘
+ ```
+
+## 대표 사용 사례
+
+**코드 리뷰:** "예전 API를 사용하는 부분을 모두 찾아줘"
+**리팩터링 지원:** "이 함수는 어디에서 호출되나요?"
+**프로젝트 학습:** "이 React 프로젝트의 핵심 컴포넌트를 보여줘"
+**디버깅:** "에러 처리 로직이 있는 파일을 찾아줘"
+
+## 주요 기능
+
+### 🧠 **지능형 검색과 분석**
+- **듀얼 전략 아키텍처:** 7개 핵심 언어는 전용 tree-sitter 파서를 사용하고, 그 외 50+ 파일 형식은 폴백 전략으로 처리
+- **직접 Tree-sitter 통합:** 특화 언어에 정규식 폴백 없음 – 문제 시 즉시 실패하고 명확한 오류 메시지 제공
+- **고급 검색:** ugrep, ripgrep, ag, grep 중 최적의 도구를 자동 선택해 활용
+- **범용 파일 지원:** 정교한 AST 분석부터 기본 파일 인덱싱까지 폭넓게 커버
+- **파일 분석:** `build_deep_index` 실행 후 구조, 임포트, 클래스, 메서드, 복잡도 지표를 심층적으로 파악
+
+### 🗂️ **다중 언어 지원**
+- **Tree-sitter AST 분석(7종):** Python, JavaScript, TypeScript, Java, Go, Objective-C, Zig
+- **폴백 전략(50+ 형식):** C/C++, Rust, Ruby, PHP 등 대부분의 프로그래밍 언어 지원
+- **문서 및 설정 파일:** Markdown, JSON, YAML, XML 등 상황에 맞는 처리
+- **웹 프론트엔드:** Vue, React, Svelte, HTML, CSS, SCSS
+- **데이터 계층:** SQL, NoSQL, 스토어드 프로시저, 마이그레이션 스크립트
+- **구성 파일:** JSON, YAML, XML, Markdown
+- **[지원 파일 전체 목록 보기](#지원-파일-형식)**
+
+### 🔄 **실시간 모니터링 & 자동 새로고침**
+- **파일 워처:** 파일 변경 시 자동으로 얕은 인덱스(파일 목록) 갱신
+- **크로스 플랫폼:** 운영체제 기본 파일시스템 이벤트 활용
+- **스마트 처리:** 빠른 변경을 묶어 과도한 재빌드를 방지
+- **얕은 인덱스 갱신:** 파일 목록을 최신 상태로 유지하며, 심볼 데이터가 필요하면 `build_deep_index`를 실행
+
+### ⚡ **성능 & 효율성**
+- **Tree-sitter AST 파싱:** 정확한 심볼 추출을 위한 네이티브 구문 분석
+- **지속 캐싱:** 인덱스를 저장해 이후 응답 속도를 극대화
+- **스마트 필터링:** 빌드 디렉터리·임시 파일을 자동 제외
+- **메모리 효율:** 대규모 코드베이스를 염두에 둔 설계
+- **직접 의존성:** 불필요한 폴백 없이 명확한 오류 메시지 제공
+
+## 지원 파일 형식
+
+
+💻 프로그래밍 언어 (클릭하여 확장)
+
+**전용 Tree-sitter 전략 언어:**
+- **Python** (`.py`, `.pyw`) – 클래스/메서드 추출 및 호출 추적이 포함된 완전 AST 분석
+- **JavaScript** (`.js`, `.jsx`, `.mjs`, `.cjs`) – ES6+ 클래스와 함수를 tree-sitter로 파싱
+- **TypeScript** (`.ts`, `.tsx`) – 인터페이스를 포함한 타입 인지 심볼 추출
+- **Java** (`.java`) – 클래스 계층, 메서드 시그니처, 호출 관계 분석
+- **Go** (`.go`) – 구조체 메서드, 리시버 타입, 함수 분석
+- **Objective-C** (`.m`, `.mm`) – 클래스/인스턴스 메서드를 +/- 표기로 구분
+- **Zig** (`.zig`, `.zon`) – 함수와 구조체를 tree-sitter AST로 분석
+
+**기타 모든 프로그래밍 언어:**
+나머지 언어는 **폴백 파싱 전략**으로 기본 메타데이터와 파일 인덱싱을 제공합니다. 예:
+- **시스템/저수준:** C/C++ (`.c`, `.cpp`, `.h`, `.hpp`), Rust (`.rs`)
+- **객체지향:** C# (`.cs`), Kotlin (`.kt`), Scala (`.scala`), Swift (`.swift`)
+- **스크립트:** Ruby (`.rb`), PHP (`.php`), Shell (`.sh`, `.bash`)
+- **그 외 40+ 형식** – 폴백 전략으로 빠른 탐색 가능
+
+
+
+
+🌐 웹 프론트엔드 & UI
+
+- 프레임워크: Vue (`.vue`), Svelte (`.svelte`), Astro (`.astro`)
+- 스타일링: CSS (`.css`, `.scss`, `.less`, `.sass`, `.stylus`, `.styl`), HTML (`.html`)
+- 템플릿: Handlebars (`.hbs`, `.handlebars`), EJS (`.ejs`), Pug (`.pug`)
+
+
+
+
+🗄️ 데이터 계층 & SQL
+
+- **SQL 변형:** 표준 SQL (`.sql`, `.ddl`, `.dml`), 데이터베이스별 방언 (`.mysql`, `.postgresql`, `.psql`, `.sqlite`, `.mssql`, `.oracle`, `.ora`, `.db2`)
+- **DB 객체:** 프로시저/함수 (`.proc`, `.procedure`, `.func`, `.function`), 뷰/트리거/인덱스 (`.view`, `.trigger`, `.index`)
+- **마이그레이션 도구:** 마이그레이션 파일 (`.migration`, `.seed`, `.fixture`, `.schema`), 도구 구성 (`.liquibase`, `.flyway`)
+- **NoSQL & 그래프:** 질의 언어 (`.cql`, `.cypher`, `.sparql`, `.gql`)
+
+
+
+
+📄 문서 & 설정 파일
+
+- Markdown (`.md`, `.mdx`)
+- 구성 파일 (`.json`, `.xml`, `.yml`, `.yaml`)
+
+
+
+## 사용 가능한 도구
+
+### 🏗️ **프로젝트 관리**
+| 도구 | 설명 |
+|------|------|
+| **`set_project_path`** | 프로젝트 디렉터리의 인덱스를 초기화 |
+| **`refresh_index`** | 파일 변경 후 얕은 파일 인덱스를 재생성 |
+| **`build_deep_index`** | 심층 분석에 사용하는 전체 심볼 인덱스를 생성 |
+| **`get_settings_info`** | 현재 프로젝트 설정과 상태를 확인 |
+
+*심볼 레벨 데이터가 필요하면 `build_deep_index`를 실행하세요. 기본 얕은 인덱스는 빠른 파일 탐색을 담당합니다.*
+
+### 🔍 **검색 & 탐색**
+| 도구 | 설명 |
+|------|------|
+| **`search_code_advanced`** | 정규식, 퍼지 매칭, 파일 필터링을 지원하는 스마트 검색 |
+| **`find_files`** | 글롭 패턴으로 파일 찾기 (예: `**/*.py`) |
+| **`get_file_summary`** | 파일 구조, 함수, 임포트, 복잡도를 분석 (심층 인덱스 필요) |
+
+### 🔄 **모니터링 & 자동 새로고침**
+| 도구 | 설명 |
+|------|------|
+| **`get_file_watcher_status`** | 파일 워처 상태와 구성을 확인 |
+| **`configure_file_watcher`** | 자동 새로고침 설정 (활성/비활성, 지연 시간, 추가 제외 패턴) |
+
+### 🛠️ **시스템 & 유지 관리**
+| 도구 | 설명 |
+|------|------|
+| **`create_temp_directory`** | 인덱스 저장용 임시 디렉터리를 생성 |
+| **`check_temp_directory`** | 인덱스 저장 위치와 권한을 확인 |
+| **`clear_settings`** | 모든 설정과 캐시 데이터를 초기화 |
+| **`refresh_search_tools`** | 사용 가능한 검색 도구를 재검색 (ugrep, ripgrep 등) |
+
+## 사용 예시
+
+### 🧭 **빠른 시작 워크플로**
+
+**1. 프로젝트 초기화**
+```
+프로젝트 경로를 /Users/dev/my-react-app 으로 설정해줘
+```
+*프로젝트를 설정하고 얕은 인덱스를 생성합니다.*
+
+**2. 프로젝트 구조 탐색**
+```
+src/components 안의 TypeScript 컴포넌트 파일을 모두 찾아줘
+```
+*사용 도구: `find_files` (`src/components/**/*.tsx`)*
+
+**3. 핵심 파일 분석**
+```
+src/api/userService.ts 요약을 알려줘
+```
+*사용 도구: `get_file_summary` (함수, 임포트, 복잡도 표시)*
+*팁: `needs_deep_index` 응답이 나오면 먼저 `build_deep_index`를 실행하세요.*
+
+### 🔍 **고급 검색 예시**
+
+
+코드 패턴 검색
+
+```
+"get.*Data"에 해당하는 함수 호출을 정규식으로 찾아줘
+```
+*예: `getData()`, `getUserData()`, `getFormData()`*
+
+
+
+
+퍼지 함수 검색
+
+```
+'authUser'와 유사한 인증 관련 함수를 찾아줘
+```
+*예: `authenticateUser`, `authUserToken`, `userAuthCheck`*
+
+
+
+
+언어별 검색
+
+```
+Python 파일에서만 "API_ENDPOINT" 를 찾아줘
+```
+*`search_code_advanced` + `file_pattern="*.py"`*
+
+
+
+
+자동 새로고침 설정
+
+```
+파일 변경 시 자동으로 인덱스를 새로고침하도록 설정해줘
+```
+*`configure_file_watcher`로 활성화 및 지연 시간 설정*
+
+
+
+
+프로젝트 유지 관리
+
+```
+새 컴포넌트를 추가했어. 프로젝트 인덱스를 다시 빌드해줘
+```
+*`refresh_index`로 빠르게 얕은 인덱스를 업데이트*
+
+
+
+## 문제 해결
+
+### 🔄 **자동 새로고침이 동작하지 않을 때**
+- 환경 문제로 `watchdog`가 빠졌다면 설치: `pip install watchdog`
+- 수동 새로고침: 변경 후 `refresh_index` 도구 실행
+- 워처 상태 확인: `get_file_watcher_status` 도구로 활성 여부 점검
+
+## 개발 & 기여
+
+### 🛠️ **소스에서 실행하기**
+```bash
+git clone https://github.com/johnhuang316/code-index-mcp.git
+cd code-index-mcp
+uv sync
+uv run code-index-mcp
+```
+
+### 🧪 **디버깅 도구**
+```bash
+npx @modelcontextprotocol/inspector uvx code-index-mcp
+```
+
+### 🤝 **기여 안내**
+Pull Request를 언제든 환영합니다. 변경 사항과 테스트 방법을 함께 공유해주세요.
+
+---
+
+### 📄 **라이선스**
+[MIT License](LICENSE)
+
+### 🌍 **번역본**
+- [English](README.md)
+- [繁體中文](README_zh.md)
+- [日本語](README_ja.md)
diff --git a/README_zh.md b/README_zh.md
index 5a61fbb..1e9c5ae 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -66,7 +66,7 @@
- **直接 Tree-sitter 整合**:專業化語言無正則表達式備用 - 快速失敗並提供清晰錯誤訊息
- **進階搜尋**:自動偵測並使用最佳工具(ugrep、ripgrep、ag 或 grep)
- **通用檔案支援**:從進階 AST 解析到基本檔案索引的全面覆蓋
-- **檔案分析**:深入了解結構、匯入、類別、方法和複雜度指標
+- **檔案分析**:執行 `build_deep_index` 後深入了解結構、匯入、類別、方法和複雜度指標
### 🗂️ **多語言支援**
- **7 種語言使用 Tree-sitter AST 解析**:Python、JavaScript、TypeScript、Java、Go、Objective-C、Zig
@@ -81,7 +81,7 @@
- **檔案監控器**:檔案變更時自動更新索引
- **跨平台**:原生作業系統檔案系統監控
- **智慧處理**:批次處理快速變更以防止過度重建
-- **豐富元資料**:捕獲符號、引用、定義和關聯性
+- **淺層索引更新**:監控檔案變更並維持檔案清單最新;需要符號資料時請執行 `build_deep_index`
### ⚡ **效能與效率**
- **Tree-sitter AST 解析**:原生語法解析以實現準確的符號提取
@@ -240,15 +240,18 @@ pip install code-index-mcp
| 工具 | 描述 |
|------|------|
| **`set_project_path`** | 為專案目錄初始化索引 |
-| **`refresh_index`** | 在檔案變更後重建專案索引 |
+| **`refresh_index`** | 在檔案變更後重建淺層檔案索引 |
+| **`build_deep_index`** | 產生供深度分析使用的完整符號索引 |
| **`get_settings_info`** | 檢視目前專案配置和狀態 |
+*需要符號層級資料時,請執行 `build_deep_index`;預設的淺層索引提供快速檔案探索。*
+
### 🔍 **搜尋與探索**
| 工具 | 描述 |
|------|------|
| **`search_code_advanced`** | 智慧搜尋,支援正規表達式、模糊匹配和檔案篩選 |
| **`find_files`** | 使用萬用字元模式尋找檔案(例如 `**/*.py`) |
-| **`get_file_summary`** | 分析檔案結構、函式、匯入和複雜度 |
+| **`get_file_summary`** | 分析檔案結構、函式、匯入和複雜度(需要深度索引) |
### 🔄 **監控與自動刷新**
| 工具 | 描述 |
@@ -285,6 +288,7 @@ pip install code-index-mcp
給我 src/api/userService.ts 的摘要
```
*使用:`get_file_summary` 顯示函式、匯入和複雜度*
+*提示:若收到 `needs_deep_index` 回應,請先執行 `build_deep_index`。*
### 🔍 **進階搜尋範例**
diff --git a/RELEASE_NOTE.txt b/RELEASE_NOTE.txt
new file mode 100644
index 0000000..8a744bb
--- /dev/null
+++ b/RELEASE_NOTE.txt
@@ -0,0 +1,7 @@
+## 2.4.1 - Search Filtering Alignment
+
+### Highlights
+- Code search now shares the central FileFilter blacklist, keeping results consistent with indexing (no more `node_modules` noise).
+- CLI search strategies emit the appropriate exclusion flags automatically (ripgrep, ugrep, ag, grep).
+- Basic fallback search prunes excluded directories during traversal, avoiding unnecessary IO.
+- Added regression coverage for the new filtering behaviour (`tests/search/test_search_filters.py`).
diff --git a/SCIP_OFFICIAL_STANDARDS.md b/SCIP_OFFICIAL_STANDARDS.md
deleted file mode 100644
index 763b56c..0000000
--- a/SCIP_OFFICIAL_STANDARDS.md
+++ /dev/null
@@ -1,337 +0,0 @@
-# SCIP (Source Code Intelligence Protocol) Official Standards
-
-*This document contains only the official SCIP standards as defined by Sourcegraph, without any project-specific implementations.*
-
-## Overview
-
-SCIP (pronounced "skip") is a language-agnostic protocol for indexing source code to power code navigation functionality such as Go to definition, Find references, and Find implementations. It is a recursive acronym that stands for "SCIP Code Intelligence Protocol."
-
-**Official Repository**: https://github.com/sourcegraph/scip
-
-## Core Design Principles (Official)
-
-### Primary Goals
-1. **Support code navigation at IDE-level fidelity** - Provide excellent code navigation experience
-2. **Make indexer creation easy** by:
- - Enabling cross-repository navigation
- - Supporting file-level incremental indexing
- - Facilitating parallel indexing
- - Supporting multi-language indexer development
-
-### Design Philosophy
-> "SCIP is meant to be a transmission format for sending data from some producers to some consumers -- it is not meant as a storage format for querying."
-
-### Technical Design Decisions
-1. **Protobuf Schema**
- - Relatively compact binary format
- - Supports easy code generation
- - Enables streaming reads/writes
- - Maintains forward/backward compatibility
-
-2. **String-based Identifiers**
- - Prefer human-readable string IDs for symbols
- - Avoid integer ID mapping tables
- - Improve debuggability
- - Limit potential bug impact
-
-3. **Data Encoding Approach**
- - Avoid direct graph encoding
- - Use document and array-based approaches
- - Enable streaming capabilities
- - Minimize memory consumption during indexing
-
-### Non-Goals
-- Not focused on code modification tools
-- Not optimizing for consumer-side tooling
-- Not prioritizing uncompressed data compactness
-- Not serving as a standalone query engine
-
-## Protocol Buffer Schema (Official)
-
-### Main Message Types
-
-```protobuf
-syntax = "proto3";
-package scip;
-
-message Index {
- Metadata metadata = 1;
- repeated Document documents = 2;
- repeated SymbolInformation external_symbols = 3;
-}
-
-message Metadata {
- ProtocolVersion version = 1;
- ToolInfo tool_info = 2;
- string project_root = 3;
- TextEncoding text_encoding = 4;
-}
-
-message Document {
- string language = 4;
- string relative_path = 1;
- repeated Occurrence occurrences = 2;
- repeated SymbolInformation symbols = 3;
- string text = 5;
-}
-
-message Symbol {
- string scheme = 1;
- Package package = 2;
- repeated Descriptor descriptors = 3;
-}
-
-message SymbolInformation {
- string symbol = 1;
- repeated string documentation = 3;
- repeated Relationship relationships = 4;
- SymbolKind kind = 5;
- string display_name = 6;
- Signature signature_documentation = 7;
- repeated string enclosing_symbol = 8;
-}
-
-message Occurrence {
- Range range = 1;
- string symbol = 2;
- int32 symbol_roles = 3;
- repeated Diagnostic override_documentation = 4;
- SyntaxKind syntax_kind = 5;
-}
-
-message Range {
- repeated int32 start = 1; // [line, column]
- repeated int32 end = 2; // [line, column]
-}
-```
-
-## Official Symbol Format Specification
-
-### Symbol Grammar (Official)
-```
-<symbol> ::= <scheme> ' ' <package> ' ' (<descriptor>)+ | 'local ' <local-id>
-<package> ::= <manager> ' ' <package-name> ' ' <version>
-<scheme> ::= UTF-8 string (escape spaces with double space)
-<descriptor> ::= <namespace> | <type> | <term> | <method> | <type-parameter> | <parameter> | <meta> | <macro>
-```
-
-### Symbol Components
-
-**Scheme**: Identifies the symbol's origin/context
-- UTF-8 string
-- Escape spaces with double space
-
-**Package**: Includes manager, name, and version
-- Manager: Package manager identifier
-- Package name: Unique package identifier
-- Version: Package version
-
-**Descriptors**: Represent nested/hierarchical symbol structure
-- Form a fully qualified name
-- Support various symbol types
-
-**Local Symbols**: Only for entities within a single Document
-- Format: `local <local-id>`
-- Used for file-scoped symbols
-
-### Encoding Rules (Official)
-- Descriptors form a fully qualified name
-- Local symbols are only for entities within a single Document
-- Symbols must uniquely identify an entity across a package
-- Supports escaping special characters in identifiers
-
-## Enumerations (Official)
-
-### ProtocolVersion
-```protobuf
-enum ProtocolVersion {
- UnspecifiedProtocolVersion = 0;
-}
-```
-
-### TextEncoding
-```protobuf
-enum TextEncoding {
- UnspecifiedTextEncoding = 0;
- UTF8 = 1;
- UTF16 = 2;
-}
-```
-
-### SymbolRole
-```protobuf
-enum SymbolRole {
- UnspecifiedSymbolRole = 0;
- Definition = 1;
- Import = 2;
- WriteAccess = 4;
- ReadAccess = 8;
- Generated = 16;
- Test = 32;
-}
-```
-
-### SymbolKind
-```protobuf
-enum SymbolKind {
- UnspecifiedSymbolKind = 0;
- Array = 1;
- Boolean = 2;
- Class = 3;
- Constant = 4;
- Constructor = 5;
- Enum = 6;
- EnumMember = 7;
- Event = 8;
- Field = 9;
- File = 10;
- Function = 11;
- Interface = 12;
- Key = 13;
- Method = 14;
- Module = 15;
- Namespace = 16;
- Null = 17;
- Number = 18;
- Object = 19;
- Operator = 20;
- Package = 21;
- Property = 22;
- String = 23;
- Struct = 24;
- TypeParameter = 25;
- Variable = 26;
- Macro = 27;
-}
-```
-
-### SyntaxKind
-```protobuf
-enum SyntaxKind {
- UnspecifiedSyntaxKind = 0;
- Comment = 1;
- PunctuationDelimiter = 2;
- PunctuationBracket = 3;
- Keyword = 4;
- // ... (additional syntax kinds)
- IdentifierKeyword = 13;
- IdentifierOperator = 14;
- IdentifierBuiltin = 15;
- IdentifierNull = 16;
- IdentifierConstant = 17;
- IdentifierMutableGlobal = 18;
- IdentifierParameter = 19;
- IdentifierLocal = 20;
- IdentifierShadowed = 21;
- IdentifierNamespace = 22;
- IdentifierFunction = 23;
- IdentifierFunctionDefinition = 24;
- IdentifierMacro = 25;
- IdentifierMacroDefinition = 26;
- IdentifierType = 27;
- IdentifierBuiltinType = 28;
- IdentifierAttribute = 29;
-}
-```
-
-## Official Position and Range Specification
-
-### Coordinate System
-- **Line numbers**: 0-indexed
-- **Column numbers**: 0-indexed character positions
-- **UTF-8/UTF-16 aware**: Proper Unicode handling
-
-### Range Format
-```protobuf
-message Range {
- repeated int32 start = 1; // [line, column]
- repeated int32 end = 2; // [line, column]
-}
-```
-
-### Requirements
-- Start position must be <= end position
-- Ranges must be within document boundaries
-- Character-level precision required
-
-## Official Language Support
-
-### Currently Supported (Official Implementations)
-- **TypeScript/JavaScript**: scip-typescript
-- **Java**: scip-java (also supports Scala, Kotlin)
-- **Python**: In development
-
-### Language Bindings Available
-- **Rich bindings**: Go, Rust
-- **Auto-generated bindings**: TypeScript, Haskell
-- **CLI tools**: scip CLI for index manipulation
-
-## Performance Characteristics (Official Claims)
-
-### Compared to LSIF
-- **10x speedup** in CI environments
-- **4x smaller** compressed payload size
-- **Better streaming**: Enables processing without loading entire index
-- **Lower memory usage**: Document-based processing
-
-### Design Benefits
-- Static typing from Protobuf schema
-- More ergonomic debugging
-- Reduced runtime errors
-- Smaller index files
-
-## Official Tools and Ecosystem
-
-### SCIP CLI
-- Index manipulation and conversion
-- LSIF compatibility support
-- Debugging and inspection tools
-
-### Official Indexers
-- **scip-typescript**: `npm install -g @sourcegraph/scip-typescript`
-- **scip-java**: Available as Docker image, Java launcher, fat jar
-
-### Integration Support
-- GitLab Code Intelligence (via LSIF conversion)
-- Sourcegraph native support
-- VS Code extensions (community)
-
-## Standards Compliance Requirements
-
-### For SCIP Index Producers
-1. Must generate valid Protocol Buffer format
-2. Must follow symbol ID format specification
-3. Must provide accurate position information
-4. Should support streaming output
-5. Must handle UTF-8/UTF-16 encoding correctly
-
-### For SCIP Index Consumers
-1. Must handle streaming input
-2. Should support all standard symbol kinds
-3. Must respect symbol role classifications
-4. Should provide graceful error handling
-5. Must support position range validation
-
-## Official Documentation Sources
-
-### Primary Sources
-- **Main Repository**: https://github.com/sourcegraph/scip
-- **Protocol Schema**: https://github.com/sourcegraph/scip/blob/main/scip.proto
-- **Design Document**: https://github.com/sourcegraph/scip/blob/main/DESIGN.md
-- **Announcement Blog**: https://sourcegraph.com/blog/announcing-scip
-
-### Language-Specific Documentation
-- **Java**: https://github.com/sourcegraph/scip-java
-- **TypeScript**: https://github.com/sourcegraph/scip-typescript
-
-### Community Resources
-- **Bindings**: Available for Go, Rust, TypeScript, Haskell
-- **Examples**: Implementation examples in official repositories
-- **Issues**: Bug reports and feature requests on GitHub
-
----
-
-*This document contains only official SCIP standards as defined by Sourcegraph.*
-*Last updated: 2025-01-14*
-*SCIP Version: Compatible with official v0.3.x specification*
-*Source: Official Sourcegraph SCIP repositories and documentation*
\ No newline at end of file
diff --git a/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md b/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md
deleted file mode 100644
index 25d4e8c..0000000
--- a/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md
+++ /dev/null
@@ -1,372 +0,0 @@
-# SCIPSymbolAnalyzer Refactoring Plan
-
-## 🎯 Overview
-
-This document outlines a comprehensive refactoring plan for the `SCIPSymbolAnalyzer` class to transform it from a monolithic architecture into a modular, extensible, and maintainable system that supports multiple programming languages with proper separation of concerns.
-
-## 🔍 Current Architecture Problems
-
-### 1. **Monolithic Design Issues**
-- All language-specific logic is mixed within a single class
-- The `_extract_imports` method contains Python, Objective-C, and Zig-specific logic
-- Lack of extensibility - adding new languages requires modifying the core class
-- Violation of Single Responsibility Principle
-
-### 2. **Dependency Processing Chaos**
-- Methods like `_classify_zig_import`, `_categorize_import` are scattered throughout the codebase
-- No unified dependency classification standard
-- Language-specific standard library lists are hardcoded
-- Inconsistent dependency type mapping
-
-### 3. **Symbol Resolution Complexity**
-- Position detection logic is complex and error-prone
-- Three-layer position detection strategy is difficult to maintain
-- Symbol ID parsing logic lacks flexibility
-- Mixed concerns between symbol extraction and position calculation
-
-### 4. **Poor Language Support Scalability**
-- Each new language requires core class modifications
-- No clear plugin architecture
-- Language-specific logic embedded in generic methods
-- Difficult to test language-specific features in isolation
-
-## 🏗️ Proposed Refactoring Architecture
-
-### Phase 1: Language Plugin System
-
-```python
-# New architecture design
-class LanguageAnalyzer(ABC):
- """Language-specific analyzer interface"""
-
- @abstractmethod
- def extract_imports(self, document, imports: ImportGroup) -> None:
- """Extract import information from SCIP document"""
-
- @abstractmethod
- def classify_dependency(self, module_name: str) -> str:
- """Classify dependency as standard_library, third_party, or local"""
-
- @abstractmethod
- def extract_symbol_metadata(self, symbol_info) -> Dict[str, Any]:
- """Extract language-specific symbol metadata"""
-
- @abstractmethod
- def get_standard_library_modules(self) -> Set[str]:
- """Return set of standard library module names"""
-
-class ZigAnalyzer(LanguageAnalyzer):
- """Zig language-specific analyzer"""
-
-class PythonAnalyzer(LanguageAnalyzer):
- """Python language-specific analyzer"""
-
-class ObjectiveCAnalyzer(LanguageAnalyzer):
- """Objective-C language-specific analyzer"""
-
-class LanguageAnalyzerFactory:
- """Factory for creating language-specific analyzers"""
-
- def get_analyzer(self, language: str) -> LanguageAnalyzer:
- """Get appropriate analyzer for language"""
-```
-
-### Phase 2: Dependency Management System
-
-```python
-class DependencyClassifier:
- """Unified dependency classification system"""
-
- def __init__(self):
- self.language_configs = {
- 'python': PythonDependencyConfig(),
- 'zig': ZigDependencyConfig(),
- 'javascript': JavaScriptDependencyConfig()
- }
-
- def classify_import(self, import_path: str, language: str) -> str:
- """Classify import based on language-specific rules"""
-
-class DependencyConfig(ABC):
- """Language-specific dependency configuration"""
-
- @abstractmethod
- def get_stdlib_modules(self) -> Set[str]:
- """Return standard library modules for this language"""
-
- @abstractmethod
- def classify_import(self, import_path: str) -> str:
- """Classify import path for this language"""
-
- @abstractmethod
- def normalize_import_path(self, raw_path: str) -> str:
- """Normalize import path for consistent processing"""
-```
-
-### Phase 3: Position Resolution System
-
-```python
-class PositionResolver:
- """Unified symbol position resolution system"""
-
- def __init__(self):
- self.strategies = [
- SCIPOccurrenceStrategy(), # High confidence
- TreeSitterStrategy(), # Medium confidence
- HeuristicStrategy() # Fallback
- ]
-
- def resolve_position(self, symbol, document) -> LocationInfo:
- """Resolve symbol position using strategy pattern"""
-
-class PositionStrategy(ABC):
- """Base class for position resolution strategies"""
-
- @abstractmethod
- def try_resolve(self, symbol, document) -> Optional[LocationInfo]:
- """Attempt to resolve symbol position"""
-
- @abstractmethod
- def get_confidence_level(self) -> str:
- """Return confidence level: 'high', 'medium', 'low'"""
-```
-
-## 📋 Detailed Implementation Plan
-
-### **Phase 1: Architecture Separation (Week 1)**
-
-#### 1.1 Create Language Analyzer Interface
-```
-src/code_index_mcp/tools/scip/analyzers/
-├── base.py # Base interfaces and common utilities
-├── python_analyzer.py # Python-specific analysis logic
-├── zig_analyzer.py # Zig-specific analysis logic
-├── objc_analyzer.py # Objective-C-specific analysis logic
-├── javascript_analyzer.py # JavaScript/TypeScript analysis logic
-└── factory.py # Analyzer factory and registry
-```
-
-**Tasks:**
-- [ ] Define `LanguageAnalyzer` abstract base class
-- [ ] Extract Python-specific logic to `PythonAnalyzer`
-- [ ] Move Zig logic from current implementation to `ZigAnalyzer`
-- [ ] Migrate Objective-C logic to `ObjectiveCAnalyzer`
-- [ ] Create factory pattern for analyzer instantiation
-
-#### 1.2 Extract Language-Specific Logic
-- [ ] Move `_classify_zig_import` to `ZigAnalyzer`
-- [ ] Move Python stdlib detection to `PythonAnalyzer`
-- [ ] Move Objective-C framework detection to `ObjectiveCAnalyzer`
-- [ ] Create language-specific symbol metadata extraction
-
-### **Phase 2: Dependency Processing Refactoring (Week 2)**
-
-#### 2.1 Create Dependency Management Module
-```
-src/code_index_mcp/tools/scip/dependencies/
-├── classifier.py # Main dependency classifier
-├── configs/ # Language-specific configurations
-│ ├── __init__.py
-│ ├── python.py # Python dependency rules
-│ ├── zig.py # Zig dependency rules
-│ ├── javascript.py # JavaScript dependency rules
-│ └── base.py # Base configuration class
-├── registry.py # Dependency registry and caching
-└── normalizer.py # Import path normalization
-```
-
-**Tasks:**
-- [ ] Create unified `DependencyClassifier` class
-- [ ] Implement language-specific configuration classes
-- [ ] Standardize dependency type constants
-- [ ] Add configurable standard library lists
-- [ ] Implement caching for dependency classification results
-
-#### 2.2 Standardize Dependency Classification
-- [ ] Define consistent classification types: `standard_library`, `third_party`, `local`
-- [ ] Create configurable standard library lists per language
-- [ ] Support custom classification rules
-- [ ] Implement dependency version detection where applicable
-
-### **Phase 3: Symbol Resolution Refactoring (Week 3)**
-
-#### 3.1 Modularize Position Detection
-```
-src/code_index_mcp/tools/scip/position/
-├── resolver.py # Main position resolver
-├── strategies/ # Position detection strategies
-│ ├── __init__.py
-│ ├── scip_occurrence.py # SCIP occurrence-based detection
-│ ├── tree_sitter.py # Tree-sitter AST-based detection
-│ ├── heuristic.py # Heuristic fallback detection
-│ └── base.py # Base strategy interface
-├── calculator.py # Position calculation utilities
-└── confidence.py # Confidence level management
-```
-
-**Tasks:**
-- [ ] Implement strategy pattern for position resolution
-- [ ] Separate SCIP occurrence processing logic
-- [ ] Extract tree-sitter position calculation
-- [ ] Create heuristic fallback mechanisms
-- [ ] Add confidence level tracking
-
-#### 3.2 Improve Symbol Parsing
-- [ ] Refactor `_extract_name_from_scip_symbol` method
-- [ ] Unify Symbol ID format processing
-- [ ] Support additional SCIP symbol formats
-- [ ] Add robust error handling for malformed symbols
-
-### **Phase 4: Relationship Analysis Refactoring (Week 4)**
-
-#### 4.1 Separate Relationship Analysis Logic
-```
-src/code_index_mcp/tools/scip/relationships/
-├── analyzer.py # Main relationship analyzer
-├── types.py # Relationship type definitions
-├── builder.py # Relationship construction logic
-├── extractors/ # Relationship extraction strategies
-│ ├── __init__.py
-│ ├── call_extractor.py # Function call relationships
-│ ├── inheritance_extractor.py # Class inheritance
-│ └── reference_extractor.py # Symbol references
-└── formatter.py # Relationship output formatting
-```
-
-**Tasks:**
-- [ ] Extract relationship analysis from main analyzer
-- [ ] Implement relationship type system
-- [ ] Create relationship builders for different types
-- [ ] Add relationship validation logic
-
-#### 4.2 Optimize Relationship Detection
-- [ ] Improve function call detection accuracy
-- [ ] Support additional relationship types (inheritance, interfaces, etc.)
-- [ ] Add cross-file relationship resolution
-- [ ] Implement relationship confidence scoring
-
-### **Phase 5: Integration and Testing (Week 5)**
-
-#### 5.1 Integrate New Architecture
-- [ ] Update `SCIPSymbolAnalyzer` to use new plugin system
-- [ ] Create adapter layer for backward compatibility
-- [ ] Update configuration and initialization logic
-- [ ] Add performance monitoring
-
-#### 5.2 Comprehensive Testing
-- [ ] Unit tests for each language analyzer
-- [ ] Integration tests for dependency classification
-- [ ] Position resolution accuracy tests
-- [ ] Performance benchmark tests
-- [ ] Memory usage optimization tests
-
-## 🎯 Refactoring Goals
-
-### **Maintainability Improvements**
-- ✅ **Single Responsibility**: Each class focuses on specific functionality
-- ✅ **Open/Closed Principle**: Easy to add new language support without modifying existing code
-- ✅ **Dependency Injection**: Components are replaceable and testable
-- ✅ **Clear Separation of Concerns**: Position detection, dependency classification, and symbol analysis are separate
-
-### **Performance Optimizations**
-- ✅ **Lazy Loading**: Only load required language analyzers
-- ✅ **Caching Mechanisms**: Cache symbol resolution and dependency classification results
-- ✅ **Parallel Processing**: Support multi-file parallel analysis
-- ✅ **Memory Efficiency**: Reduce memory footprint through better data structures
-
-### **Extensibility Features**
-- ✅ **Plugin System**: Third-party language support through plugins
-- ✅ **Configuration-Driven**: Configurable analysis rules and standards
-- ✅ **Stable API**: Backward-compatible interfaces
-- ✅ **Language Agnostic Core**: Core logic independent of specific languages
-
-## 🧪 Testing Strategy
-
-### **Unit Testing Coverage**
-- [ ] Each language analyzer tested independently
-- [ ] Dependency classifier comprehensive test suite
-- [ ] Position resolver strategy tests
-- [ ] Symbol parsing edge case tests
-- [ ] Relationship extraction validation tests
-
-### **Integration Testing**
-- [ ] Cross-language analysis scenarios
-- [ ] End-to-end file analysis workflows
-- [ ] SCIP compliance validation
-- [ ] Performance regression testing
-
-### **Regression Testing**
-- [ ] Existing functionality preservation
-- [ ] Zig dependency processing validation
-- [ ] Python analysis accuracy maintenance
-- [ ] Objective-C framework detection consistency
-
-## 📈 Success Metrics
-
-### **Code Quality Improvements**
-- **Cyclomatic Complexity**: Reduce from current >50 to <10 per method
-- **Test Coverage**: Achieve >90% code coverage
-- **Maintainability Index**: Improve from current score to >80
-
-### **Performance Targets**
-- **Analysis Speed**: <500ms per file (currently ~2s)
-- **Memory Usage**: <50MB for 1000-file project (currently ~200MB)
-- **Accuracy**: >95% symbol position accuracy
-
-### **Extensibility Goals**
-- **New Language Addition**: <2 hours to add basic support
-- **Plugin Development**: Third-party plugin support
-- **Configuration Flexibility**: Runtime configuration changes
-
-## 🚀 Migration Plan
-
-### **Phase 1: Preparation (Week 1)**
-- Create new module structure
-- Implement base interfaces
-- Set up testing framework
-
-### **Phase 2: Gradual Migration (Weeks 2-4)**
-- Migrate one language at a time
-- Maintain backward compatibility
-- Add comprehensive tests for each component
-
-### **Phase 3: Integration (Week 5)**
-- Integrate all components
-- Performance optimization
-- Final testing and validation
-
-### **Phase 4: Documentation and Cleanup (Week 6)**
-- Update documentation
-- Remove deprecated code
-- Finalize API documentation
-
-## 🔧 Implementation Notes
-
-### **Backward Compatibility**
-- Maintain existing public API during transition
-- Create adapter layer for legacy code
-- Gradual deprecation of old methods
-
-### **Configuration Management**
-- Use dependency injection for configurability
-- Support runtime configuration updates
-- Provide sensible defaults for all languages
-
-### **Error Handling**
-- Implement comprehensive error handling at each layer
-- Provide detailed error messages for debugging
-- Graceful degradation when analyzers fail
-
-### **Logging and Monitoring**
-- Add structured logging throughout the system
-- Implement performance metrics collection
-- Create debugging tools for complex analysis scenarios
-
----
-
-**Status**: 📋 Planning Phase
-**Priority**: 🔥 High
-**Estimated Effort**: 6 weeks
-**Dependencies**: None
-
-This refactoring will establish a solid foundation for supporting additional programming languages and maintaining high code quality as the system grows.
\ No newline at end of file
diff --git a/benchmark_scip_framework.py b/benchmark_scip_framework.py
deleted file mode 100644
index 88d05f5..0000000
--- a/benchmark_scip_framework.py
+++ /dev/null
@@ -1,1017 +0,0 @@
-"""SCIP Framework Performance Benchmark Suite - Comprehensive performance testing and analysis."""
-
-import os
-import time
-import tempfile
-import statistics
-import gc
-import psutil
-import threading
-from pathlib import Path
-from typing import Dict, List, Any, Tuple, Optional
-from dataclasses import dataclass, asdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from src.code_index_mcp.scip.framework import (
- SCIPFrameworkAPI, SCIPConfig, create_scip_framework,
- PythonSCIPIndexFactory, JavaScriptSCIPIndexFactory, JavaSCIPIndexFactory,
- SCIPCacheManager, StreamingIndexer
-)
-
-
-@dataclass
-class BenchmarkResult:
- """Benchmark result data structure."""
- test_name: str
- file_count: int
- total_time: float
- memory_usage_mb: float
- symbols_generated: int
- occurrences_generated: int
- cache_hit_rate: float
- throughput_files_per_sec: float
- throughput_symbols_per_sec: float
- error_count: int
- additional_metrics: Dict[str, Any]
-
-
-@dataclass
-class SystemMetrics:
- """System resource metrics."""
- cpu_percent: float
- memory_percent: float
- memory_available_mb: float
- disk_io_read_mb: float
- disk_io_write_mb: float
-
-
-class PerformanceMonitor:
- """Real-time performance monitoring during benchmarks."""
-
- def __init__(self):
- self.monitoring = False
- self.metrics_history: List[SystemMetrics] = []
- self.monitor_thread: Optional[threading.Thread] = None
- self.process = psutil.Process()
-
- def start_monitoring(self, interval: float = 0.5):
- """Start performance monitoring."""
- self.monitoring = True
- self.metrics_history.clear()
- self.monitor_thread = threading.Thread(target=self._monitor_loop, args=(interval,))
- self.monitor_thread.daemon = True
- self.monitor_thread.start()
-
- def stop_monitoring(self) -> List[SystemMetrics]:
- """Stop monitoring and return collected metrics."""
- self.monitoring = False
- if self.monitor_thread:
- self.monitor_thread.join(timeout=2.0)
- return self.metrics_history.copy()
-
- def _monitor_loop(self, interval: float):
- """Monitor system metrics in a loop."""
- while self.monitoring:
- try:
- # Get current metrics
- memory_info = self.process.memory_info()
-
- metrics = SystemMetrics(
- cpu_percent=self.process.cpu_percent(),
- memory_percent=self.process.memory_percent(),
- memory_available_mb=memory_info.rss / 1024 / 1024,
- disk_io_read_mb=0.0, # Simplified for demo
- disk_io_write_mb=0.0
- )
-
- self.metrics_history.append(metrics)
- time.sleep(interval)
-
- except Exception as e:
- print(f"Monitoring error: {e}")
- break
-
-
-class SCIPFrameworkBenchmark:
- """Comprehensive benchmark suite for SCIP framework."""
-
- def __init__(self):
- self.results: List[BenchmarkResult] = []
- self.monitor = PerformanceMonitor()
-
- def run_all_benchmarks(self) -> Dict[str, Any]:
- """Run complete benchmark suite."""
- print("=== SCIP Framework Performance Benchmark Suite ===")
- print(f"System: {psutil.cpu_count()} CPUs, {psutil.virtual_memory().total // 1024**3} GB RAM")
-
- with tempfile.TemporaryDirectory() as temp_dir:
- # Create test projects of various sizes
- small_project = self.create_test_project(temp_dir, "small", 50)
- medium_project = self.create_test_project(temp_dir, "medium", 200)
- large_project = self.create_test_project(temp_dir, "large", 1000)
-
- # Run benchmarks
- benchmark_suite = [
- ("Small Project (50 files)", small_project, {'max_workers': 2, 'batch_size': 10}),
- ("Medium Project (200 files)", medium_project, {'max_workers': 4, 'batch_size': 50}),
- ("Large Project (1000 files)", large_project, {'max_workers': 8, 'batch_size': 100}),
- ]
-
- for test_name, project_path, config_overrides in benchmark_suite:
- print(f"\n🏃 Running: {test_name}")
-
- # Basic index generation benchmark
- result = self.benchmark_index_generation(test_name, project_path, config_overrides)
- self.results.append(result)
-
- # Caching performance benchmark
- cache_result = self.benchmark_caching_performance(f"{test_name} - Caching", project_path, config_overrides)
- self.results.append(cache_result)
-
- # Streaming performance benchmark
- streaming_result = self.benchmark_streaming_performance(f"{test_name} - Streaming", project_path, config_overrides)
- self.results.append(streaming_result)
-
- # Multi-language benchmark
- multi_lang_project = self.create_multi_language_project(temp_dir)
- multi_result = self.benchmark_multi_language(multi_lang_project)
- self.results.append(multi_result)
-
- # Memory stress test
- memory_result = self.benchmark_memory_usage(large_project)
- self.results.append(memory_result)
-
- # Concurrent processing benchmark
- concurrent_result = self.benchmark_concurrent_processing(medium_project)
- self.results.append(concurrent_result)
-
- # Generate comprehensive report
- return self.generate_benchmark_report()
-
- def create_test_project(self, base_dir: str, project_name: str, file_count: int) -> str:
- """Create test project with specified number of files."""
- project_dir = os.path.join(base_dir, project_name)
- os.makedirs(project_dir, exist_ok=True)
-
- # Generate Python files with varying complexity
- for i in range(file_count):
- file_path = os.path.join(project_dir, f"module_{i:04d}.py")
- content = self.generate_python_file_content(i, file_count)
-
- with open(file_path, 'w', encoding='utf-8') as f:
- f.write(content)
-
- return project_dir
-
- def create_multi_language_project(self, base_dir: str) -> str:
- """Create project with multiple programming languages."""
- project_dir = os.path.join(base_dir, "multi_language")
- os.makedirs(project_dir, exist_ok=True)
-
- # Python files
- for i in range(30):
- file_path = os.path.join(project_dir, f"python_module_{i}.py")
- with open(file_path, 'w') as f:
- f.write(self.generate_python_file_content(i, 30))
-
- # JavaScript files
- for i in range(20):
- file_path = os.path.join(project_dir, f"js_module_{i}.js")
- with open(file_path, 'w') as f:
- f.write(self.generate_javascript_file_content(i))
-
- # Java files
- for i in range(15):
- file_path = os.path.join(project_dir, f"JavaClass_{i}.java")
- with open(file_path, 'w') as f:
- f.write(self.generate_java_file_content(i))
-
- return project_dir
-
- def generate_python_file_content(self, file_index: int, total_files: int) -> str:
- """Generate Python file content with realistic complexity."""
- imports_count = min(5, file_index % 8 + 1)
- classes_count = file_index % 3 + 1
- functions_count = file_index % 5 + 2
-
- content = f'"""Module {file_index} - Generated for performance testing."""\n\n'
-
- # Add imports
- for i in range(imports_count):
- import_target = f"module_{(file_index + i) % total_files:04d}"
- content += f"from {import_target} import Class{i}, function_{i}\n"
-
- content += "\nimport os\nimport sys\nfrom typing import List, Dict, Optional\n\n"
-
- # Add classes
- for class_i in range(classes_count):
- content += f'''
-class Class{file_index}_{class_i}:
- """Test class {class_i} in module {file_index}."""
-
- def __init__(self, value: int = 0):
- self.value = value
- self.data: Dict[str, int] = {{}}
- self.items: List[str] = []
-
- def process_data(self, input_data: List[int]) -> Dict[str, int]:
- """Process input data and return results."""
- result = {{}}
- for i, item in enumerate(input_data):
- key = f"item_{{i}}"
- result[key] = item * self.value
- return result
-
- def calculate_total(self, multiplier: float = 1.0) -> float:
- """Calculate total value."""
- return sum(self.data.values()) * multiplier
-
- def add_item(self, item: str) -> None:
- """Add item to collection."""
- if item not in self.items:
- self.items.append(item)
-
- @property
- def item_count(self) -> int:
- """Get number of items."""
- return len(self.items)
-'''
-
- # Add functions
- for func_i in range(functions_count):
- content += f'''
-def function_{file_index}_{func_i}(param1: int, param2: str = "default") -> Tuple[int, str]:
- """Function {func_i} in module {file_index}."""
- processed_value = param1 * {func_i + 1}
- processed_string = f"{{param2}}_{{processed_value}}"
-
- # Some processing logic
- if processed_value > 100:
- processed_value = processed_value // 2
-
- return processed_value, processed_string
-
-def helper_function_{file_index}_{func_i}(data: List[Any]) -> Optional[Any]:
- """Helper function for function_{func_i}."""
- if not data:
- return None
-
- return data[0] if len(data) == 1 else data
-'''
-
- # Add module-level variables
- content += f'''
-# Module-level variables
-MODULE_ID = {file_index}
-MODULE_NAME = "module_{file_index:04d}"
-DEFAULT_CONFIG = {{
- "enabled": True,
- "max_items": {file_index * 10 + 100},
- "timeout": {file_index * 2 + 30}
-}}
-'''
-
- return content
-
- def generate_javascript_file_content(self, file_index: int) -> str:
- """Generate JavaScript file content."""
- return f'''
-// JavaScript module {file_index} for performance testing
-const express = require('express');
-const {{ EventEmitter }} = require('events');
-
-class Service{file_index} extends EventEmitter {{
- constructor(config = {{}}) {{
- super();
- this.config = config;
- this.data = new Map();
- this.active = false;
- }}
-
- async initialize() {{
- this.active = true;
- this.emit('initialized', {{ serviceId: {file_index} }});
- }}
-
- processData(input) {{
- const result = [];
- for (const item of input) {{
- result.push({{
- id: item.id,
- value: item.value * {file_index},
- timestamp: Date.now()
- }});
- }}
- return result;
- }}
-
- async asyncOperation(delay = 100) {{
- return new Promise(resolve => {{
- setTimeout(() => {{
- resolve({{ result: 'completed', serviceId: {file_index} }});
- }}, delay);
- }});
- }}
-}}
-
-function helper{file_index}(data) {{
- return data.map(item => ({{
- ...item,
- processed: true,
- serviceId: {file_index}
- }}));
-}}
-
-const config{file_index} = {{
- serviceId: {file_index},
- enabled: true,
- maxConnections: {file_index * 10 + 50}
-}};
-
-module.exports = {{
- Service{file_index},
- helper{file_index},
- config{file_index}
-}};
-'''
-
- def generate_java_file_content(self, file_index: int) -> str:
- """Generate Java file content."""
- return f'''
-package com.benchmark.test;
-
-import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
-import java.time.LocalDateTime;
-
-/**
- * Test class {file_index} for performance benchmarking.
- * Demonstrates various Java language features.
- */
-public class JavaClass_{file_index} {{
- private final int classId;
- private final Map data;
- private final List items;
- private boolean active;
-
- /**
- * Constructor for JavaClass_{file_index}.
- *
- * @param classId Unique identifier for this class
- */
- public JavaClass_{file_index}(int classId) {{
- this.classId = classId;
- this.data = new ConcurrentHashMap<>();
- this.items = new ArrayList<>();
- this.active = false;
- }}
-
- /**
- * Initialize the class with default values.
- */
- public void initialize() {{
- this.active = true;
- this.data.put("initialized", LocalDateTime.now());
- this.data.put("classId", this.classId);
- }}
-
- /**
- * Process a list of integers and return results.
- *
- * @param input List of integers to process
- * @return Map of processed results
- */
- public Map processNumbers(List input) {{
- Map results = new HashMap<>();
-
- for (int i = 0; i < input.size(); i++) {{
- String key = "result_" + i;
- Integer value = input.get(i) * {file_index} + i;
- results.put(key, value);
- }}
-
- return results;
- }}
-
- /**
- * Add item to the collection.
- *
- * @param item Item to add
- * @return true if item was added, false if it already exists
- */
- public boolean addItem(String item) {{
- if (item == null || item.trim().isEmpty()) {{
- return false;
- }}
-
- if (!items.contains(item)) {{
- items.add(item);
- return true;
- }}
-
- return false;
- }}
-
- /**
- * Get total count of items.
- *
- * @return Number of items in collection
- */
- public int getItemCount() {{
- return items.size();
- }}
-
- /**
- * Check if class is active.
- *
- * @return true if active, false otherwise
- */
- public boolean isActive() {{
- return active;
- }}
-
- /**
- * Set active status.
- *
- * @param active New active status
- */
- public void setActive(boolean active) {{
- this.active = active;
- if (active) {{
- data.put("lastActivated", LocalDateTime.now());
- }}
- }}
-
- @Override
- public String toString() {{
- return String.format("JavaClass_%d{{classId=%d, active=%s, items=%d}}",
- {file_index}, classId, active, items.size());
- }}
-
- @Override
- public boolean equals(Object obj) {{
- if (this == obj) return true;
- if (obj == null || getClass() != obj.getClass()) return false;
- JavaClass_{file_index} other = (JavaClass_{file_index}) obj;
- return classId == other.classId;
- }}
-
- @Override
- public int hashCode() {{
- return Objects.hash(classId);
- }}
-}}
-'''
-
- def benchmark_index_generation(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult:
- """Benchmark basic index generation performance."""
- print(f" 📊 Index generation benchmark...")
-
- # Configure framework
- config = SCIPConfig(
- project_root=project_path,
- cache_enabled=False, # Disable cache for pure generation benchmark
- validate_compliance=True,
- **config_overrides
- )
-
- framework = SCIPFrameworkAPI(config)
-
- # Count files
- file_count = len(list(Path(project_path).rglob("*.py")))
-
- # Start monitoring
- self.monitor.start_monitoring()
-
- # Run benchmark
- start_time = time.time()
- start_memory = psutil.Process().memory_info().rss / 1024 / 1024
-
- try:
- index = framework.create_complete_index()
-
- end_time = time.time()
- end_memory = psutil.Process().memory_info().rss / 1024 / 1024
-
- # Stop monitoring
- metrics_history = self.monitor.stop_monitoring()
-
- # Calculate metrics
- total_time = end_time - start_time
- memory_usage = end_memory - start_memory
-
- symbols_count = sum(len(doc.symbols) for doc in index.documents)
-            occurrences_count = sum(len(doc.occurrences) for doc in index.documents)
-
- throughput_files = file_count / total_time if total_time > 0 else 0
- throughput_symbols = symbols_count / total_time if total_time > 0 else 0
-
- # Additional metrics
- avg_cpu = statistics.mean([m.cpu_percent for m in metrics_history]) if metrics_history else 0
- peak_memory = max([m.memory_available_mb for m in metrics_history]) if metrics_history else end_memory
-
- result = BenchmarkResult(
- test_name=test_name,
- file_count=file_count,
- total_time=total_time,
- memory_usage_mb=memory_usage,
- symbols_generated=symbols_count,
- occurrences_generated=occurrences_count,
- cache_hit_rate=0.0, # No cache in this test
- throughput_files_per_sec=throughput_files,
- throughput_symbols_per_sec=throughput_symbols,
- error_count=0,
- additional_metrics={
- 'avg_cpu_percent': avg_cpu,
- 'peak_memory_mb': peak_memory,
- 'documents_generated': len(index.documents),
- 'external_symbols': len(index.external_symbols)
- }
- )
-
- print(f" ✓ {file_count} files, {symbols_count} symbols in {total_time:.2f}s")
- print(f" ✓ {throughput_files:.1f} files/sec, {throughput_symbols:.1f} symbols/sec")
-
- return result
-
- except Exception as e:
- self.monitor.stop_monitoring()
- print(f" ❌ Benchmark failed: {e}")
-
- return BenchmarkResult(
- test_name=f"{test_name} (FAILED)",
- file_count=file_count,
- total_time=0,
- memory_usage_mb=0,
- symbols_generated=0,
- occurrences_generated=0,
- cache_hit_rate=0.0,
- throughput_files_per_sec=0,
- throughput_symbols_per_sec=0,
- error_count=1,
- additional_metrics={'error': str(e)}
- )
-
- def benchmark_caching_performance(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult:
- """Benchmark caching system performance."""
- print(f" 🗂️ Caching performance benchmark...")
-
- config = SCIPConfig(
- project_root=project_path,
- cache_enabled=True,
- **config_overrides
- )
-
- framework = SCIPFrameworkAPI(config)
- file_count = len(list(Path(project_path).rglob("*.py")))
-
- # First run to populate cache
- start_time = time.time()
- index1 = framework.create_complete_index()
- first_run_time = time.time() - start_time
-
- # Second run with cache
- start_time = time.time()
- index2 = framework.create_complete_index()
- second_run_time = time.time() - start_time
-
- # Get cache statistics
- cache_stats = framework.get_cache_statistics()
- hit_rate = float(cache_stats.get('hit_rate', '0%').rstrip('%')) / 100.0
-
- symbols_count = sum(len(doc.symbols) for doc in index2.documents)
-
- result = BenchmarkResult(
- test_name=test_name,
- file_count=file_count,
- total_time=second_run_time,
- memory_usage_mb=0, # Not measured in this test
- symbols_generated=symbols_count,
- occurrences_generated=0,
- cache_hit_rate=hit_rate,
- throughput_files_per_sec=file_count / second_run_time if second_run_time > 0 else 0,
- throughput_symbols_per_sec=symbols_count / second_run_time if second_run_time > 0 else 0,
- error_count=0,
- additional_metrics={
- 'first_run_time': first_run_time,
- 'second_run_time': second_run_time,
- 'cache_speedup': first_run_time / second_run_time if second_run_time > 0 else 0,
- 'cache_entries': cache_stats.get('memory_entries', 0)
- }
- )
-
- speedup = first_run_time / second_run_time if second_run_time > 0 else 0
- print(f" ✓ Cache hit rate: {hit_rate:.1%}, speedup: {speedup:.1f}x")
-
- return result
-
- def benchmark_streaming_performance(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult:
- """Benchmark streaming indexer performance."""
- print(f" 🌊 Streaming performance benchmark...")
-
- config = SCIPConfig(
- project_root=project_path,
- cache_enabled=True,
- **config_overrides
- )
-
- framework = SCIPFrameworkAPI(config)
- python_files = list(Path(project_path).rglob("*.py"))
- file_paths = [str(f) for f in python_files]
-
- # Create streaming indexer
- python_factory = PythonSCIPIndexFactory(project_path)
- cache_manager = SCIPCacheManager()
- streaming_indexer = StreamingIndexer(
- factory=python_factory,
- cache_manager=cache_manager,
- max_workers=config_overrides.get('max_workers', 4),
- chunk_size=config_overrides.get('batch_size', 50) // 2
- )
-
- # Track progress
- progress_updates = []
- def track_progress(progress):
- progress_updates.append({
- 'percentage': progress.progress_percentage,
- 'elapsed': progress.elapsed_time
- })
-
- streaming_indexer.add_progress_callback(track_progress)
-
- # Run streaming benchmark
- start_time = time.time()
-
- documents = []
- for doc in streaming_indexer.index_files_streaming(file_paths):
- documents.append(doc)
-
- total_time = time.time() - start_time
-
- symbols_count = sum(len(doc.symbols) for doc in documents)
- occurrences_count = sum(len(doc.occurrences) for doc in documents)
-
- result = BenchmarkResult(
- test_name=test_name,
- file_count=len(file_paths),
- total_time=total_time,
- memory_usage_mb=0,
- symbols_generated=symbols_count,
- occurrences_generated=occurrences_count,
- cache_hit_rate=0.0,
- throughput_files_per_sec=len(file_paths) / total_time if total_time > 0 else 0,
- throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0,
- error_count=0,
- additional_metrics={
- 'progress_updates': len(progress_updates),
- 'avg_chunk_time': total_time / max(1, len(progress_updates)),
- 'documents_streamed': len(documents)
- }
- )
-
- print(f" ✓ Streamed {len(documents)} documents in {total_time:.2f}s")
-
- return result
-
- def benchmark_multi_language(self, project_path: str) -> BenchmarkResult:
- """Benchmark multi-language processing."""
- print(f" 🌐 Multi-language performance benchmark...")
-
- config = SCIPConfig(
- project_root=project_path,
- max_workers=6,
- supported_languages={'python', 'javascript', 'java'}
- )
-
- framework = SCIPFrameworkAPI(config)
-
- # Count files by language
- python_files = len(list(Path(project_path).rglob("*.py")))
- js_files = len(list(Path(project_path).rglob("*.js")))
- java_files = len(list(Path(project_path).rglob("*.java")))
- total_files = python_files + js_files + java_files
-
- # Run benchmark
- start_time = time.time()
- index = framework.create_complete_index()
- total_time = time.time() - start_time
-
- symbols_count = sum(len(doc.symbols) for doc in index.documents)
-
- result = BenchmarkResult(
- test_name="Multi-Language Processing",
- file_count=total_files,
- total_time=total_time,
- memory_usage_mb=0,
- symbols_generated=symbols_count,
- occurrences_generated=0,
- cache_hit_rate=0.0,
- throughput_files_per_sec=total_files / total_time if total_time > 0 else 0,
- throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0,
- error_count=0,
- additional_metrics={
- 'python_files': python_files,
- 'javascript_files': js_files,
- 'java_files': java_files,
- 'languages_processed': 3,
- 'documents_generated': len(index.documents)
- }
- )
-
- print(f" ✓ {total_files} files ({python_files} Python, {js_files} JS, {java_files} Java)")
- print(f" ✓ {symbols_count} symbols in {total_time:.2f}s")
-
- return result
-
- def benchmark_memory_usage(self, project_path: str) -> BenchmarkResult:
- """Benchmark memory usage under load."""
- print(f" 🧠 Memory usage benchmark...")
-
- # Configure for memory stress testing
- config = SCIPConfig(
- project_root=project_path,
- max_workers=1, # Single worker to control memory usage
- batch_size=10, # Small batches
- cache_enabled=True
- )
-
- framework = SCIPFrameworkAPI(config)
- file_count = len(list(Path(project_path).rglob("*.py")))
-
- # Monitor memory throughout the process
- self.monitor.start_monitoring(interval=0.1) # High frequency monitoring
-
- process = psutil.Process()
- initial_memory = process.memory_info().rss / 1024 / 1024
-
- start_time = time.time()
-
- # Process with memory monitoring
- index = framework.create_complete_index()
-
- total_time = time.time() - start_time
- final_memory = process.memory_info().rss / 1024 / 1024
-
- # Stop monitoring and analyze
- metrics_history = self.monitor.stop_monitoring()
-
- if metrics_history:
- peak_memory = max(m.memory_available_mb for m in metrics_history)
- avg_memory = statistics.mean(m.memory_available_mb for m in metrics_history)
- else:
- peak_memory = final_memory
- avg_memory = final_memory
-
- memory_growth = final_memory - initial_memory
- symbols_count = sum(len(doc.symbols) for doc in index.documents)
-
- result = BenchmarkResult(
- test_name="Memory Usage Analysis",
- file_count=file_count,
- total_time=total_time,
- memory_usage_mb=memory_growth,
- symbols_generated=symbols_count,
- occurrences_generated=0,
- cache_hit_rate=0.0,
- throughput_files_per_sec=file_count / total_time if total_time > 0 else 0,
- throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0,
- error_count=0,
- additional_metrics={
- 'initial_memory_mb': initial_memory,
- 'final_memory_mb': final_memory,
- 'peak_memory_mb': peak_memory,
- 'avg_memory_mb': avg_memory,
- 'memory_efficiency_mb_per_symbol': memory_growth / symbols_count if symbols_count > 0 else 0,
- 'monitoring_samples': len(metrics_history)
- }
- )
-
- print(f" ✓ Memory growth: {memory_growth:.1f} MB (peak: {peak_memory:.1f} MB)")
- print(f" ✓ {memory_growth/symbols_count:.3f} MB per symbol")
-
- return result
-
- def benchmark_concurrent_processing(self, project_path: str) -> BenchmarkResult:
- """Benchmark concurrent processing capabilities."""
- print(f" ⚡ Concurrent processing benchmark...")
-
- python_files = list(Path(project_path).rglob("*.py"))
- file_paths = [str(f) for f in python_files]
-
- # Test different worker counts
- worker_counts = [1, 2, 4, 8]
- results = {}
-
- for workers in worker_counts:
- config = SCIPConfig(
- project_root=project_path,
- max_workers=workers,
- batch_size=50
- )
-
- framework = SCIPFrameworkAPI(config)
-
- start_time = time.time()
- index = framework.create_complete_index()
- elapsed_time = time.time() - start_time
-
- results[workers] = {
- 'time': elapsed_time,
- 'symbols': sum(len(doc.symbols) for doc in index.documents)
- }
-
- # Find optimal worker count
- best_workers = min(results.keys(), key=lambda w: results[w]['time'])
- best_time = results[best_workers]['time']
- sequential_time = results[1]['time']
-
- speedup = sequential_time / best_time if best_time > 0 else 0
- efficiency = speedup / best_workers if best_workers > 0 else 0
-
- result = BenchmarkResult(
- test_name="Concurrent Processing Analysis",
- file_count=len(file_paths),
- total_time=best_time,
- memory_usage_mb=0,
- symbols_generated=results[best_workers]['symbols'],
- occurrences_generated=0,
- cache_hit_rate=0.0,
- throughput_files_per_sec=len(file_paths) / best_time if best_time > 0 else 0,
- throughput_symbols_per_sec=results[best_workers]['symbols'] / best_time if best_time > 0 else 0,
- error_count=0,
- additional_metrics={
- 'optimal_workers': best_workers,
- 'speedup': speedup,
- 'efficiency': efficiency,
- 'worker_results': results,
- 'parallel_efficiency_percent': efficiency * 100
- }
- )
-
- print(f" ✓ Optimal workers: {best_workers}, speedup: {speedup:.1f}x")
- print(f" ✓ Parallel efficiency: {efficiency:.1%}")
-
- return result
-
- def generate_benchmark_report(self) -> Dict[str, Any]:
- """Generate comprehensive benchmark report."""
- if not self.results:
- return {"error": "No benchmark results available"}
-
- # Calculate aggregate statistics
- total_files = sum(r.file_count for r in self.results)
- total_symbols = sum(r.symbols_generated for r in self.results)
- total_time = sum(r.total_time for r in self.results)
-
- # Performance metrics
- avg_throughput_files = statistics.mean([r.throughput_files_per_sec for r in self.results if r.throughput_files_per_sec > 0])
- avg_throughput_symbols = statistics.mean([r.throughput_symbols_per_sec for r in self.results if r.throughput_symbols_per_sec > 0])
-
- # Memory analysis
- memory_results = [r for r in self.results if r.memory_usage_mb > 0]
- avg_memory_usage = statistics.mean([r.memory_usage_mb for r in memory_results]) if memory_results else 0
-
- # Cache performance
- cache_results = [r for r in self.results if r.cache_hit_rate > 0]
- avg_cache_hit_rate = statistics.mean([r.cache_hit_rate for r in cache_results]) if cache_results else 0
-
- # System information
- system_info = {
- 'cpu_count': psutil.cpu_count(),
- 'cpu_freq_mhz': psutil.cpu_freq().current if psutil.cpu_freq() else 0,
- 'memory_total_gb': psutil.virtual_memory().total / 1024**3,
- 'memory_available_gb': psutil.virtual_memory().available / 1024**3,
- 'disk_usage_percent': psutil.disk_usage('/').percent if os.name != 'nt' else psutil.disk_usage('C:\\').percent
- }
-
- # Performance summary
- performance_summary = {
- 'total_benchmarks': len(self.results),
- 'total_files_processed': total_files,
- 'total_symbols_generated': total_symbols,
- 'total_processing_time': total_time,
- 'average_throughput_files_per_sec': avg_throughput_files,
- 'average_throughput_symbols_per_sec': avg_throughput_symbols,
- 'average_memory_usage_mb': avg_memory_usage,
- 'average_cache_hit_rate': avg_cache_hit_rate,
- 'failed_benchmarks': len([r for r in self.results if r.error_count > 0])
- }
-
- # Detailed results
- detailed_results = []
- for result in self.results:
- detailed_results.append(asdict(result))
-
- # Performance recommendations
- recommendations = self.generate_performance_recommendations()
-
- report = {
- 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
- 'system_info': system_info,
- 'performance_summary': performance_summary,
- 'detailed_results': detailed_results,
- 'recommendations': recommendations
- }
-
- # Print summary
- print("\n" + "="*60)
- print("📊 BENCHMARK RESULTS SUMMARY")
- print("="*60)
- print(f"Total benchmarks: {len(self.results)}")
- print(f"Files processed: {total_files:,}")
- print(f"Symbols generated: {total_symbols:,}")
- print(f"Total time: {total_time:.2f} seconds")
- print(f"Average throughput: {avg_throughput_files:.1f} files/sec, {avg_throughput_symbols:.1f} symbols/sec")
- print(f"Average memory usage: {avg_memory_usage:.1f} MB")
- if avg_cache_hit_rate > 0:
- print(f"Average cache hit rate: {avg_cache_hit_rate:.1%}")
- print()
-
- # Print individual results
- for result in self.results:
- status = "✓" if result.error_count == 0 else "❌"
- print(f"{status} {result.test_name}")
- print(f" {result.file_count} files → {result.symbols_generated} symbols in {result.total_time:.2f}s")
- print(f" {result.throughput_files_per_sec:.1f} files/sec, {result.throughput_symbols_per_sec:.1f} symbols/sec")
- if result.cache_hit_rate > 0:
- print(f" Cache hit rate: {result.cache_hit_rate:.1%}")
- print()
-
- return report
-
- def generate_performance_recommendations(self) -> List[str]:
- """Generate performance recommendations based on benchmark results."""
- recommendations = []
-
- # Analyze results for recommendations
- memory_results = [r for r in self.results if r.memory_usage_mb > 0]
- if memory_results:
- avg_memory = statistics.mean([r.memory_usage_mb for r in memory_results])
- if avg_memory > 500: # More than 500 MB
- recommendations.append("Consider reducing batch_size or max_workers to control memory usage")
-
- # Cache performance
- cache_results = [r for r in self.results if r.cache_hit_rate > 0]
- if cache_results:
- avg_cache_rate = statistics.mean([r.cache_hit_rate for r in cache_results])
- if avg_cache_rate < 0.7: # Less than 70% hit rate
- recommendations.append("Cache performance is suboptimal. Consider increasing cache size or optimizing file change detection")
-
- # Throughput analysis
- throughput_results = [r.throughput_files_per_sec for r in self.results if r.throughput_files_per_sec > 0]
- if throughput_results:
- avg_throughput = statistics.mean(throughput_results)
- if avg_throughput < 10: # Less than 10 files per second
- recommendations.append("Consider increasing max_workers or batch_size to improve throughput")
-
- # Concurrent processing
- concurrent_results = [r for r in self.results if 'speedup' in r.additional_metrics]
- if concurrent_results:
- for result in concurrent_results:
- efficiency = result.additional_metrics.get('efficiency', 0)
- if efficiency < 0.5: # Less than 50% efficiency
- recommendations.append("Parallel processing efficiency is low. Consider reducing worker count or optimizing workload distribution")
-
- # General recommendations
- recommendations.extend([
- "Enable caching for repeated operations to improve performance",
- "Use SSD storage for cache directory to reduce I/O latency",
- "Monitor memory usage during large project processing",
- "Consider streaming processing for very large codebases",
- "Validate SCIP compliance only when necessary for better performance"
- ])
-
- return recommendations
-
-
-def run_benchmark_suite():
- """Main function to run the complete benchmark suite."""
- benchmark = SCIPFrameworkBenchmark()
-
- try:
- report = benchmark.run_all_benchmarks()
-
- # Save report to file
- import json
- report_path = "scip_framework_benchmark_report.json"
- with open(report_path, 'w', encoding='utf-8') as f:
- json.dump(report, f, indent=2, ensure_ascii=False)
-
- print(f"📄 Detailed benchmark report saved to: {report_path}")
-
- # Print recommendations
- print("\n🎯 PERFORMANCE RECOMMENDATIONS:")
- for i, rec in enumerate(report['recommendations'], 1):
- print(f"{i}. {rec}")
-
- return report
-
- except Exception as e:
- print(f"❌ Benchmark suite failed: {e}")
- import traceback
- traceback.print_exc()
- return None
-
-
-if __name__ == "__main__":
- run_benchmark_suite()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 548c91d..428e2d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "code-index-mcp"
-version = "2.2.0"
+version = "2.4.1"
description = "Code indexing and analysis tools for LLMs using MCP"
readme = "README.md"
requires-python = ">=3.10"
diff --git a/src/code_index_mcp/__init__.py b/src/code_index_mcp/__init__.py
index 3ac3936..f47ee02 100644
--- a/src/code_index_mcp/__init__.py
+++ b/src/code_index_mcp/__init__.py
@@ -3,4 +3,5 @@
A Model Context Protocol server for code indexing, searching, and analysis.
"""
-__version__ = "2.2.0"
+__version__ = "2.4.1"
+
diff --git a/src/code_index_mcp/constants.py b/src/code_index_mcp/constants.py
index d1d4235..159e31a 100644
--- a/src/code_index_mcp/constants.py
+++ b/src/code_index_mcp/constants.py
@@ -5,7 +5,8 @@
# Directory and file names
SETTINGS_DIR = "code_indexer"
CONFIG_FILE = "config.json"
-INDEX_FILE = "index.json" # JSON index file
+INDEX_FILE = "index.json" # JSON index file (deep index)
+INDEX_FILE_SHALLOW = "index.shallow.json" # Minimal shallow index (file list)
# Supported file extensions for code analysis
# This is the authoritative list used by both old and new indexing systems
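For reference, the shallow index named by `INDEX_FILE_SHALLOW` is persisted as a bare JSON array of project-relative paths (see `build_shallow_index` later in this diff). A minimal sketch of reading it back, with hypothetical contents:

```python
import json

# index.shallow.json is written by JSONIndexManager.build_shallow_index() as a
# plain JSON array, e.g. ["run.py", "src/code_index_mcp/server.py", ...]
with open("index.shallow.json", "r", encoding="utf-8") as f:
    file_list = json.load(f)

# Paths use forward slashes and carry no leading './'.
print(f"{len(file_list)} files in shallow index")
```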
diff --git a/src/code_index_mcp/indexing/__init__.py b/src/code_index_mcp/indexing/__init__.py
index 512ad3f..e779911 100644
--- a/src/code_index_mcp/indexing/__init__.py
+++ b/src/code_index_mcp/indexing/__init__.py
@@ -13,6 +13,8 @@
# New JSON-based indexing system
from .json_index_builder import JSONIndexBuilder, IndexMetadata
from .json_index_manager import JSONIndexManager, get_index_manager
+from .shallow_index_manager import ShallowIndexManager, get_shallow_index_manager
+from .deep_index_manager import DeepIndexManager
from .models import SymbolInfo, FileInfo
__all__ = [
@@ -21,6 +23,9 @@
'JSONIndexBuilder',
'JSONIndexManager',
'get_index_manager',
+ 'ShallowIndexManager',
+ 'get_shallow_index_manager',
+ 'DeepIndexManager',
'SymbolInfo',
'FileInfo',
'IndexMetadata'
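The new re-exports make both index flavors importable from the package root. A hedged sketch of how a caller might pick one, using only methods visible in this diff (the project path is hypothetical):

```python
from code_index_mcp.indexing import DeepIndexManager, get_shallow_index_manager

# Shallow: file-list only, cheap to build, suited to filename search/browsing.
shallow = get_shallow_index_manager()
shallow.set_project_path("/path/to/project")

# Deep: symbols and file summaries, delegating to JSONIndexManager.
deep = DeepIndexManager()
deep.set_project_path("/path/to/project")
```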
diff --git a/src/code_index_mcp/indexing/deep_index_manager.py b/src/code_index_mcp/indexing/deep_index_manager.py
new file mode 100644
index 0000000..6558703
--- /dev/null
+++ b/src/code_index_mcp/indexing/deep_index_manager.py
@@ -0,0 +1,46 @@
+"""
+Deep Index Manager - Wrapper around JSONIndexManager for deep indexing.
+
+This class provides a clear semantic separation from the shallow manager.
+It delegates to the existing JSONIndexManager (symbols + files JSON index).
+"""
+
+from __future__ import annotations
+
+from typing import Optional, Dict, Any, List
+
+from .json_index_manager import JSONIndexManager
+
+
+class DeepIndexManager:
+ """Thin wrapper over JSONIndexManager to expose deep-index API."""
+
+ def __init__(self) -> None:
+ self._mgr = JSONIndexManager()
+
+ # Expose a subset of API to keep callers simple
+ def set_project_path(self, project_path: str) -> bool:
+ return self._mgr.set_project_path(project_path)
+
+ def build_index(self, force_rebuild: bool = False) -> bool:
+ return self._mgr.build_index(force_rebuild=force_rebuild)
+
+ def load_index(self) -> bool:
+ return self._mgr.load_index()
+
+ def refresh_index(self) -> bool:
+ return self._mgr.refresh_index()
+
+ def find_files(self, pattern: str = "*") -> List[str]:
+ return self._mgr.find_files(pattern)
+
+ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
+ return self._mgr.get_file_summary(file_path)
+
+ def get_index_stats(self) -> Dict[str, Any]:
+ return self._mgr.get_index_stats()
+
+ def cleanup(self) -> None:
+ self._mgr.cleanup()
+
+
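Since `DeepIndexManager` only delegates, its call sequence mirrors `JSONIndexManager`. A usage sketch under the assumption that the project path exists (path and file names are illustrative):

```python
from code_index_mcp.indexing import DeepIndexManager

mgr = DeepIndexManager()
if mgr.set_project_path("/path/to/project"):
    mgr.build_index()                         # build and persist the deep JSON index
    mgr.load_index()                          # load it into memory
    py_files = mgr.find_files("src/**/*.py")  # glob handling lives in JSONIndexManager
    summary = mgr.get_file_summary("src/code_index_mcp/server.py")
    stats = mgr.get_index_stats()
```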
diff --git a/src/code_index_mcp/indexing/json_index_builder.py b/src/code_index_mcp/indexing/json_index_builder.py
index 0f95c5b..c12d694 100644
--- a/src/code_index_mcp/indexing/json_index_builder.py
+++ b/src/code_index_mcp/indexing/json_index_builder.py
@@ -8,9 +8,10 @@
import logging
import os
import time
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass, asdict
from pathlib import Path
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Tuple
from .strategies import StrategyFactory
from .models import SymbolInfo, FileInfo
@@ -44,18 +45,18 @@ class JSONIndexBuilder:
def __init__(self, project_path: str, additional_excludes: Optional[List[str]] = None):
from ..utils import FileFilter
-
+
# Input validation
if not isinstance(project_path, str):
raise ValueError(f"Project path must be a string, got {type(project_path)}")
-
+
project_path = project_path.strip()
if not project_path:
raise ValueError("Project path cannot be empty")
-
+
if not os.path.isdir(project_path):
raise ValueError(f"Project path does not exist: {project_path}")
-
+
self.project_path = project_path
self.in_memory_index: Optional[Dict[str, Any]] = None
self.strategy_factory = StrategyFactory()
@@ -70,14 +71,53 @@ def __init__(self, project_path: str, additional_excludes: Optional[List[str]] =
fallback = len(self.strategy_factory.get_fallback_extensions())
logger.info(f"Specialized parsers: {specialized} extensions, Fallback coverage: {fallback} extensions")
- def build_index(self) -> Dict[str, Any]:
+ def _process_file(self, file_path: str, specialized_extensions: set) -> Optional[Tuple[Dict, Dict, str, bool]]:
+ """
+ Process a single file - designed for parallel execution.
+
+ Args:
+ file_path: Path to the file to process
+ specialized_extensions: Set of extensions with specialized parsers
+
+ Returns:
+ Tuple of (symbols, file_info, language, is_specialized) or None on error
+ """
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+
+ ext = Path(file_path).suffix.lower()
+ rel_path = os.path.relpath(file_path, self.project_path).replace('\\', '/')
+
+ # Get appropriate strategy
+ strategy = self.strategy_factory.get_strategy(ext)
+
+ # Track strategy usage
+ is_specialized = ext in specialized_extensions
+
+ # Parse file using strategy
+ symbols, file_info = strategy.parse_file(rel_path, content)
+
+ logger.debug(f"Parsed {rel_path}: {len(symbols)} symbols ({file_info.language})")
+
+ return (symbols, {rel_path: file_info}, file_info.language, is_specialized)
+
+ except Exception as e:
+ logger.warning(f"Error processing {file_path}: {e}")
+ return None
+
+ def build_index(self, parallel: bool = True, max_workers: Optional[int] = None) -> Dict[str, Any]:
"""
- Build the complete index using Strategy pattern.
+ Build the complete index using Strategy pattern with parallel processing.
+
+ Args:
+ parallel: Whether to use parallel processing (default: True)
+ max_workers: Maximum number of worker processes/threads (default: CPU count)
Returns:
Complete JSON index with metadata, symbols, and file information
"""
- logger.info("Building JSON index using Strategy pattern...")
+ logger.info(f"Building JSON index using Strategy pattern (parallel={parallel})...")
start_time = time.time()
all_symbols = {}
@@ -89,38 +129,66 @@ def build_index(self) -> Dict[str, Any]:
# Get specialized extensions for tracking
specialized_extensions = set(self.strategy_factory.get_specialized_extensions())
- # Traverse project files
- for file_path in self._get_supported_files():
- try:
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
- content = f.read()
-
- ext = Path(file_path).suffix.lower()
-
- # Convert to relative path first
- rel_path = os.path.relpath(file_path, self.project_path).replace('\\', '/')
-
- # Get appropriate strategy
- strategy = self.strategy_factory.get_strategy(ext)
-
- # Track strategy usage
- if ext in specialized_extensions:
- specialized_count += 1
- else:
- fallback_count += 1
-
- # Parse file using strategy with relative path
- symbols, file_info = strategy.parse_file(rel_path, content)
-
- # Add to index
- all_symbols.update(symbols)
- all_files[rel_path] = file_info
- languages.add(file_info.language)
-
- logger.debug(f"Parsed {rel_path}: {len(symbols)} symbols ({file_info.language})")
-
- except Exception as e:
- logger.warning(f"Error processing {file_path}: {e}")
+ # Get list of files to process
+ files_to_process = self._get_supported_files()
+ total_files = len(files_to_process)
+
+ if total_files == 0:
+ logger.warning("No files to process")
+ return self._create_empty_index()
+
+ logger.info(f"Processing {total_files} files...")
+
+ if parallel and total_files > 1:
+ # Use ThreadPoolExecutor for I/O-bound file reading
+ # ProcessPoolExecutor has issues with strategy sharing
+ if max_workers is None:
+ max_workers = min(os.cpu_count() or 4, total_files)
+
+ logger.info(f"Using parallel processing with {max_workers} workers")
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ # Submit all tasks
+ future_to_file = {
+ executor.submit(self._process_file, file_path, specialized_extensions): file_path
+ for file_path in files_to_process
+ }
+
+ # Process completed tasks
+ processed = 0
+ for future in as_completed(future_to_file):
+ file_path = future_to_file[future]
+ result = future.result()
+
+ if result:
+ symbols, file_info_dict, language, is_specialized = result
+ all_symbols.update(symbols)
+ all_files.update(file_info_dict)
+ languages.add(language)
+
+ if is_specialized:
+ specialized_count += 1
+ else:
+ fallback_count += 1
+
+ processed += 1
+ if processed % 100 == 0:
+ logger.debug(f"Processed {processed}/{total_files} files")
+ else:
+ # Sequential processing
+ logger.info("Using sequential processing")
+ for file_path in files_to_process:
+ result = self._process_file(file_path, specialized_extensions)
+ if result:
+ symbols, file_info_dict, language, is_specialized = result
+ all_symbols.update(symbols)
+ all_files.update(file_info_dict)
+ languages.add(language)
+
+ if is_specialized:
+ specialized_count += 1
+ else:
+ fallback_count += 1
# Build index metadata
metadata = IndexMetadata(
@@ -151,6 +219,25 @@ def build_index(self) -> Dict[str, Any]:
return index
+ def _create_empty_index(self) -> Dict[str, Any]:
+ """Create an empty index structure."""
+ metadata = IndexMetadata(
+ project_path=self.project_path,
+ indexed_files=0,
+ index_version="2.0.0-strategy",
+ timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+ languages=[],
+ total_symbols=0,
+ specialized_parsers=0,
+ fallback_files=0
+ )
+
+ return {
+ "metadata": asdict(metadata),
+ "symbols": {},
+ "files": {}
+ }
+
def get_index(self) -> Optional[Dict[str, Any]]:
"""Get the current in-memory index."""
return self.in_memory_index
@@ -187,6 +274,31 @@ def _get_supported_files(self) -> List[str]:
logger.debug(f"Found {len(supported_files)} supported files")
return supported_files
+ def build_shallow_file_list(self) -> List[str]:
+ """
+ Build a minimal shallow index consisting of relative file paths only.
+
+ This method does not read file contents. It enumerates supported files
+ using centralized filtering and returns normalized relative paths with
+ forward slashes for cross-platform consistency.
+
+ Returns:
+ List of relative file paths (using '/').
+ """
+ try:
+ absolute_files = self._get_supported_files()
+ result: List[str] = []
+ for abs_path in absolute_files:
+ rel_path = os.path.relpath(abs_path, self.project_path).replace('\\', '/')
+ # Normalize leading './'
+ if rel_path.startswith('./'):
+ rel_path = rel_path[2:]
+ result.append(rel_path)
+ return result
+ except Exception as e:
+ logger.error(f"Failed to build shallow file list: {e}")
+ return []
+
def save_index(self, index: Dict[str, Any], index_path: str) -> bool:
"""
Save index to disk.
@@ -284,16 +396,16 @@ def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]:
# Work directly with global symbols for this file
global_symbols = self.in_memory_index.get("symbols", {})
result = []
-
+
# Find all symbols for this file directly from global symbols
for symbol_id, symbol_data in global_symbols.items():
symbol_file = symbol_data.get("file", "").replace("\\", "/")
-
+
# Check if this symbol belongs to our file
if symbol_file == file_path:
symbol_type = symbol_data.get("type", "unknown")
symbol_name = symbol_id.split("::")[-1] # Extract symbol name from ID
-
+
# Create symbol info
symbol_info = {
"name": symbol_name,
@@ -301,7 +413,7 @@ def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]:
"line": symbol_data.get("line"),
"signature": symbol_data.get("signature")
}
-
+
# Categorize by type
if symbol_type in ["function", "method"]:
result.append(symbol_info)
@@ -310,7 +422,7 @@ def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]:
# Sort by line number for consistent ordering
result.sort(key=lambda x: x.get("line", 0))
-
+
return result
except Exception as e:
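The rewritten `build_index` fans file parsing out over a `ThreadPoolExecutor`, and the in-diff comment explains why `ProcessPoolExecutor` is avoided (strategy objects do not share cleanly across processes). A stripped-down sketch of the same submit/`as_completed` pattern, independent of the project classes and with hypothetical inputs:

```python
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def parse_one(path: str) -> tuple[str, int]:
    # Stand-in for JSONIndexBuilder._process_file: read a file, return a tiny result.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return path, len(f.read().splitlines())

files = ["run.py", "src/code_index_mcp/constants.py"]  # hypothetical file list
max_workers = min(os.cpu_count() or 4, len(files))     # same sizing rule as build_index()

results = {}
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_file = {executor.submit(parse_one, p): p for p in files}
    for future in as_completed(future_to_file):
        path, line_count = future.result()
        results[path] = line_count
```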
diff --git a/src/code_index_mcp/indexing/json_index_manager.py b/src/code_index_mcp/indexing/json_index_manager.py
index d4564f3..ec320e4 100644
--- a/src/code_index_mcp/indexing/json_index_manager.py
+++ b/src/code_index_mcp/indexing/json_index_manager.py
@@ -9,28 +9,32 @@
import json
import logging
import os
+import re
import tempfile
import threading
+import fnmatch
from pathlib import Path
from typing import Dict, List, Optional, Any
from .json_index_builder import JSONIndexBuilder
-from ..constants import SETTINGS_DIR, INDEX_FILE
+from ..constants import SETTINGS_DIR, INDEX_FILE, INDEX_FILE_SHALLOW
logger = logging.getLogger(__name__)
class JSONIndexManager:
"""Manages JSON-based code index lifecycle and storage."""
-
+
def __init__(self):
self.project_path: Optional[str] = None
self.index_builder: Optional[JSONIndexBuilder] = None
self.temp_dir: Optional[str] = None
self.index_path: Optional[str] = None
+ self.shallow_index_path: Optional[str] = None
+ self._shallow_file_list: Optional[List[str]] = None
self._lock = threading.RLock()
logger.info("Initialized JSON Index Manager")
-
+
def set_project_path(self, project_path: str) -> bool:
"""Set the project path and initialize index storage."""
with self._lock:
@@ -39,67 +43,68 @@ def set_project_path(self, project_path: str) -> bool:
if not project_path or not isinstance(project_path, str):
logger.error(f"Invalid project path: {project_path}")
return False
-
+
project_path = project_path.strip()
if not project_path:
logger.error("Project path cannot be empty")
return False
-
+
if not os.path.isdir(project_path):
logger.error(f"Project path does not exist: {project_path}")
return False
-
+
self.project_path = project_path
self.index_builder = JSONIndexBuilder(project_path)
-
+
# Create temp directory for index storage
project_hash = hashlib.md5(project_path.encode()).hexdigest()[:12]
self.temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash)
os.makedirs(self.temp_dir, exist_ok=True)
-
+
self.index_path = os.path.join(self.temp_dir, INDEX_FILE)
-
+ self.shallow_index_path = os.path.join(self.temp_dir, INDEX_FILE_SHALLOW)
+
logger.info(f"Set project path: {project_path}")
logger.info(f"Index storage: {self.index_path}")
return True
-
+
except Exception as e:
logger.error(f"Failed to set project path: {e}")
return False
-
+
def build_index(self, force_rebuild: bool = False) -> bool:
"""Build or rebuild the index."""
with self._lock:
if not self.index_builder or not self.project_path:
logger.error("Index builder not initialized")
return False
-
+
try:
# Check if we need to rebuild
if not force_rebuild and self._is_index_fresh():
logger.info("Index is fresh, skipping rebuild")
return True
-
+
logger.info("Building JSON index...")
index = self.index_builder.build_index()
-
+
# Save to disk
self.index_builder.save_index(index, self.index_path)
-
+
logger.info(f"Successfully built index with {len(index['symbols'])} symbols")
return True
-
+
except Exception as e:
logger.error(f"Failed to build index: {e}")
return False
-
+
def load_index(self) -> bool:
"""Load existing index from disk."""
with self._lock:
if not self.index_builder or not self.index_path:
logger.error("Index manager not initialized")
return False
-
+
try:
index = self.index_builder.load_index(self.index_path)
if index:
@@ -108,11 +113,57 @@ def load_index(self) -> bool:
else:
logger.warning("No existing index found")
return False
-
+
except Exception as e:
logger.error(f"Failed to load index: {e}")
return False
-
+
+ def build_shallow_index(self) -> bool:
+ """Build and save the minimal shallow index (file list)."""
+ with self._lock:
+ if not self.index_builder or not self.project_path or not self.shallow_index_path:
+ logger.error("Index builder not initialized for shallow index")
+ return False
+
+ try:
+ file_list = self.index_builder.build_shallow_file_list()
+ # Persist as a JSON array for minimal overhead
+ with open(self.shallow_index_path, 'w', encoding='utf-8') as f:
+ json.dump(file_list, f, ensure_ascii=False)
+ self._shallow_file_list = file_list
+ logger.info(f"Saved shallow index with {len(file_list)} files to {self.shallow_index_path}")
+ return True
+ except Exception as e:
+ logger.error(f"Failed to build shallow index: {e}")
+ return False
+
+ def load_shallow_index(self) -> bool:
+ """Load shallow index (file list) from disk into memory."""
+ with self._lock:
+ try:
+ if not self.shallow_index_path or not os.path.exists(self.shallow_index_path):
+ logger.warning("No existing shallow index found")
+ return False
+ with open(self.shallow_index_path, 'r', encoding='utf-8') as f:
+ data = json.load(f)
+ if not isinstance(data, list):
+ logger.error("Shallow index format invalid (expected list)")
+ return False
+ # Normalize paths
+ normalized = []
+ for p in data:
+ if isinstance(p, str):
+ q = p.replace('\\\\', '/').replace('\\', '/')
+ if q.startswith('./'):
+ q = q[2:]
+ normalized.append(q)
+ self._shallow_file_list = normalized
+ logger.info(f"Loaded shallow index with {len(normalized)} files")
+ return True
+ except Exception as e:
+ logger.error(f"Failed to load shallow index: {e}")
+ return False
+
def refresh_index(self) -> bool:
"""Refresh the index (rebuild and reload)."""
with self._lock:
@@ -120,49 +171,64 @@ def refresh_index(self) -> bool:
if self.build_index(force_rebuild=True):
return self.load_index()
return False
-
+
def find_files(self, pattern: str = "*") -> List[str]:
- """Find files matching a pattern."""
+ """
+ Find files matching a glob pattern using the SHALLOW file list only.
+
+ Notes:
+ - '*' does not cross '/'
+ - '**' matches across directories
+ - Always sources from the shallow index for consistency and speed
+ """
with self._lock:
# Input validation
if not isinstance(pattern, str):
logger.error(f"Pattern must be a string, got {type(pattern)}")
return []
-
+
pattern = pattern.strip()
if not pattern:
pattern = "*"
-
- if not self.index_builder or not self.index_builder.in_memory_index:
- logger.warning("Index not loaded")
- return []
-
+
+ # Normalize to forward slashes
+ norm_pattern = pattern.replace('\\\\', '/').replace('\\', '/')
+
+ # Build glob regex: '*' does not cross '/', '**' crosses directories
+ regex = self._compile_glob_regex(norm_pattern)
+
+ # Always use shallow index for file discovery
try:
- files = list(self.index_builder.in_memory_index["files"].keys())
-
- if pattern == "*":
+ if self._shallow_file_list is None:
+                    # Try to load the existing shallow index; if missing, build it, then load
+ if not self.load_shallow_index():
+ # If still not available, attempt to build
+ if self.build_shallow_index():
+ self.load_shallow_index()
+
+ files = list(self._shallow_file_list or [])
+
+ if norm_pattern == "*":
return files
-
- # Simple pattern matching
- import fnmatch
- return [f for f in files if fnmatch.fnmatch(f, pattern)]
-
+
+ return [f for f in files if regex.match(f) is not None]
+
except Exception as e:
logger.error(f"Error finding files: {e}")
return []
-
+
def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
Get summary information for a file.
-
+
This method attempts to retrieve comprehensive file information including
symbol counts, functions, classes, methods, and imports. If the index
is not loaded, it will attempt auto-initialization to restore from the
most recent index state.
-
+
Args:
file_path: Relative path to the file
-
+
Returns:
Dictionary containing file summary information, or None if not found
"""
@@ -171,38 +237,38 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
if not isinstance(file_path, str):
logger.error(f"File path must be a string, got {type(file_path)}")
return None
-
+
file_path = file_path.strip()
if not file_path:
logger.error("File path cannot be empty")
return None
-
+
# Try to load cached index if not ready
if not self.index_builder or not self.index_builder.in_memory_index:
if not self._try_load_cached_index():
logger.warning("Index not loaded and no cached index available")
return None
-
+
try:
# Normalize file path
file_path = file_path.replace('\\', '/')
if file_path.startswith('./'):
file_path = file_path[2:]
-
+
# Get file info
file_info = self.index_builder.in_memory_index["files"].get(file_path)
if not file_info:
logger.warning(f"File not found in index: {file_path}")
return None
-
+
# Get symbols in file
symbols = self.index_builder.get_file_symbols(file_path)
-
+
# Categorize symbols by signature
functions = []
classes = []
methods = []
-
+
for s in symbols:
signature = s.get("signature", "")
if signature:
@@ -210,7 +276,7 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
# Method: contains class context
methods.append(s)
elif signature.startswith("def "):
- # Function: starts with def but no class context
+ # Function: starts with def but no class context
functions.append(s)
elif signature.startswith("class ") or signature is None:
# Class: starts with class or has no signature
@@ -227,7 +293,7 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
else:
# Default to function
functions.append(s)
-
+
return {
"file_path": file_path,
"language": file_info["language"],
@@ -239,63 +305,26 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]:
"imports": file_info.get("imports", []),
"exports": file_info.get("exports", [])
}
-
+
except Exception as e:
logger.error(f"Error getting file summary: {e}")
return None
-
- def search_symbols(self, query: str, symbol_type: Optional[str] = None) -> List[Dict[str, Any]]:
- """Search for symbols by name."""
- with self._lock:
- if not self.index_builder or not self.index_builder.in_memory_index:
- logger.warning("Index not loaded")
- return []
-
- try:
- results = []
- query_lower = query.lower()
-
- for symbol_id, symbol_data in self.index_builder.in_memory_index["symbols"].items():
- # Filter by type if specified
- if symbol_type and symbol_data.get("type") != symbol_type:
- continue
-
- # Check if query matches symbol name
- if query_lower in symbol_id.lower():
- results.append({
- "id": symbol_id,
- **symbol_data
- })
-
- return results[:50] # Limit results
-
- except Exception as e:
- logger.error(f"Error searching symbols: {e}")
- return []
-
- def get_symbol_callers(self, symbol_name: str) -> List[str]:
- """Get all symbols that call the given symbol."""
- with self._lock:
- if not self.index_builder:
- return []
-
- return self.index_builder.get_callers(symbol_name)
-
+
def get_index_stats(self) -> Dict[str, Any]:
"""Get statistics about the current index."""
with self._lock:
if not self.index_builder or not self.index_builder.in_memory_index:
return {"status": "not_loaded"}
-
+
try:
index = self.index_builder.in_memory_index
metadata = index["metadata"]
-
+
symbol_counts = {}
for symbol_data in index["symbols"].values():
symbol_type = symbol_data.get("type", "unknown")
symbol_counts[symbol_type] = symbol_counts.get(symbol_type, 0) + 1
-
+
return {
"status": "loaded",
"project_path": metadata["project_path"],
@@ -306,51 +335,51 @@ def get_index_stats(self) -> Dict[str, Any]:
"index_version": metadata["index_version"],
"timestamp": metadata["timestamp"]
}
-
+
except Exception as e:
logger.error(f"Error getting index stats: {e}")
return {"status": "error", "error": str(e)}
-
+
def _is_index_fresh(self) -> bool:
"""Check if the current index is fresh."""
if not self.index_path or not os.path.exists(self.index_path):
return False
-
+
try:
- from ..utils import FileFilter
- file_filter = FileFilter()
-
+ from code_index_mcp.utils.file_filter import FileFilter as _FileFilter # pylint: disable=C0415
+ file_filter = _FileFilter()
+
# Simple freshness check - index exists and is recent
index_mtime = os.path.getmtime(self.index_path)
base_path = Path(self.project_path)
-
+
# Check if any source files are newer than index
for root, dirs, files in os.walk(self.project_path):
# Filter directories using centralized logic
dirs[:] = [d for d in dirs if not file_filter.should_exclude_directory(d)]
-
+
for file in files:
file_path = Path(root) / file
if file_filter.should_process_path(file_path, base_path):
if os.path.getmtime(str(file_path)) > index_mtime:
return False
-
+
return True
-
+
except Exception as e:
logger.warning(f"Error checking index freshness: {e}")
return False
-
+
def _try_load_cached_index(self, expected_project_path: Optional[str] = None) -> bool:
"""
Try to load a cached index file if available.
-
+
This is a simplified version of auto-initialization that only loads
a cached index if we can verify it matches the expected project.
-
+
Args:
expected_project_path: Optional path to verify against cached index
-
+
Returns:
True if cached index was loaded successfully, False otherwise.
"""
@@ -358,28 +387,28 @@ def _try_load_cached_index(self, expected_project_path: Optional[str] = None) ->
# First try to load from current index_path if set
if self.index_path and os.path.exists(self.index_path):
return self.load_index()
-
+
# If expected project path provided, try to find its cache
if expected_project_path:
project_hash = hashlib.md5(expected_project_path.encode()).hexdigest()[:12]
temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash)
index_path = os.path.join(temp_dir, INDEX_FILE)
-
+
if os.path.exists(index_path):
# Verify the cached index matches the expected project
with open(index_path, 'r', encoding='utf-8') as f:
index_data = json.load(f)
cached_project = index_data.get('metadata', {}).get('project_path')
-
+
if cached_project == expected_project_path:
self.temp_dir = temp_dir
self.index_path = index_path
return self.load_index()
else:
logger.warning(f"Cached index project mismatch: {cached_project} != {expected_project_path}")
-
+
return False
-
+
except Exception as e:
logger.debug(f"Failed to load cached index: {e}")
return False
@@ -393,6 +422,39 @@ def cleanup(self):
self.index_path = None
logger.info("Cleaned up JSON Index Manager")
+ @staticmethod
+ def _compile_glob_regex(pattern: str) -> re.Pattern:
+ """
+ Compile a glob pattern where '*' does not match '/', and '**' matches across directories.
+
+ Examples:
+ src/*.py -> direct children .py under src
+ **/*.py -> .py at any depth
+            **/*.py   -> .py nested at least one directory deep
+ # Translate glob to regex
+ i = 0
+ out = []
+ special = ".^$+{}[]|()"
+ while i < len(pattern):
+ c = pattern[i]
+ if c == '*':
+ if i + 1 < len(pattern) and pattern[i + 1] == '*':
+ # '**' -> match across directories
+ out.append('.*')
+ i += 2
+ continue
+ else:
+ out.append('[^/]*')
+ elif c == '?':
+ out.append('[^/]')
+ elif c in special:
+ out.append('\\' + c)
+ else:
+ out.append(c)
+ i += 1
+ regex_str = '^' + ''.join(out) + '$'
+ return re.compile(regex_str)
+
# Global instance
_index_manager = JSONIndexManager()
@@ -400,4 +462,4 @@ def cleanup(self):
def get_index_manager() -> JSONIndexManager:
"""Get the global index manager instance."""
- return _index_manager
\ No newline at end of file
+ return _index_manager
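
For reference, a minimal standalone sketch of the same glob-to-regex translation used by `_compile_glob_regex`; the sample paths are illustrative. Note that `**/*.py` compiles to a regex that still requires at least one `/`, so root-level files need a separate `*.py` pattern.

```python
import re

def glob_to_regex(pattern: str) -> re.Pattern:
    """Translate a glob: '*' stays inside one path segment, '**' spans segments."""
    out, i, special = [], 0, ".^$+{}[]|()"
    while i < len(pattern):
        c = pattern[i]
        if c == '*':
            if pattern[i + 1:i + 2] == '*':
                out.append('.*')        # '**' may cross '/' boundaries
                i += 2
                continue
            out.append('[^/]*')         # single '*' stops at '/'
        elif c == '?':
            out.append('[^/]')
        elif c in special:
            out.append('\\' + c)
        else:
            out.append(c)
        i += 1
    return re.compile('^' + ''.join(out) + '$')

assert glob_to_regex('src/*.py').match('src/app.py')
assert not glob_to_regex('src/*.py').match('src/pkg/app.py')   # '*' does not cross '/'
assert glob_to_regex('**/*.py').match('src/pkg/app.py')
assert not glob_to_regex('**/*.py').match('app.py')            # needs at least one '/'
```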
diff --git a/src/code_index_mcp/indexing/shallow_index_manager.py b/src/code_index_mcp/indexing/shallow_index_manager.py
new file mode 100644
index 0000000..530c593
--- /dev/null
+++ b/src/code_index_mcp/indexing/shallow_index_manager.py
@@ -0,0 +1,155 @@
+"""
+Shallow Index Manager - Manages a minimal file-list-only index.
+
+This manager builds and loads a shallow index consisting of relative file
+paths only. It is optimized for fast initialization and filename-based
+search/browsing. Content parsing and symbol extraction are not performed.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+import tempfile
+import threading
+from typing import List, Optional
+import re
+
+from .json_index_builder import JSONIndexBuilder
+from ..constants import SETTINGS_DIR, INDEX_FILE_SHALLOW
+
+logger = logging.getLogger(__name__)
+
+
+class ShallowIndexManager:
+ """Manage shallow (file-list) index lifecycle and storage."""
+
+ def __init__(self) -> None:
+ self.project_path: Optional[str] = None
+ self.index_builder: Optional[JSONIndexBuilder] = None
+ self.temp_dir: Optional[str] = None
+ self.index_path: Optional[str] = None
+ self._file_list: Optional[List[str]] = None
+ self._lock = threading.RLock()
+
+ def set_project_path(self, project_path: str) -> bool:
+ with self._lock:
+ try:
+ if not isinstance(project_path, str) or not project_path.strip():
+ logger.error("Invalid project path for shallow index")
+ return False
+ project_path = project_path.strip()
+ if not os.path.isdir(project_path):
+ logger.error(f"Project path does not exist: {project_path}")
+ return False
+
+ self.project_path = project_path
+ self.index_builder = JSONIndexBuilder(project_path)
+
+ project_hash = hashlib.md5(project_path.encode()).hexdigest()[:12]
+ self.temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash)
+ os.makedirs(self.temp_dir, exist_ok=True)
+ self.index_path = os.path.join(self.temp_dir, INDEX_FILE_SHALLOW)
+ return True
+ except Exception as e: # noqa: BLE001 - centralized logging
+ logger.error(f"Failed to set project path (shallow): {e}")
+ return False
+
+ def build_index(self) -> bool:
+ """Build and persist the shallow file list index."""
+ with self._lock:
+ if not self.index_builder or not self.index_path:
+ logger.error("ShallowIndexManager not initialized")
+ return False
+ try:
+ file_list = self.index_builder.build_shallow_file_list()
+ with open(self.index_path, 'w', encoding='utf-8') as f:
+ json.dump(file_list, f, ensure_ascii=False)
+ self._file_list = file_list
+ logger.info(f"Built shallow index with {len(file_list)} files")
+ return True
+ except Exception as e: # noqa: BLE001
+ logger.error(f"Failed to build shallow index: {e}")
+ return False
+
+ def load_index(self) -> bool:
+ """Load shallow index from disk to memory."""
+ with self._lock:
+ try:
+ if not self.index_path or not os.path.exists(self.index_path):
+ return False
+ with open(self.index_path, 'r', encoding='utf-8') as f:
+ data = json.load(f)
+ if isinstance(data, list):
+ # Normalize slashes/prefix
+ normalized: List[str] = []
+ for p in data:
+ if isinstance(p, str):
+ q = p.replace('\\\\', '/').replace('\\', '/')
+ if q.startswith('./'):
+ q = q[2:]
+ normalized.append(q)
+ self._file_list = normalized
+ return True
+ return False
+ except Exception as e: # noqa: BLE001
+ logger.error(f"Failed to load shallow index: {e}")
+ return False
+
+ def get_file_list(self) -> List[str]:
+ with self._lock:
+ return list(self._file_list or [])
+
+ def find_files(self, pattern: str = "*") -> List[str]:
+ with self._lock:
+ if not isinstance(pattern, str):
+ return []
+            norm = (pattern.strip() or "*").replace('\\\\', '/').replace('\\', '/')
+ regex = self._compile_glob_regex(norm)
+ files = self._file_list or []
+ if norm == "*":
+ return list(files)
+ return [f for f in files if regex.match(f) is not None]
+
+ @staticmethod
+ def _compile_glob_regex(pattern: str) -> re.Pattern:
+ i = 0
+ out = []
+ special = ".^$+{}[]|()"
+ while i < len(pattern):
+ c = pattern[i]
+ if c == '*':
+ if i + 1 < len(pattern) and pattern[i + 1] == '*':
+ out.append('.*')
+ i += 2
+ continue
+ else:
+ out.append('[^/]*')
+ elif c == '?':
+ out.append('[^/]')
+ elif c in special:
+ out.append('\\' + c)
+ else:
+ out.append(c)
+ i += 1
+ return re.compile('^' + ''.join(out) + '$')
+
+ def cleanup(self) -> None:
+ with self._lock:
+ self.project_path = None
+ self.index_builder = None
+ self.temp_dir = None
+ self.index_path = None
+ self._file_list = None
+
+
+# Global singleton
+_shallow_manager = ShallowIndexManager()
+
+
+def get_shallow_index_manager() -> ShallowIndexManager:
+ return _shallow_manager
+
+
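
A usage sketch for the new manager, assuming the package is importable (for example after `uv sync`); the project path below is a placeholder.

```python
from code_index_mcp.indexing.shallow_index_manager import get_shallow_index_manager

manager = get_shallow_index_manager()
if manager.set_project_path("/path/to/project"):      # placeholder path
    if not manager.load_index():                      # reuse a cached file list if present
        manager.build_index()                         # otherwise build and persist one
    print(manager.find_files("**/*.py")[:10])         # filename-level search only
```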
diff --git a/src/code_index_mcp/indexing/strategies/java_strategy.py b/src/code_index_mcp/indexing/strategies/java_strategy.py
index b1c9845..af2ff8e 100644
--- a/src/code_index_mcp/indexing/strategies/java_strategy.py
+++ b/src/code_index_mcp/indexing/strategies/java_strategy.py
@@ -1,10 +1,9 @@
"""
-Java parsing strategy using tree-sitter.
+Java parsing strategy using tree-sitter - Optimized single-pass version.
"""
import logging
-import re
-from typing import Dict, List, Tuple, Optional
+from typing import Dict, List, Tuple, Optional, Set
from .base_strategy import ParsingStrategy
from ..models import SymbolInfo, FileInfo
@@ -15,7 +14,7 @@
class JavaParsingStrategy(ParsingStrategy):
- """Java-specific parsing strategy."""
+ """Java-specific parsing strategy - Single Pass Optimized."""
def __init__(self):
self.java_language = tree_sitter.Language(language())
@@ -27,31 +26,40 @@ def get_supported_extensions(self) -> List[str]:
return ['.java']
def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]:
- """Parse Java file using tree-sitter."""
- return self._tree_sitter_parse(file_path, content)
-
- def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]:
- """Parse using tree-sitter."""
+ """Parse Java file using tree-sitter with single-pass optimization."""
symbols = {}
functions = []
classes = []
imports = []
package = None
+
+ # Symbol lookup index for O(1) access
+ symbol_lookup = {} # name -> symbol_id mapping
parser = tree_sitter.Parser(self.java_language)
try:
tree = parser.parse(content.encode('utf8'))
- # Phase 1: Extract symbol definitions
- self._traverse_java_node(tree.root_node, content, file_path, symbols, functions, classes, imports)
- # Phase 2: Analyze method calls and build relationships
- self._analyze_java_calls(tree, content, symbols, file_path)
-
- # Extract package info
+
+ # Extract package info first
for node in tree.root_node.children:
if node.type == 'package_declaration':
package = self._extract_java_package(node, content)
break
+
+ # Single-pass traversal that handles everything
+ context = TraversalContext(
+ content=content,
+ file_path=file_path,
+ symbols=symbols,
+ functions=functions,
+ classes=classes,
+ imports=imports,
+ symbol_lookup=symbol_lookup
+ )
+
+ self._traverse_node_single_pass(tree.root_node, context)
+
except Exception as e:
logger.warning(f"Error parsing Java file {file_path}: {e}")
@@ -65,36 +73,90 @@ def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, Sy
return symbols, file_info
-
- def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo],
- functions: List[str], classes: List[str], imports: List[str]):
- """Traverse Java AST node."""
+ def _traverse_node_single_pass(self, node, context: 'TraversalContext',
+ current_class: Optional[str] = None,
+ current_method: Optional[str] = None):
+ """Single-pass traversal that extracts symbols and analyzes calls."""
+
+ # Handle class declarations
if node.type == 'class_declaration':
- name = self._get_java_class_name(node, content)
+ name = self._get_java_class_name(node, context.content)
if name:
- symbol_id = self._create_symbol_id(file_path, name)
- symbols[symbol_id] = SymbolInfo(
+ symbol_id = self._create_symbol_id(context.file_path, name)
+ symbol_info = SymbolInfo(
type="class",
- file=file_path,
+ file=context.file_path,
line=node.start_point[0] + 1
)
- classes.append(name)
-
+ context.symbols[symbol_id] = symbol_info
+ context.symbol_lookup[name] = symbol_id
+ context.classes.append(name)
+
+ # Traverse class body with updated context
+ for child in node.children:
+ self._traverse_node_single_pass(child, context, current_class=name, current_method=current_method)
+ return
+
+ # Handle method declarations
elif node.type == 'method_declaration':
- name = self._get_java_method_name(node, content)
+ name = self._get_java_method_name(node, context.content)
if name:
- symbol_id = self._create_symbol_id(file_path, name)
- symbols[symbol_id] = SymbolInfo(
+ # Build full method name with class context
+ if current_class:
+ full_name = f"{current_class}.{name}"
+ else:
+ full_name = name
+
+ symbol_id = self._create_symbol_id(context.file_path, full_name)
+ symbol_info = SymbolInfo(
type="method",
- file=file_path,
+ file=context.file_path,
line=node.start_point[0] + 1,
- signature=self._get_java_method_signature(node, content)
+ signature=self._get_java_method_signature(node, context.content)
)
- functions.append(name)
-
- # Continue traversing children
+ context.symbols[symbol_id] = symbol_info
+ context.symbol_lookup[full_name] = symbol_id
+ context.symbol_lookup[name] = symbol_id # Also index by method name alone
+ context.functions.append(full_name)
+
+ # Traverse method body with updated context
+ for child in node.children:
+ self._traverse_node_single_pass(child, context, current_class=current_class,
+ current_method=symbol_id)
+ return
+
+ # Handle method invocations (calls)
+ elif node.type == 'method_invocation':
+ if current_method:
+ called_method = self._get_called_method_name(node, context.content)
+ if called_method:
+ # Use O(1) lookup instead of O(n) iteration
+ if called_method in context.symbol_lookup:
+ symbol_id = context.symbol_lookup[called_method]
+ symbol_info = context.symbols[symbol_id]
+ if current_method not in symbol_info.called_by:
+ symbol_info.called_by.append(current_method)
+ else:
+ # Try to find method with class prefix
+ for name, sid in context.symbol_lookup.items():
+ if name.endswith(f".{called_method}"):
+ symbol_info = context.symbols[sid]
+ if current_method not in symbol_info.called_by:
+ symbol_info.called_by.append(current_method)
+ break
+
+ # Handle import declarations
+ elif node.type == 'import_declaration':
+ import_text = context.content[node.start_byte:node.end_byte]
+ # Extract the import path (remove 'import' keyword and semicolon)
+ import_path = import_text.replace('import', '').replace(';', '').strip()
+ if import_path:
+ context.imports.append(import_path)
+
+ # Continue traversing children for other node types
for child in node.children:
- self._traverse_java_node(child, content, file_path, symbols, functions, classes, imports)
+ self._traverse_node_single_pass(child, context, current_class=current_class,
+ current_method=current_method)
def _get_java_class_name(self, node, content: str) -> Optional[str]:
for child in node.children:
@@ -117,34 +179,31 @@ def _extract_java_package(self, node, content: str) -> Optional[str]:
return content[child.start_byte:child.end_byte]
return None
- def _analyze_java_calls(self, tree, content: str, symbols: Dict[str, SymbolInfo], file_path: str):
- """Analyze Java method calls for relationships."""
- self._find_java_calls(tree.root_node, content, symbols, file_path)
-
- def _find_java_calls(self, node, content: str, symbols: Dict[str, SymbolInfo], file_path: str, current_method: str = None):
- """Recursively find Java method calls."""
- if node.type == 'method_declaration':
- method_name = self._get_java_method_name(node, content)
- if method_name:
- current_method = self._create_symbol_id(file_path, method_name)
-
- elif node.type == 'method_invocation':
- if current_method:
- called_method = self._get_called_method_name(node, content)
- if called_method:
- # Find the called method in symbols and add relationship
- for symbol_id, symbol_info in symbols.items():
- if called_method in symbol_id.split("::")[-1]:
- if current_method not in symbol_info.called_by:
- symbol_info.called_by.append(current_method)
-
- # Continue traversing children
- for child in node.children:
- self._find_java_calls(child, content, symbols, file_path, current_method)
-
def _get_called_method_name(self, node, content: str) -> Optional[str]:
"""Extract called method name from method invocation node."""
+        # Prefer the 'name' field from the tree-sitter Java grammar so that
+        # obj.method() resolves to 'method' rather than to the receiver 'obj'.
+        name_node = node.child_by_field_name('name')
+        if name_node is not None:
+            return content[name_node.start_byte:name_node.end_byte]
+        # Fallback: the last identifier child is the method name in dotted calls
+        # and the only identifier in direct calls without a receiver.
+        last_identifier = None
         for child in node.children:
             if child.type == 'identifier':
-                return content[child.start_byte:child.end_byte]
+                last_identifier = child
+        if last_identifier is not None:
+            return content[last_identifier.start_byte:last_identifier.end_byte]
return None
+
+
+class TraversalContext:
+ """Context object to pass state during single-pass traversal."""
+
+ def __init__(self, content: str, file_path: str, symbols: Dict,
+ functions: List, classes: List, imports: List, symbol_lookup: Dict):
+ self.content = content
+ self.file_path = file_path
+ self.symbols = symbols
+ self.functions = functions
+ self.classes = classes
+ self.imports = imports
+ self.symbol_lookup = symbol_lookup
\ No newline at end of file
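
A small check of how tree-sitter exposes the invoked method name, which the traversal above relies on. This assumes the same `tree_sitter` and `tree_sitter_java` bindings the strategy already imports; the Java snippet is made up.

```python
import tree_sitter
from tree_sitter_java import language   # same binding the strategy uses

parser = tree_sitter.Parser(tree_sitter.Language(language()))
src = b"class A { void run(Helper h) { h.doWork(); } }"
tree = parser.parse(src)

def first_invocation(node):
    if node.type == 'method_invocation':
        return node
    for child in node.children:
        found = first_invocation(child)
        if found is not None:
            return found
    return None

call = first_invocation(tree.root_node)
name = call.child_by_field_name('name')             # the invoked method, not the receiver
print(src[name.start_byte:name.end_byte].decode())  # doWork
```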
diff --git a/src/code_index_mcp/indexing/strategies/python_strategy.py b/src/code_index_mcp/indexing/strategies/python_strategy.py
index 2cf62cd..a09d00c 100644
--- a/src/code_index_mcp/indexing/strategies/python_strategy.py
+++ b/src/code_index_mcp/indexing/strategies/python_strategy.py
@@ -1,10 +1,10 @@
"""
-Python parsing strategy using AST.
+Python parsing strategy using AST - Optimized single-pass version.
"""
import ast
import logging
-from typing import Dict, List, Tuple, Optional
+from typing import Dict, List, Tuple, Optional, Set
from .base_strategy import ParsingStrategy
from ..models import SymbolInfo, FileInfo
@@ -12,7 +12,7 @@
class PythonParsingStrategy(ParsingStrategy):
- """Python-specific parsing strategy using Python's built-in AST."""
+ """Python-specific parsing strategy using Python's built-in AST - Single Pass Optimized."""
def get_language_name(self) -> str:
return "python"
@@ -21,7 +21,7 @@ def get_supported_extensions(self) -> List[str]:
return ['.py', '.pyw']
def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]:
- """Parse Python file using AST."""
+ """Parse Python file using AST with single-pass optimization."""
symbols = {}
functions = []
classes = []
@@ -29,10 +29,9 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo
try:
tree = ast.parse(content)
- # Phase 1: Extract symbol definitions
- self._visit_ast_node(tree, symbols, functions, classes, imports, file_path, content)
- # Phase 2: Analyze function calls and build relationships
- self._analyze_calls(tree, symbols, file_path)
+ # Single-pass visitor that handles everything at once
+ visitor = SinglePassVisitor(symbols, functions, classes, imports, file_path)
+ visitor.visit(tree)
except SyntaxError as e:
logger.warning(f"Syntax error in Python file {file_path}: {e}")
except Exception as e:
@@ -46,158 +45,161 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo
)
return symbols, file_info
+
+
+class SinglePassVisitor(ast.NodeVisitor):
+ """Single-pass AST visitor that extracts symbols and analyzes calls in one traversal."""
- def _visit_ast_node(self, node: ast.AST, symbols: Dict, functions: List,
- classes: List, imports: List, file_path: str, content: str):
- """Visit AST nodes and extract symbols."""
- # Track processed nodes to avoid duplicates
- processed_nodes = set()
-
- # First pass: handle classes and mark their methods as processed
- for child in ast.walk(node):
- if isinstance(child, ast.ClassDef):
- self._handle_class(child, symbols, classes, file_path, functions)
- # Mark all methods in this class as processed
- for class_child in child.body:
- if isinstance(class_child, ast.FunctionDef):
- processed_nodes.add(id(class_child))
-
- # Second pass: handle standalone functions and imports
- for child in ast.walk(node):
- if isinstance(child, ast.FunctionDef) and id(child) not in processed_nodes:
- self._handle_function(child, symbols, functions, file_path)
- elif isinstance(child, (ast.Import, ast.ImportFrom)):
- self._handle_import(child, imports)
-
- def _handle_function(self, node: ast.FunctionDef, symbols: Dict, functions: List, file_path: str):
- """Handle function definition."""
- func_name = node.name
- symbol_id = self._create_symbol_id(file_path, func_name)
+ def __init__(self, symbols: Dict[str, SymbolInfo], functions: List[str],
+ classes: List[str], imports: List[str], file_path: str):
+ self.symbols = symbols
+ self.functions = functions
+ self.classes = classes
+ self.imports = imports
+ self.file_path = file_path
- # Extract function signature
- signature = self._extract_function_signature(node)
+ # Context tracking for call analysis
+ self.current_function_stack = []
+ self.current_class = None
- # Extract docstring
- docstring = ast.get_docstring(node)
+ # Symbol lookup index for O(1) access
+ self.symbol_lookup = {} # name -> symbol_id mapping for fast lookups
- symbols[symbol_id] = SymbolInfo(
- type="function",
- file=file_path,
- line=node.lineno,
- signature=signature,
- docstring=docstring
- )
- functions.append(func_name)
+ # Track processed nodes to avoid duplicates
+ self.processed_nodes: Set[int] = set()
- def _handle_class(self, node: ast.ClassDef, symbols: Dict, classes: List, file_path: str, functions: List = None):
- """Handle class definition."""
+ def visit_ClassDef(self, node: ast.ClassDef):
+ """Visit class definition - extract symbol and analyze in single pass."""
class_name = node.name
- symbol_id = self._create_symbol_id(file_path, class_name)
+ symbol_id = self._create_symbol_id(self.file_path, class_name)
# Extract docstring
docstring = ast.get_docstring(node)
- symbols[symbol_id] = SymbolInfo(
+ # Create symbol info
+ symbol_info = SymbolInfo(
type="class",
- file=file_path,
+ file=self.file_path,
line=node.lineno,
docstring=docstring
)
- classes.append(class_name)
- # Handle methods within the class
+ # Store in symbols and lookup index
+ self.symbols[symbol_id] = symbol_info
+ self.symbol_lookup[class_name] = symbol_id
+ self.classes.append(class_name)
+
+ # Track class context for method processing
+ old_class = self.current_class
+ self.current_class = class_name
+
+ # Process class body (including methods)
for child in node.body:
if isinstance(child, ast.FunctionDef):
- method_name = f"{class_name}.{child.name}"
- method_symbol_id = self._create_symbol_id(file_path, method_name)
-
- method_signature = self._extract_function_signature(child)
- method_docstring = ast.get_docstring(child)
-
- symbols[method_symbol_id] = SymbolInfo(
- type="method",
- file=file_path,
- line=child.lineno,
- signature=method_signature,
- docstring=method_docstring
- )
-
- # Add method to functions list if provided
- if functions is not None:
- functions.append(method_name)
-
- def _handle_import(self, node, imports: List):
- """Handle import statements."""
- if isinstance(node, ast.Import):
- for alias in node.names:
- imports.append(alias.name)
- elif isinstance(node, ast.ImportFrom):
- if node.module:
- for alias in node.names:
- imports.append(f"{node.module}.{alias.name}")
+ self._handle_method(child, class_name)
+ else:
+ # Visit other nodes in class body
+ self.visit(child)
+
+ # Restore previous class context
+ self.current_class = old_class
- def _extract_function_signature(self, node: ast.FunctionDef) -> str:
- """Extract function signature from AST node."""
- # Build basic signature
- args = []
+ def visit_FunctionDef(self, node: ast.FunctionDef):
+ """Visit function definition - extract symbol and track context."""
+ # Skip if this is a method (already handled by ClassDef)
+ if self.current_class:
+ return
- # Regular arguments
- for arg in node.args.args:
- args.append(arg.arg)
+ # Skip if already processed
+ node_id = id(node)
+ if node_id in self.processed_nodes:
+ return
+ self.processed_nodes.add(node_id)
- # Varargs (*args)
- if node.args.vararg:
- args.append(f"*{node.args.vararg.arg}")
+ func_name = node.name
+ symbol_id = self._create_symbol_id(self.file_path, func_name)
- # Keyword arguments (**kwargs)
- if node.args.kwarg:
- args.append(f"**{node.args.kwarg.arg}")
+ # Extract function signature and docstring
+ signature = self._extract_function_signature(node)
+ docstring = ast.get_docstring(node)
- signature = f"def {node.name}({', '.join(args)}):"
- return signature
-
- def _analyze_calls(self, tree: ast.AST, symbols: Dict[str, SymbolInfo], file_path: str):
- """Analyze function calls and build caller-callee relationships."""
- visitor = CallAnalysisVisitor(symbols, file_path)
- visitor.visit(tree)
-
-
-class CallAnalysisVisitor(ast.NodeVisitor):
- """AST visitor to analyze function calls and build caller-callee relationships."""
-
- def __init__(self, symbols: Dict[str, SymbolInfo], file_path: str):
- self.symbols = symbols
- self.file_path = file_path
- self.current_function_stack = []
- self.current_class = None
-
- def visit_ClassDef(self, node: ast.ClassDef):
- """Visit class definition and track context."""
- self.current_class = node.name
+ # Create symbol info
+ symbol_info = SymbolInfo(
+ type="function",
+ file=self.file_path,
+ line=node.lineno,
+ signature=signature,
+ docstring=docstring
+ )
+
+ # Store in symbols and lookup index
+ self.symbols[symbol_id] = symbol_info
+ self.symbol_lookup[func_name] = symbol_id
+ self.functions.append(func_name)
+
+ # Track function context for call analysis
+ function_id = f"{self.file_path}::{func_name}"
+ self.current_function_stack.append(function_id)
+
+ # Visit function body to analyze calls
self.generic_visit(node)
- self.current_class = None
+
+ # Pop function from stack
+ self.current_function_stack.pop()
- def visit_FunctionDef(self, node: ast.FunctionDef):
- """Visit function definition and track context."""
- # File path is already relative after our fix
- relative_path = self.file_path
+ def _handle_method(self, node: ast.FunctionDef, class_name: str):
+ """Handle method definition within a class."""
+ method_name = f"{class_name}.{node.name}"
+ method_symbol_id = self._create_symbol_id(self.file_path, method_name)
- # Handle methods within classes
- if self.current_class:
- function_id = f"{relative_path}::{self.current_class}.{node.name}"
- else:
- function_id = f"{relative_path}::{node.name}"
-
+ method_signature = self._extract_function_signature(node)
+ method_docstring = ast.get_docstring(node)
+
+ # Create symbol info
+ symbol_info = SymbolInfo(
+ type="method",
+ file=self.file_path,
+ line=node.lineno,
+ signature=method_signature,
+ docstring=method_docstring
+ )
+
+ # Store in symbols and lookup index
+ self.symbols[method_symbol_id] = symbol_info
+ self.symbol_lookup[method_name] = method_symbol_id
+ self.symbol_lookup[node.name] = method_symbol_id # Also index by method name alone
+ self.functions.append(method_name)
+
+ # Track method context for call analysis
+ function_id = f"{self.file_path}::{method_name}"
self.current_function_stack.append(function_id)
- # Visit all child nodes within this function
- self.generic_visit(node)
+ # Visit method body to analyze calls
+ for child in node.body:
+ self.visit(child)
- # Pop the function from stack when done
+ # Pop method from stack
self.current_function_stack.pop()
+ def visit_Import(self, node: ast.Import):
+ """Handle import statements."""
+ for alias in node.names:
+ self.imports.append(alias.name)
+ self.generic_visit(node)
+
+ def visit_ImportFrom(self, node: ast.ImportFrom):
+ """Handle from...import statements."""
+ if node.module:
+ for alias in node.names:
+ self.imports.append(f"{node.module}.{alias.name}")
+ self.generic_visit(node)
+
def visit_Call(self, node: ast.Call):
- """Visit function call and record relationship."""
+ """Visit function call and record relationship using O(1) lookup."""
+ if not self.current_function_stack:
+ self.generic_visit(node)
+ return
+
try:
# Get the function name being called
called_function = None
@@ -208,28 +210,55 @@ def visit_Call(self, node: ast.Call):
elif isinstance(node.func, ast.Attribute):
# Method call: obj.method() or module.function()
called_function = node.func.attr
-
- if called_function and self.current_function_stack:
+
+ if called_function:
# Get the current calling function
caller_function = self.current_function_stack[-1]
- # Look for the called function in our symbols and add relationship
- for symbol_id, symbol_info in self.symbols.items():
+ # Use O(1) lookup instead of O(n) iteration
+ # First try exact match
+ if called_function in self.symbol_lookup:
+ symbol_id = self.symbol_lookup[called_function]
+ symbol_info = self.symbols[symbol_id]
if symbol_info.type in ["function", "method"]:
- # Extract just the function/method name from the symbol ID
- symbol_name = symbol_id.split("::")[-1]
-
- # Check for exact match or method name match (ClassName.method)
- if (symbol_name == called_function or
- symbol_name.endswith(f".{called_function}")):
- # Add caller to the called function's called_by list
- if caller_function not in symbol_info.called_by:
- symbol_info.called_by.append(caller_function)
- break
+ if caller_function not in symbol_info.called_by:
+ symbol_info.called_by.append(caller_function)
+ else:
+ # Try method name match for any class
+ for name, symbol_id in self.symbol_lookup.items():
+ if name.endswith(f".{called_function}"):
+ symbol_info = self.symbols[symbol_id]
+ if symbol_info.type in ["function", "method"]:
+ if caller_function not in symbol_info.called_by:
+ symbol_info.called_by.append(caller_function)
+ break
except Exception:
# Silently handle parsing errors for complex call patterns
pass
-
+
# Continue visiting child nodes
self.generic_visit(node)
+ def _create_symbol_id(self, file_path: str, symbol_name: str) -> str:
+ """Create a unique symbol ID."""
+ return f"{file_path}::{symbol_name}"
+
+ def _extract_function_signature(self, node: ast.FunctionDef) -> str:
+ """Extract function signature from AST node."""
+ # Build basic signature
+ args = []
+
+ # Regular arguments
+ for arg in node.args.args:
+ args.append(arg.arg)
+
+ # Varargs (*args)
+ if node.args.vararg:
+ args.append(f"*{node.args.vararg.arg}")
+
+ # Keyword arguments (**kwargs)
+ if node.args.kwarg:
+ args.append(f"**{node.args.kwarg.arg}")
+
+ signature = f"def {node.name}({', '.join(args)}):"
+ return signature
\ No newline at end of file
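
The visitor above folds symbol collection and call analysis into one traversal. A stripped-down sketch of that pattern (functions only, no classes, signatures, or docstrings) shows the O(1) name lookup and the context stack in isolation:

```python
import ast

class MiniVisitor(ast.NodeVisitor):
    """Toy single-pass visitor: definitions plus called_by edges in one traversal."""

    def __init__(self, path: str):
        self.path = path
        self.called_by = {}     # symbol id -> list of caller ids
        self.lookup = {}        # bare name -> symbol id (O(1) resolution)
        self.stack = []         # enclosing function context

    def visit_FunctionDef(self, node: ast.FunctionDef):
        symbol_id = f"{self.path}::{node.name}"
        self.called_by.setdefault(symbol_id, [])
        self.lookup[node.name] = symbol_id
        self.stack.append(symbol_id)
        self.generic_visit(node)    # analyze calls in the body during the same pass
        self.stack.pop()

    def visit_Call(self, node: ast.Call):
        if self.stack and isinstance(node.func, ast.Name):
            target = self.lookup.get(node.func.id)
            if target and self.stack[-1] not in self.called_by[target]:
                self.called_by[target].append(self.stack[-1])
        self.generic_visit(node)

source = "def helper():\n    pass\n\ndef main():\n    helper()\n"
visitor = MiniVisitor("demo.py")
visitor.visit(ast.parse(source))
print(visitor.called_by)   # {'demo.py::helper': ['demo.py::main'], 'demo.py::main': []}
```

One consequence of the single-pass design, which appears to hold for the real visitor as well, is that calls to symbols defined later in the same file are not linked, since the lookup table only contains names seen so far.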
diff --git a/src/code_index_mcp/indexing/strategies/typescript_strategy.py b/src/code_index_mcp/indexing/strategies/typescript_strategy.py
index efd2ec9..05ed04d 100644
--- a/src/code_index_mcp/indexing/strategies/typescript_strategy.py
+++ b/src/code_index_mcp/indexing/strategies/typescript_strategy.py
@@ -1,9 +1,9 @@
"""
-TypeScript parsing strategy using tree-sitter.
+TypeScript parsing strategy using tree-sitter - Optimized single-pass version.
"""
import logging
-from typing import Dict, List, Tuple, Optional
+from typing import Dict, List, Tuple, Optional, Set
from .base_strategy import ParsingStrategy
from ..models import SymbolInfo, FileInfo
@@ -14,7 +14,7 @@
class TypeScriptParsingStrategy(ParsingStrategy):
- """TypeScript-specific parsing strategy using tree-sitter."""
+ """TypeScript-specific parsing strategy using tree-sitter - Single Pass Optimized."""
def __init__(self):
self.ts_language = tree_sitter.Language(language_typescript())
@@ -26,19 +26,32 @@ def get_supported_extensions(self) -> List[str]:
return ['.ts', '.tsx']
def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]:
- """Parse TypeScript file using tree-sitter."""
+ """Parse TypeScript file using tree-sitter with single-pass optimization."""
symbols = {}
functions = []
classes = []
imports = []
exports = []
+ # Symbol lookup index for O(1) access
+ symbol_lookup = {} # name -> symbol_id mapping
+
parser = tree_sitter.Parser(self.ts_language)
tree = parser.parse(content.encode('utf8'))
- # Phase 1: Extract symbols
- self._traverse_ts_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports)
- # Phase 2: Analyze function calls using tree-sitter
- self._analyze_ts_calls_with_tree_sitter(tree.root_node, content, file_path, symbols)
+
+ # Single-pass traversal that handles everything
+ context = TraversalContext(
+ content=content,
+ file_path=file_path,
+ symbols=symbols,
+ functions=functions,
+ classes=classes,
+ imports=imports,
+ exports=exports,
+ symbol_lookup=symbol_lookup
+ )
+
+ self._traverse_node_single_pass(tree.root_node, context)
file_info = FileInfo(
language=self.get_language_name(),
@@ -50,63 +63,145 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo
return symbols, file_info
- def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo],
- functions: List[str], classes: List[str], imports: List[str], exports: List[str]):
- """Traverse TypeScript AST node."""
+ def _traverse_node_single_pass(self, node, context: 'TraversalContext',
+ current_function: Optional[str] = None,
+ current_class: Optional[str] = None):
+ """Single-pass traversal that extracts symbols and analyzes calls."""
+
+ # Handle function declarations
if node.type == 'function_declaration':
- name = self._get_function_name(node, content)
+ name = self._get_function_name(node, context.content)
if name:
- symbol_id = self._create_symbol_id(file_path, name)
- signature = self._get_ts_function_signature(node, content)
- symbols[symbol_id] = SymbolInfo(
+ symbol_id = self._create_symbol_id(context.file_path, name)
+ signature = self._get_ts_function_signature(node, context.content)
+ symbol_info = SymbolInfo(
type="function",
- file=file_path,
+ file=context.file_path,
line=node.start_point[0] + 1,
signature=signature
)
- functions.append(name)
-
+ context.symbols[symbol_id] = symbol_info
+ context.symbol_lookup[name] = symbol_id
+ context.functions.append(name)
+
+ # Traverse function body with updated context
+ func_context = f"{context.file_path}::{name}"
+ for child in node.children:
+ self._traverse_node_single_pass(child, context, current_function=func_context,
+ current_class=current_class)
+ return
+
+ # Handle class declarations
elif node.type == 'class_declaration':
- name = self._get_class_name(node, content)
+ name = self._get_class_name(node, context.content)
if name:
- symbol_id = self._create_symbol_id(file_path, name)
- symbols[symbol_id] = SymbolInfo(
+ symbol_id = self._create_symbol_id(context.file_path, name)
+ symbol_info = SymbolInfo(
type="class",
- file=file_path,
+ file=context.file_path,
line=node.start_point[0] + 1
)
- classes.append(name)
+ context.symbols[symbol_id] = symbol_info
+ context.symbol_lookup[name] = symbol_id
+ context.classes.append(name)
+ # Traverse class body with updated context
+ for child in node.children:
+ self._traverse_node_single_pass(child, context, current_function=current_function,
+ current_class=name)
+ return
+
+ # Handle interface declarations
elif node.type == 'interface_declaration':
- name = self._get_interface_name(node, content)
+ name = self._get_interface_name(node, context.content)
if name:
- symbol_id = self._create_symbol_id(file_path, name)
- symbols[symbol_id] = SymbolInfo(
+ symbol_id = self._create_symbol_id(context.file_path, name)
+ symbol_info = SymbolInfo(
type="interface",
- file=file_path,
+ file=context.file_path,
line=node.start_point[0] + 1
)
- classes.append(name) # Group interfaces with classes for simplicity
+ context.symbols[symbol_id] = symbol_info
+ context.symbol_lookup[name] = symbol_id
+ context.classes.append(name) # Group interfaces with classes
+
+ # Traverse interface body with updated context
+ for child in node.children:
+ self._traverse_node_single_pass(child, context, current_function=current_function,
+ current_class=name)
+ return
+ # Handle method definitions
elif node.type == 'method_definition':
- method_name = self._get_method_name(node, content)
- class_name = self._find_parent_class(node, content)
- if method_name and class_name:
- full_name = f"{class_name}.{method_name}"
- symbol_id = self._create_symbol_id(file_path, full_name)
- signature = self._get_ts_function_signature(node, content)
- symbols[symbol_id] = SymbolInfo(
+ method_name = self._get_method_name(node, context.content)
+ if method_name and current_class:
+ full_name = f"{current_class}.{method_name}"
+ symbol_id = self._create_symbol_id(context.file_path, full_name)
+ signature = self._get_ts_function_signature(node, context.content)
+ symbol_info = SymbolInfo(
type="method",
- file=file_path,
+ file=context.file_path,
line=node.start_point[0] + 1,
signature=signature
)
- # Add method to functions list for consistency
- functions.append(full_name)
+ context.symbols[symbol_id] = symbol_info
+ context.symbol_lookup[full_name] = symbol_id
+ context.symbol_lookup[method_name] = symbol_id # Also index by method name alone
+ context.functions.append(full_name)
+
+ # Traverse method body with updated context
+ method_context = f"{context.file_path}::{full_name}"
+ for child in node.children:
+ self._traverse_node_single_pass(child, context, current_function=method_context,
+ current_class=current_class)
+ return
+
+ # Handle function calls
+ elif node.type == 'call_expression' and current_function:
+ # Extract the function being called
+ called_function = None
+ if node.children:
+ func_node = node.children[0]
+ if func_node.type == 'identifier':
+ # Direct function call
+ called_function = context.content[func_node.start_byte:func_node.end_byte]
+ elif func_node.type == 'member_expression':
+ # Method call (obj.method or this.method)
+ for child in func_node.children:
+ if child.type == 'property_identifier':
+ called_function = context.content[child.start_byte:child.end_byte]
+ break
+
+ # Add relationship using O(1) lookup
+ if called_function:
+ if called_function in context.symbol_lookup:
+ symbol_id = context.symbol_lookup[called_function]
+ symbol_info = context.symbols[symbol_id]
+ if current_function not in symbol_info.called_by:
+ symbol_info.called_by.append(current_function)
+ else:
+ # Try to find method with class prefix
+ for name, sid in context.symbol_lookup.items():
+ if name.endswith(f".{called_function}"):
+ symbol_info = context.symbols[sid]
+ if current_function not in symbol_info.called_by:
+ symbol_info.called_by.append(current_function)
+ break
+
+ # Handle import declarations
+ elif node.type == 'import_statement':
+ import_text = context.content[node.start_byte:node.end_byte]
+ context.imports.append(import_text)
- # Continue traversing children
+ # Handle export declarations
+ elif node.type in ['export_statement', 'export_default_declaration']:
+ export_text = context.content[node.start_byte:node.end_byte]
+ context.exports.append(export_text)
+
+ # Continue traversing children for other node types
for child in node.children:
- self._traverse_ts_node(child, content, file_path, symbols, functions, classes, imports, exports)
+ self._traverse_node_single_pass(child, context, current_function=current_function,
+ current_class=current_class)
def _get_function_name(self, node, content: str) -> Optional[str]:
"""Extract function name from tree-sitter node."""
@@ -136,65 +231,21 @@ def _get_method_name(self, node, content: str) -> Optional[str]:
return content[child.start_byte:child.end_byte]
return None
- def _find_parent_class(self, node, content: str) -> Optional[str]:
- """Find the parent class of a method."""
- parent = node.parent
- while parent:
- if parent.type in ['class_declaration', 'interface_declaration']:
- return self._get_class_name(parent, content) or self._get_interface_name(parent, content)
- parent = parent.parent
- return None
-
def _get_ts_function_signature(self, node, content: str) -> str:
"""Extract TypeScript function signature."""
return content[node.start_byte:node.end_byte].split('\n')[0].strip()
- def _analyze_ts_calls_with_tree_sitter(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo],
- current_function: Optional[str] = None, current_class: Optional[str] = None):
- """Analyze TypeScript function calls using tree-sitter AST."""
- # Track function/method context
- if node.type == 'function_declaration':
- func_name = self._get_function_name(node, content)
- if func_name:
- current_function = f"{file_path}::{func_name}"
- elif node.type == 'method_definition':
- method_name = self._get_method_name(node, content)
- parent_class = self._find_parent_class(node, content)
- if method_name and parent_class:
- current_function = f"{file_path}::{parent_class}.{method_name}"
- elif node.type == 'class_declaration':
- current_class = self._get_class_name(node, content)
-
- # Detect function calls
- if node.type == 'call_expression' and current_function:
- # Extract the function being called
- called_function = None
- if node.children:
- func_node = node.children[0]
- if func_node.type == 'identifier':
- # Direct function call
- called_function = content[func_node.start_byte:func_node.end_byte]
- elif func_node.type == 'member_expression':
- # Method call (obj.method or this.method)
- for child in func_node.children:
- if child.type == 'property_identifier':
- called_function = content[child.start_byte:child.end_byte]
- break
-
- # Add relationship if we found the called function
- if called_function:
- for symbol_id, symbol_info in symbols.items():
- if symbol_info.type in ["function", "method"]:
- symbol_name = symbol_id.split("::")[-1]
- # Check for exact match or method name match
- if (symbol_name == called_function or
- symbol_name.endswith(f".{called_function}")):
- if current_function not in symbol_info.called_by:
- symbol_info.called_by.append(current_function)
- break
-
- # Recursively process children
- for child in node.children:
- self._analyze_ts_calls_with_tree_sitter(child, content, file_path, symbols, current_function, current_class)
+class TraversalContext:
+ """Context object to pass state during single-pass traversal."""
+ def __init__(self, content: str, file_path: str, symbols: Dict,
+ functions: List, classes: List, imports: List, exports: List, symbol_lookup: Dict):
+ self.content = content
+ self.file_path = file_path
+ self.symbols = symbols
+ self.functions = functions
+ self.classes = classes
+ self.imports = imports
+ self.exports = exports
+ self.symbol_lookup = symbol_lookup
\ No newline at end of file
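
A usage sketch for the optimized TypeScript strategy, assuming the package and its tree-sitter TypeScript bindings are installed; the file name and snippet are made up. Because symbols are registered as they are encountered, a call only resolves when its target was defined earlier in the file (`format()` before `greet()` below).

```python
from code_index_mcp.indexing.strategies.typescript_strategy import TypeScriptParsingStrategy

source = """
class Greeter {
  format(): string { return "hi"; }
  greet(): string { return this.format(); }
}
"""

strategy = TypeScriptParsingStrategy()
symbols, _ = strategy.parse_file("src/greeter.ts", source)
for symbol_id, info in symbols.items():
    print(symbol_id, info.type, info.called_by)
# Greeter.format should list src/greeter.ts::Greeter.greet as a caller
```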
diff --git a/src/code_index_mcp/search/ag.py b/src/code_index_mcp/search/ag.py
index e2506a2..aa3eb33 100644
--- a/src/code_index_mcp/search/ag.py
+++ b/src/code_index_mcp/search/ag.py
@@ -27,7 +27,8 @@ def search(
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: bool = False
+ regex: bool = False,
+ max_line_length: Optional[int] = None
) -> Dict[str, List[Tuple[int, str]]]:
"""
Execute a search using The Silver Searcher (ag).
@@ -40,6 +41,7 @@ def search(
file_pattern: File pattern to filter
fuzzy: Enable word boundary matching (not true fuzzy search)
regex: Enable regex pattern matching
+            max_line_length: Optional. Truncate returned lines longer than this many characters
"""
# ag prints line numbers and groups by file by default, which is good.
# --noheading is used to be consistent with other tools' output format.
@@ -93,6 +95,26 @@ def search(
cmd.extend(['-G', regex_pattern])
+ processed_patterns = set()
+ exclude_dirs = getattr(self, 'exclude_dirs', [])
+ exclude_file_patterns = getattr(self, 'exclude_file_patterns', [])
+
+ for directory in exclude_dirs:
+ normalized = directory.strip()
+ if not normalized or normalized in processed_patterns:
+ continue
+ cmd.extend(['--ignore', normalized])
+ processed_patterns.add(normalized)
+
+ for pattern in exclude_file_patterns:
+ normalized = pattern.strip()
+ if not normalized or normalized in processed_patterns:
+ continue
+ if normalized.startswith('!'):
+ normalized = normalized[1:]
+ cmd.extend(['--ignore', normalized])
+ processed_patterns.add(normalized)
+
# Add -- to treat pattern as a literal argument, preventing injection
cmd.append('--')
cmd.append(search_pattern)
@@ -116,10 +138,10 @@ def search(
if process.returncode > 1:
raise RuntimeError(f"ag failed with exit code {process.returncode}: {process.stderr}")
- return parse_search_output(process.stdout, base_path)
+ return parse_search_output(process.stdout, base_path, max_line_length)
except FileNotFoundError:
raise RuntimeError("'ag' (The Silver Searcher) not found. Please install it and ensure it's in your PATH.")
except Exception as e:
# Re-raise other potential exceptions like permission errors
- raise RuntimeError(f"An error occurred while running ag: {e}")
+ raise RuntimeError(f"An error occurred while running ag: {e}")
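
A condensed view of the de-duplication performed above: both directory names and file patterns end up as `--ignore` arguments, which ag applies to files and directories alike. The helper name and sample inputs here are illustrative only.

```python
def build_ag_ignore_flags(exclude_dirs, exclude_file_patterns):
    flags, seen = [], set()
    for raw in list(exclude_dirs) + list(exclude_file_patterns):
        normalized = raw.strip()
        if normalized.startswith('!'):
            normalized = normalized[1:]
        if not normalized or normalized in seen:
            continue
        flags.extend(['--ignore', normalized])
        seen.add(normalized)
    return flags

print(build_ag_ignore_flags(['node_modules', '.git'], ['*.min.js', '!*.map']))
# ['--ignore', 'node_modules', '--ignore', '.git', '--ignore', '*.min.js', '--ignore', '*.map']
```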
diff --git a/src/code_index_mcp/search/base.py b/src/code_index_mcp/search/base.py
index 038e6b5..5e4c63b 100644
--- a/src/code_index_mcp/search/base.py
+++ b/src/code_index_mcp/search/base.py
@@ -10,17 +10,25 @@
import subprocess
import sys
from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Tuple, Any
+from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING
from ..indexing.qualified_names import normalize_file_path
-def parse_search_output(output: str, base_path: str) -> Dict[str, List[Tuple[int, str]]]:
+if TYPE_CHECKING: # pragma: no cover
+ from ..utils.file_filter import FileFilter
+
+def parse_search_output(
+ output: str,
+ base_path: str,
+ max_line_length: Optional[int] = None
+) -> Dict[str, List[Tuple[int, str]]]:
"""
Parse the output of command-line search tools (grep, ag, rg).
Args:
output: The raw output from the command-line tool.
base_path: The base path of the project to make file paths relative.
+ max_line_length: Optional maximum line length to truncate long lines.
Returns:
A dictionary where keys are file paths and values are lists of (line_number, line_content) tuples.
@@ -33,26 +41,53 @@ def parse_search_output(output: str, base_path: str) -> Dict[str, List[Tuple[int
if not line.strip():
continue
try:
- # Handle Windows paths which might have a drive letter, e.g., C:
+ # Try to parse as a matched line first (format: path:linenum:content)
parts = line.split(':', 2)
- if sys.platform == "win32" and len(parts[0]) == 1 and parts[1].startswith('\\'):
- # Re-join drive letter with the rest of the path
+
+ # Check if this might be a context line (format: path-linenum-content)
+ # Context lines use '-' as separator in grep/ag output
+ if len(parts) < 3 and '-' in line:
+ # Try to parse as context line
+ # Match pattern: path-linenum-content or path-linenum-\tcontent
+ match = re.match(r'^(.*?)-(\d+)[-\t](.*)$', line)
+ if match:
+ file_path_abs = match.group(1)
+ line_number_str = match.group(2)
+ content = match.group(3)
+ else:
+ # If regex doesn't match, skip this line
+ continue
+ elif sys.platform == "win32" and len(parts) >= 3 and len(parts[0]) == 1 and parts[1].startswith('\\'):
+ # Handle Windows paths with drive letter (e.g., C:\path\file.txt)
file_path_abs = f"{parts[0]}:{parts[1]}"
line_number_str = parts[2].split(':', 1)[0]
- content = parts[2].split(':', 1)[1]
- else:
+ content = parts[2].split(':', 1)[1] if ':' in parts[2] else parts[2]
+ elif len(parts) >= 3:
+ # Standard format: path:linenum:content
file_path_abs = parts[0]
line_number_str = parts[1]
content = parts[2]
+ else:
+ # Line doesn't match any expected format
+ continue
line_number = int(line_number_str)
- # Make the file path relative to the base_path
- relative_path = os.path.relpath(file_path_abs, normalized_base_path)
+ # If the path is already relative (doesn't start with /), keep it as is
+ # Otherwise, make it relative to the base_path
+ if os.path.isabs(file_path_abs):
+ relative_path = os.path.relpath(file_path_abs, normalized_base_path)
+ else:
+ # Path is already relative, use it as is
+ relative_path = file_path_abs
# Normalize path separators for consistency
relative_path = normalize_file_path(relative_path)
+ # Truncate content if it exceeds max_line_length
+ if max_line_length and len(content) > max_line_length:
+ content = content[:max_line_length] + '... (truncated)'
+
if relative_path not in results:
results[relative_path] = []
results[relative_path].append((line_number, content))
@@ -150,6 +185,16 @@ class SearchStrategy(ABC):
Each strategy is responsible for searching code using a specific tool or method.
"""
+ def configure_excludes(self, file_filter: Optional['FileFilter']) -> None:
+ """Configure shared exclusion settings for the strategy."""
+ self.file_filter = file_filter
+ if file_filter:
+ self.exclude_dirs = sorted(set(file_filter.exclude_dirs))
+ self.exclude_file_patterns = sorted(set(file_filter.exclude_files))
+ else:
+ self.exclude_dirs = []
+ self.exclude_file_patterns = []
+
@property
@abstractmethod
def name(self) -> str:
@@ -175,7 +220,8 @@ def search(
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: bool = False
+ regex: bool = False,
+ max_line_length: Optional[int] = None
) -> Dict[str, List[Tuple[int, str]]]:
"""
Execute a search using the specific strategy.
@@ -193,4 +239,3 @@ def search(
A dictionary mapping filenames to lists of (line_number, line_content) tuples.
"""
pass
-
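
A quick illustration of the new truncation and context-line handling, assuming the package is importable; the sample output string is made up, while real input comes from grep/ag/rg/ug.

```python
from code_index_mcp.search.base import parse_search_output

raw = "src/app.py:12:" + "x" * 300 + "\nsrc/app.py-13-    follow_up()\n"
hits = parse_search_output(raw, "/repo", max_line_length=80)
print(hits["src/app.py"][0])   # (12, 'xxx...x... (truncated)')  -- long match truncated
print(hits["src/app.py"][1])   # (13, '    follow_up()')         -- context line kept
```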
diff --git a/src/code_index_mcp/search/basic.py b/src/code_index_mcp/search/basic.py
index 57aab77..9ef1846 100644
--- a/src/code_index_mcp/search/basic.py
+++ b/src/code_index_mcp/search/basic.py
@@ -1,9 +1,10 @@
"""
Basic, pure-Python search strategy.
"""
+import fnmatch
import os
import re
-import fnmatch
+from pathlib import Path
from typing import Dict, List, Optional, Tuple
from .base import SearchStrategy, create_word_boundary_pattern, is_safe_regex_pattern
@@ -46,7 +47,8 @@ def search(
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: bool = False
+ regex: bool = False,
+ max_line_length: Optional[int] = None
) -> Dict[str, List[Tuple[int, str]]]:
"""
Execute a basic, line-by-line search.
@@ -60,6 +62,7 @@ def search(
file_pattern: File pattern to filter
fuzzy: Enable word boundary matching
regex: Enable regex pattern matching
+            max_line_length: Optional. Truncate returned lines longer than this many characters
"""
results: Dict[str, List[Tuple[int, str]]] = {}
@@ -81,28 +84,38 @@ def search(
except re.error as e:
raise ValueError(f"Invalid regex pattern: {pattern}, error: {e}")
- for root, _, files in os.walk(base_path):
+ file_filter = getattr(self, 'file_filter', None)
+ base = Path(base_path)
+
+ for root, dirs, files in os.walk(base_path):
+ if file_filter:
+ dirs[:] = [d for d in dirs if not file_filter.should_exclude_directory(d)]
+
for file in files:
- # Improved file pattern matching with glob support
if file_pattern and not self._matches_pattern(file, file_pattern):
continue
- file_path = os.path.join(root, file)
+ file_path = Path(root) / file
+
+ if file_filter and not file_filter.should_process_path(file_path, base):
+ continue
+
rel_path = os.path.relpath(file_path, base_path)
-
+
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
for line_num, line in enumerate(f, 1):
if search_regex.search(line):
+ content = line.rstrip('\n')
+ if max_line_length and len(content) > max_line_length:
+ content = content[:max_line_length] + '... (truncated)'
+
if rel_path not in results:
results[rel_path] = []
- # Strip newline for consistent output
- results[rel_path].append((line_num, line.rstrip('\n')))
+ results[rel_path].append((line_num, content))
except (UnicodeDecodeError, PermissionError, OSError):
- # Ignore files that can't be opened or read due to encoding/permission issues
continue
except Exception:
- # Ignore any other unexpected exceptions to maintain robustness
continue
- return results
\ No newline at end of file
+ return results
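
Both the basic strategy and `_is_index_fresh` rely on the same `os.walk` idiom: assigning to `dirs[:]` (not `dirs`) is what actually prunes excluded directories from the walk. A self-contained illustration with made-up directory names:

```python
import os

EXCLUDED = {'.git', 'node_modules', '__pycache__'}   # illustrative set

def walk_filtered(base_path):
    for root, dirs, files in os.walk(base_path):
        # In-place slice assignment mutates the list os.walk will descend into;
        # rebinding `dirs = [...]` would leave the walk untouched.
        dirs[:] = [d for d in dirs if d not in EXCLUDED]
        for name in files:
            yield os.path.join(root, name)

for path in walk_filtered('.'):
    pass   # paths under excluded directories never appear
```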
diff --git a/src/code_index_mcp/search/grep.py b/src/code_index_mcp/search/grep.py
index cd2d18e..f24c469 100644
--- a/src/code_index_mcp/search/grep.py
+++ b/src/code_index_mcp/search/grep.py
@@ -32,7 +32,8 @@ def search(
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: bool = False
+ regex: bool = False,
+ max_line_length: Optional[int] = None
) -> Dict[str, List[Tuple[int, str]]]:
"""
Execute a search using standard grep.
@@ -45,6 +46,7 @@ def search(
file_pattern: File pattern to filter
fuzzy: Enable word boundary matching
regex: Enable regex pattern matching
+            max_line_length: Optional. Truncate returned lines longer than this many characters
"""
# -r: recursive, -n: line number
cmd = ['grep', '-r', '-n']
@@ -81,6 +83,27 @@ def search(
# Note: grep's --include uses glob patterns, not regex
cmd.append(f'--include={file_pattern}')
+ exclude_dirs = getattr(self, 'exclude_dirs', [])
+ exclude_file_patterns = getattr(self, 'exclude_file_patterns', [])
+
+ processed_dirs = set()
+ for directory in exclude_dirs:
+ normalized = directory.strip()
+ if not normalized or normalized in processed_dirs:
+ continue
+ cmd.append(f'--exclude-dir={normalized}')
+ processed_dirs.add(normalized)
+
+ processed_files = set()
+ for pattern in exclude_file_patterns:
+ normalized = pattern.strip()
+ if not normalized or normalized in processed_files:
+ continue
+ if normalized.startswith('!'):
+ normalized = normalized[1:]
+ cmd.append(f'--exclude={normalized}')
+ processed_files.add(normalized)
+
# Add -- to treat pattern as a literal argument, preventing injection
cmd.append('--')
cmd.append(search_pattern)
@@ -102,9 +125,9 @@ def search(
if process.returncode > 1:
raise RuntimeError(f"grep failed with exit code {process.returncode}: {process.stderr}")
- return parse_search_output(process.stdout, base_path)
+ return parse_search_output(process.stdout, base_path, max_line_length)
except FileNotFoundError:
raise RuntimeError("'grep' not found. Please install it and ensure it's in your PATH.")
except Exception as e:
- raise RuntimeError(f"An error occurred while running grep: {e}")
+ raise RuntimeError(f"An error occurred while running grep: {e}")
diff --git a/src/code_index_mcp/search/ripgrep.py b/src/code_index_mcp/search/ripgrep.py
index 15dd6c0..8a5c325 100644
--- a/src/code_index_mcp/search/ripgrep.py
+++ b/src/code_index_mcp/search/ripgrep.py
@@ -27,7 +27,8 @@ def search(
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: bool = False
+ regex: bool = False,
+ max_line_length: Optional[int] = None
) -> Dict[str, List[Tuple[int, str]]]:
"""
Execute a search using ripgrep.
@@ -40,6 +41,7 @@ def search(
file_pattern: File pattern to filter
fuzzy: Enable word boundary matching (not true fuzzy search)
regex: Enable regex pattern matching
+            max_line_length: Optional. Truncate returned lines longer than this many characters
"""
cmd = ['rg', '--line-number', '--no-heading', '--color=never', '--no-ignore']
@@ -67,6 +69,31 @@ def search(
if file_pattern:
cmd.extend(['--glob', file_pattern])
+ exclude_dirs = getattr(self, 'exclude_dirs', [])
+ exclude_file_patterns = getattr(self, 'exclude_file_patterns', [])
+
+ processed_patterns = set()
+
+ for directory in exclude_dirs:
+ normalized = directory.strip()
+ if not normalized or normalized in processed_patterns:
+ continue
+ cmd.extend(['--glob', f'!**/{normalized}/**'])
+ processed_patterns.add(normalized)
+
+ for pattern in exclude_file_patterns:
+ normalized = pattern.strip()
+ if not normalized or normalized in processed_patterns:
+ continue
+ if normalized.startswith('!'):
+ glob_pattern = normalized
+ elif any(ch in normalized for ch in '*?[') or '/' in normalized:
+ glob_pattern = f'!{normalized}'
+ else:
+ glob_pattern = f'!**/{normalized}'
+ cmd.extend(['--glob', glob_pattern])
+ processed_patterns.add(normalized)
+
# Add -- to treat pattern as a literal argument, preventing injection
cmd.append('--')
cmd.append(search_pattern)
@@ -87,10 +114,10 @@ def search(
if process.returncode > 1:
raise RuntimeError(f"ripgrep failed with exit code {process.returncode}: {process.stderr}")
- return parse_search_output(process.stdout, base_path)
+ return parse_search_output(process.stdout, base_path, max_line_length)
except FileNotFoundError:
raise RuntimeError("ripgrep (rg) not found. Please install it and ensure it's in your PATH.")
except Exception as e:
# Re-raise other potential exceptions like permission errors
- raise RuntimeError(f"An error occurred while running ripgrep: {e}")
+ raise RuntimeError(f"An error occurred while running ripgrep: {e}")
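
The ripgrep variant expresses the same exclusions as negated `--glob` patterns. A standalone sketch of the pattern shapes it produces (de-duplication omitted for brevity; sample inputs are illustrative):

```python
def rg_exclusion_globs(exclude_dirs, exclude_file_patterns):
    globs = [f'!**/{d.strip()}/**' for d in exclude_dirs if d.strip()]
    for raw in exclude_file_patterns:
        p = raw.strip()
        if not p:
            continue
        if p.startswith('!'):
            globs.append(p)                              # already negated
        elif any(ch in p for ch in '*?[') or '/' in p:
            globs.append(f'!{p}')                        # glob or path: negate as-is
        else:
            globs.append(f'!**/{p}')                     # bare name: exclude anywhere
    return globs

print(rg_exclusion_globs(['node_modules'], ['*.min.js', 'secrets.txt']))
# ['!**/node_modules/**', '!*.min.js', '!**/secrets.txt']
```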
diff --git a/src/code_index_mcp/search/ugrep.py b/src/code_index_mcp/search/ugrep.py
index 69f2cc4..d4302c1 100644
--- a/src/code_index_mcp/search/ugrep.py
+++ b/src/code_index_mcp/search/ugrep.py
@@ -27,7 +27,8 @@ def search(
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: bool = False
+ regex: bool = False,
+ max_line_length: Optional[int] = None
) -> Dict[str, List[Tuple[int, str]]]:
"""
Execute a search using the 'ug' command-line tool.
@@ -40,11 +41,12 @@ def search(
file_pattern: File pattern to filter
fuzzy: Enable true fuzzy search (ugrep native support)
regex: Enable regex pattern matching
+            max_line_length: Optional. Truncate returned lines longer than this many characters
"""
if not self.is_available():
return {"error": "ugrep (ug) command not found."}
- cmd = ['ug', '--line-number', '--no-heading']
+ cmd = ['ug', '-r', '--line-number', '--no-heading']
if fuzzy:
# ugrep has native fuzzy search support
@@ -65,7 +67,31 @@ def search(
cmd.extend(['-A', str(context_lines), '-B', str(context_lines)])
if file_pattern:
- cmd.extend(['-g', file_pattern]) # Correct parameter for file patterns
+ cmd.extend(['--include', file_pattern])
+
+ processed_patterns = set()
+ exclude_dirs = getattr(self, 'exclude_dirs', [])
+ exclude_file_patterns = getattr(self, 'exclude_file_patterns', [])
+
+ for directory in exclude_dirs:
+ normalized = directory.strip()
+ if not normalized or normalized in processed_patterns:
+ continue
+        # Prune directories via the grep-compatible --exclude-dir option
+        cmd.append(f'--exclude-dir={normalized}')
+ processed_patterns.add(normalized)
+
+ for pattern in exclude_file_patterns:
+ normalized = pattern.strip()
+ if not normalized or normalized in processed_patterns:
+ continue
+        if normalized.startswith('!'):
+            normalized = normalized[1:]
+        # File patterns go through the grep-compatible --exclude option; globs
+        # without a '/' are matched against basenames, so pass them through as-is
+        cmd.append(f'--exclude={normalized}')
+ processed_patterns.add(normalized)
# Add '--' to treat pattern as a literal argument, preventing injection
cmd.append('--')
@@ -89,7 +115,7 @@ def search(
error_output = process.stderr.strip()
return {"error": f"ugrep execution failed with code {process.returncode}", "details": error_output}
- return parse_search_output(process.stdout, base_path)
+ return parse_search_output(process.stdout, base_path, max_line_length)
except FileNotFoundError:
return {"error": "ugrep (ug) command not found. Please ensure it's installed and in your PATH."}
diff --git a/src/code_index_mcp/server.py b/src/code_index_mcp/server.py
index 5892c0a..2d1eb80 100644
--- a/src/code_index_mcp/server.py
+++ b/src/code_index_mcp/server.py
@@ -13,10 +13,9 @@
import logging
from contextlib import asynccontextmanager
from dataclasses import dataclass
-from typing import AsyncIterator, Dict, Any, Optional, List
+from typing import AsyncIterator, Dict, Any, List
# Third-party imports
-from mcp import types
from mcp.server.fastmcp import FastMCP, Context
# Local imports
@@ -60,7 +59,6 @@ class CodeIndexerContext:
base_path: str
settings: ProjectSettings
file_count: int = 0
- index_manager: Optional['UnifiedIndexManager'] = None
file_watcher_service: FileWatcherService = None
@asynccontextmanager
@@ -87,10 +85,6 @@ async def indexer_lifespan(_server: FastMCP) -> AsyncIterator[CodeIndexerContext
if context.file_watcher_service:
context.file_watcher_service.stop_monitoring()
- # Only save index if project path has been set
- if context.base_path and context.index_manager:
- context.index_manager.save_index()
-
# Create the MCP server with lifespan manager
mcp = FastMCP("CodeIndexer", lifespan=indexer_lifespan, dependencies=["pathlib"])
@@ -111,13 +105,7 @@ def get_file_content(file_path: str) -> str:
# Use FileService for simple file reading - this is appropriate for a resource
return FileService(ctx).get_file_content(file_path)
-@mcp.resource("structure://project")
-@handle_mcp_resource_errors
-def get_project_structure() -> str:
- """Get the structure of the project as a JSON tree."""
- ctx = mcp.get_context()
- return ProjectManagementService(ctx).get_project_structure()
-
+# Removed: structure://project resource - not necessary for most workflows
# Removed: settings://stats resource - this information is available via get_settings_info() tool
# and is more of a debugging/technical detail rather than context AI needs
@@ -138,7 +126,8 @@ def search_code_advanced(
context_lines: int = 0,
file_pattern: str = None,
fuzzy: bool = False,
- regex: bool = None
+ regex: bool = None,
+ max_line_length: int = None
) -> Dict[str, Any]:
"""
Search for a code pattern in the project using an advanced, fast tool.
@@ -152,6 +141,7 @@ def search_code_advanced(
context_lines: Number of lines to show before and after the match.
file_pattern: A glob pattern to filter files to search in
(e.g., "*.py", "*.js", "test_*.py").
+ max_line_length: Optional. Default None (no limit). Limits the length of lines when context_lines is used.
All search tools now handle glob patterns consistently:
- ugrep: Uses glob patterns (*.py, *.{js,ts})
- ripgrep: Uses glob patterns (*.py, *.{js,ts})
@@ -180,7 +170,8 @@ def search_code_advanced(
context_lines=context_lines,
file_pattern=file_pattern,
fuzzy=fuzzy,
- regex=regex
+ regex=regex,
+ max_line_length=max_line_length
)
@mcp.tool()
@@ -246,6 +237,16 @@ def refresh_index(ctx: Context) -> str:
"""
return IndexManagementService(ctx).rebuild_index()
+@mcp.tool()
+@handle_mcp_tool_errors(return_type='str')
+def build_deep_index(ctx: Context) -> str:
+ """
+ Build the deep index (full symbol extraction) for the current project.
+
+ This performs a complete re-index and loads it into memory.
+ """
+ return IndexManagementService(ctx).rebuild_deep_index()
+
@mcp.tool()
@handle_mcp_tool_errors(return_type='dict')
def get_settings_info(ctx: Context) -> Dict[str, Any]:
@@ -297,62 +298,7 @@ def configure_file_watcher(
return SystemManagementService(ctx).configure_file_watcher(enabled, debounce_seconds, additional_exclude_patterns)
# ----- PROMPTS -----
-
-@mcp.prompt()
-def analyze_code(file_path: str = "", query: str = "") -> list[types.PromptMessage]:
- """Prompt for analyzing code in the project."""
- messages = [
- types.PromptMessage(role="user", content=types.TextContent(type="text", text=f"""I need you to analyze some code from my project.
-
-{f'Please analyze the file: {file_path}' if file_path else ''}
-{f'I want to understand: {query}' if query else ''}
-
-First, let me give you some context about the project structure. Then, I'll provide the code to analyze.
-""")),
- types.PromptMessage(
- role="assistant",
- content=types.TextContent(
- type="text",
- text="I'll help you analyze the code. Let me first examine the project structure to get a better understanding of the codebase."
- )
- )
- ]
- return messages
-
-@mcp.prompt()
-def code_search(query: str = "") -> types.TextContent:
- """Prompt for searching code in the project."""
- search_text = "\"query\"" if not query else f"\"{query}\""
- return types.TextContent(
- type="text",
- text=f"""I need to search through my codebase for {search_text}.
-
-Please help me find all occurrences of this query and explain what each match means in its context.
-Focus on the most relevant files and provide a brief explanation of how each match is used in the code.
-
-If there are too many results, prioritize the most important ones and summarize the patterns you see."""
- )
-
-@mcp.prompt()
-def set_project() -> list[types.PromptMessage]:
- """Prompt for setting the project path."""
- messages = [
- types.PromptMessage(role="user", content=types.TextContent(type="text", text="""
- I need to analyze code from a project, but I haven't set the project path yet. Please help me set up the project path and index the code.
-
- First, I need to specify which project directory to analyze.
- """)),
- types.PromptMessage(role="assistant", content=types.TextContent(type="text", text="""
- Before I can help you analyze any code, we need to set up the project path. This is a required first step.
-
- Please provide the full path to your project folder. For example:
- - Windows: "C:/Users/username/projects/my-project"
- - macOS/Linux: "/home/username/projects/my-project"
-
- Once you provide the path, I'll use the `set_project_path` tool to configure the code analyzer to work with your project.
- """))
- ]
- return messages
+# Removed: analyze_code, code_search, set_project prompts
def main():
"""Main function to run the MCP server."""
diff --git a/src/code_index_mcp/services/code_intelligence_service.py b/src/code_index_mcp/services/code_intelligence_service.py
index 77ff894..af0f1a2 100644
--- a/src/code_index_mcp/services/code_intelligence_service.py
+++ b/src/code_index_mcp/services/code_intelligence_service.py
@@ -9,12 +9,12 @@
import os
from typing import Dict, Any
-logger = logging.getLogger(__name__)
-
from .base_service import BaseService
from ..tools.filesystem import FileSystemTool
from ..indexing import get_index_manager
+logger = logging.getLogger(__name__)
+
class CodeIntelligenceService(BaseService):
"""
@@ -61,9 +61,14 @@ def analyze_file(self, file_path: str) -> Dict[str, Any]:
# Get file summary from JSON index
summary = index_manager.get_file_summary(file_path)
logger.info(f"Summary result: {summary is not None}")
-
+
+ # If the deep index isn't available yet, return a helpful hint instead of an error
if not summary:
- raise ValueError(f"File not found in index: {file_path}")
+ return {
+ "status": "needs_deep_index",
+ "message": "Deep index not available. Please run build_deep_index before calling get_file_summary.",
+ "file_path": file_path
+ }
return summary
diff --git a/src/code_index_mcp/services/file_discovery_service.py b/src/code_index_mcp/services/file_discovery_service.py
index 478beea..d777511 100644
--- a/src/code_index_mcp/services/file_discovery_service.py
+++ b/src/code_index_mcp/services/file_discovery_service.py
@@ -9,7 +9,7 @@
from dataclasses import dataclass
from .base_service import BaseService
-from ..indexing import get_index_manager
+from ..indexing import get_shallow_index_manager
@dataclass
@@ -32,7 +32,7 @@ class FileDiscoveryService(BaseService):
def __init__(self, ctx):
super().__init__(ctx)
- self._index_manager = get_index_manager()
+ self._index_manager = get_shallow_index_manager()
def find_files(self, pattern: str, max_results: Optional[int] = None) -> List[str]:
"""
diff --git a/src/code_index_mcp/services/file_watcher_service.py b/src/code_index_mcp/services/file_watcher_service.py
index cac4dd5..c2ef64c 100644
--- a/src/code_index_mcp/services/file_watcher_service.py
+++ b/src/code_index_mcp/services/file_watcher_service.py
@@ -50,6 +50,7 @@ def __init__(self):
WATCHDOG_AVAILABLE = False
from .base_service import BaseService
+from ..constants import SUPPORTED_EXTENSIONS
class FileWatcherService(BaseService):
diff --git a/src/code_index_mcp/services/index_management_service.py b/src/code_index_mcp/services/index_management_service.py
index e4714a3..f56c760 100644
--- a/src/code_index_mcp/services/index_management_service.py
+++ b/src/code_index_mcp/services/index_management_service.py
@@ -6,6 +6,8 @@
"""
import time
import logging
+import os
+import json
from typing import Dict, Any
from dataclasses import dataclass
@@ -13,7 +15,7 @@
logger = logging.getLogger(__name__)
from .base_service import BaseService
-from ..indexing import get_index_manager
+from ..indexing import get_index_manager, get_shallow_index_manager, DeepIndexManager
@dataclass
@@ -35,11 +37,18 @@ class IndexManagementService(BaseService):
def __init__(self, ctx):
super().__init__(ctx)
+ # Deep manager (symbols/files, legacy JSON index manager)
self._index_manager = get_index_manager()
+ # Shallow manager (file-list only) for default workflows
+ self._shallow_manager = get_shallow_index_manager()
+ # Optional wrapper for explicit deep builds
+ self._deep_wrapper = DeepIndexManager()
def rebuild_index(self) -> str:
"""
- Rebuild the project index using the new JSON indexing system.
+ Rebuild the project index (DEFAULT: shallow file list).
+
+ For deep/symbol rebuilds, use the build_deep_index() tool instead.
Returns:
Success message with rebuild information
@@ -50,11 +59,17 @@ def rebuild_index(self) -> str:
# Business validation
self._validate_rebuild_request()
- # Business workflow: Execute rebuild
- result = self._execute_rebuild_workflow()
+ # Shallow rebuild only (fast path)
+ if not self._shallow_manager.set_project_path(self.base_path):
+ raise RuntimeError("Failed to set project path (shallow) in index manager")
+ if not self._shallow_manager.build_index():
+ raise RuntimeError("Failed to rebuild shallow index")
- # Business result formatting
- return self._format_rebuild_result(result)
+ try:
+ count = len(self._shallow_manager.get_file_list())
+ except Exception:
+ count = 0
+ return f"Shallow index re-built with {count} files."
def get_rebuild_status(self) -> Dict[str, Any]:
"""
@@ -137,3 +152,47 @@ def _format_rebuild_result(self, result: IndexRebuildResult) -> str:
Formatted result string for MCP response
"""
return f"Project re-indexed. Found {result.file_count} files."
+
+ def build_shallow_index(self) -> str:
+ """
+ Build and persist the shallow index (file list only).
+
+ Returns:
+ Success message including file count if available.
+
+ Raises:
+ ValueError/RuntimeError on validation or build failure
+ """
+ # Ensure project is set up
+ self._require_project_setup()
+
+ # Initialize manager with current base path
+ if not self._shallow_manager.set_project_path(self.base_path):
+ raise RuntimeError("Failed to set project path in index manager")
+
+ # Build shallow index
+ if not self._shallow_manager.build_index():
+ raise RuntimeError("Failed to build shallow index")
+
+ # Try to report count
+ count = 0
+ try:
+ shallow_path = getattr(self._shallow_manager, 'index_path', None)
+ if shallow_path and os.path.exists(shallow_path):
+ with open(shallow_path, 'r', encoding='utf-8') as f:
+ data = json.load(f)
+ if isinstance(data, list):
+ count = len(data)
+ except Exception as e: # noqa: BLE001 - safe fallback to zero
+ logger.debug(f"Unable to read shallow index count: {e}")
+
+ return f"Shallow index built{f' with {count} files' if count else ''}."
+
+ def rebuild_deep_index(self) -> str:
+ """Rebuild the deep index using the original workflow."""
+ # Business validation
+ self._validate_rebuild_request()
+
+ # Deep rebuild via existing workflow
+ result = self._execute_rebuild_workflow()
+ return self._format_rebuild_result(result)
diff --git a/src/code_index_mcp/services/project_management_service.py b/src/code_index_mcp/services/project_management_service.py
index 1aa0706..c0f3a63 100644
--- a/src/code_index_mcp/services/project_management_service.py
+++ b/src/code_index_mcp/services/project_management_service.py
@@ -4,7 +4,6 @@
This service handles the business logic for project initialization, configuration,
and lifecycle management using the new JSON-based indexing system.
"""
-import json
import logging
from typing import Dict, Any
from dataclasses import dataclass
@@ -13,7 +12,7 @@
from .base_service import BaseService
from ..utils.response_formatter import ResponseFormatter
from ..constants import SUPPORTED_EXTENSIONS
-from ..indexing import get_index_manager
+from ..indexing import get_index_manager, get_shallow_index_manager
logger = logging.getLogger(__name__)
@@ -40,14 +39,16 @@ class ProjectManagementService(BaseService):
def __init__(self, ctx):
super().__init__(ctx)
- # Use the global singleton index manager
+ # Deep index manager (legacy full index)
self._index_manager = get_index_manager()
+ # Shallow index manager (default for initialization)
+ self._shallow_manager = get_shallow_index_manager()
from ..tools.config import ProjectConfigTool
self._config_tool = ProjectConfigTool()
# Import FileWatcherTool locally to avoid circular import
from ..tools.monitoring import FileWatcherTool
self._watcher_tool = FileWatcherTool(ctx)
-
+
@contextmanager
def _noop_operation(self, *_args, **_kwargs):
@@ -106,15 +107,15 @@ def _execute_initialization_workflow(self, path: str) -> ProjectInitializationRe
"""
# Business step 1: Initialize config tool
self._config_tool.initialize_settings(path)
-
+
# Normalize path for consistent processing
normalized_path = self._config_tool.normalize_project_path(path)
# Business step 2: Cleanup existing project state
self._cleanup_existing_project()
- # Business step 3: Initialize JSON index manager
- index_result = self._initialize_json_index_manager(normalized_path)
+ # Business step 3: Initialize shallow index by default (fast path)
+ index_result = self._initialize_shallow_index_manager(normalized_path)
# Business step 3.1: Store index manager in context for other services
self.helper.update_index_manager(self._index_manager)
@@ -185,6 +186,45 @@ def _initialize_json_index_manager(self, project_path: str) -> Dict[str, Any]:
'languages': stats.get('languages', [])
}
+ def _initialize_shallow_index_manager(self, project_path: str) -> Dict[str, Any]:
+ """
+ Business logic to initialize the shallow index manager by default.
+
+ Args:
+ project_path: Project path
+
+ Returns:
+ Dictionary with initialization results
+ """
+ # Set project path in shallow manager
+ if not self._shallow_manager.set_project_path(project_path):
+ raise RuntimeError(f"Failed to set project path (shallow): {project_path}")
+
+ # Update context
+ self.helper.update_base_path(project_path)
+
+ # Try to load existing shallow index or build new one
+ if self._shallow_manager.load_index():
+ source = "loaded_existing"
+ else:
+ if not self._shallow_manager.build_index():
+ raise RuntimeError("Failed to build shallow index")
+ source = "built_new"
+
+ # Determine file count from shallow list
+ try:
+ files = self._shallow_manager.get_file_list()
+ file_count = len(files)
+ except Exception: # noqa: BLE001 - safe fallback
+ file_count = 0
+
+ return {
+ 'file_count': file_count,
+ 'source': source,
+ 'total_symbols': 0,
+ 'languages': []
+ }
+
def _is_valid_existing_index(self, index_data: Dict[str, Any]) -> bool:
"""
@@ -217,7 +257,7 @@ def _load_existing_index(self, index_data: Dict[str, Any]) -> Dict[str, Any]:
Returns:
Dictionary with loading results
"""
-
+
# Note: Legacy index loading is now handled by UnifiedIndexManager
# This method is kept for backward compatibility but functionality moved
@@ -225,7 +265,7 @@ def _load_existing_index(self, index_data: Dict[str, Any]) -> Dict[str, Any]:
# Extract file count from metadata
file_count = index_data.get('project_metadata', {}).get('total_files', 0)
-
+
return {
'file_count': file_count,
@@ -243,22 +283,30 @@ def _setup_file_monitoring(self, project_path: str) -> str:
Returns:
String describing monitoring setup result
"""
-
+
try:
# Create rebuild callback that uses the JSON index manager
def rebuild_callback():
logger.info("File watcher triggered rebuild callback")
try:
- logger.debug(f"Starting index rebuild for: {project_path}")
- # Business logic: File changed, rebuild using JSON index manager
- if self._index_manager.refresh_index():
- stats = self._index_manager.get_index_stats()
- file_count = stats.get('indexed_files', 0)
- logger.info(f"File watcher rebuild completed successfully - indexed {file_count} files")
- return True
- else:
- logger.warning("File watcher rebuild failed")
+ logger.debug(f"Starting shallow index rebuild for: {project_path}")
+ # Business logic: File changed, rebuild using SHALLOW index manager
+ try:
+ if not self._shallow_manager.set_project_path(project_path):
+ logger.warning("Shallow manager set_project_path failed")
+ return False
+ if self._shallow_manager.build_index():
+ files = self._shallow_manager.get_file_list()
+ logger.info(f"File watcher shallow rebuild completed successfully - files {len(files)}")
+ return True
+ else:
+ logger.warning("File watcher shallow rebuild failed")
+ return False
+ except Exception as e:
+ import traceback
+ logger.error(f"File watcher shallow rebuild failed: {e}")
+ logger.error(f"Traceback: {traceback.format_exc()}")
return False
except Exception as e:
import traceback
@@ -285,7 +333,7 @@ def rebuild_callback():
def _update_project_state(self, project_path: str, file_count: int) -> None:
"""Business logic to update system state after project initialization."""
-
+
# Update context with file count
self.helper.update_file_count(file_count)
@@ -360,39 +408,4 @@ def get_project_config(self) -> str:
return ResponseFormatter.config_response(config_data)
- def get_project_structure(self) -> str:
- """
- Get the project directory structure for MCP resource.
-
- Returns:
- JSON formatted project structure
- """
-
- # Check if project is configured
- if not self.helper.base_path:
- structure_data = {
- "status": "not_configured",
- "message": ("Project path not set. Please use set_project_path "
- "to set a project directory first.")
- }
- return json.dumps(structure_data, indent=2)
-
- # Check if we have index cache with directory tree
- if (hasattr(self.ctx.request_context.lifespan_context, 'index_cache') and
- self.ctx.request_context.lifespan_context.index_cache and
- 'directory_tree' in self.ctx.request_context.lifespan_context.index_cache):
-
- directory_tree = self.ctx.request_context.lifespan_context.index_cache['directory_tree']
- return json.dumps(directory_tree, indent=2)
-
- # If no directory tree available, try to build basic structure
- try:
- # Use config tool to get basic project structure
- basic_structure = self._config_tool.get_basic_project_structure(self.helper.base_path)
- return json.dumps(basic_structure, indent=2)
- except Exception as e:
- error_data = {
- "error": f"Unable to get project structure: {e}",
- "status": "error"
- }
- return json.dumps(error_data, indent=2)
+ # Removed: get_project_structure; the project structure resource is deprecated
diff --git a/src/code_index_mcp/services/search_service.py b/src/code_index_mcp/services/search_service.py
index 7daa3c9..a2c2799 100644
--- a/src/code_index_mcp/services/search_service.py
+++ b/src/code_index_mcp/services/search_service.py
@@ -5,24 +5,20 @@
and search strategy selection.
"""
-from typing import Dict, Any, Optional
+from pathlib import Path
+from typing import Any, Dict, List, Optional
from .base_service import BaseService
-from ..utils import ValidationHelper, ResponseFormatter
+from ..utils import FileFilter, ResponseFormatter, ValidationHelper
from ..search.base import is_safe_regex_pattern
class SearchService(BaseService):
- """
- Service for managing code search operations.
-
- This service handles:
- - Code search with various parameters and options
- - Search tool management and detection
- - Search strategy selection and optimization
- - Search capabilities reporting
- """
+ """Service for managing code search operations."""
+ def __init__(self, ctx):
+ super().__init__(ctx)
+ self.file_filter = self._create_file_filter()
def search_code( # pylint: disable=too-many-arguments
self,
@@ -31,47 +27,24 @@ def search_code( # pylint: disable=too-many-arguments
context_lines: int = 0,
file_pattern: Optional[str] = None,
fuzzy: bool = False,
- regex: Optional[bool] = None
+ regex: Optional[bool] = None,
+ max_line_length: Optional[int] = None
) -> Dict[str, Any]:
- """
- Search for code patterns in the project.
-
- Handles the logic for search_code_advanced MCP tool.
-
- Args:
- pattern: The search pattern
- case_sensitive: Whether search should be case-sensitive
- context_lines: Number of context lines to show
- file_pattern: Glob pattern to filter files
- fuzzy: Whether to enable fuzzy matching
- regex: Regex mode - True/False to force, None for auto-detection
-
- Returns:
- Dictionary with search results or error information
-
- Raises:
- ValueError: If project is not set up or search parameters are invalid
- """
+ """Search for code patterns in the project."""
self._require_project_setup()
- # Smart regex detection if regex parameter is None
if regex is None:
regex = is_safe_regex_pattern(pattern)
- if regex:
- pass
- # Validate search pattern
error = ValidationHelper.validate_search_pattern(pattern, regex)
if error:
raise ValueError(error)
- # Validate file pattern if provided
if file_pattern:
error = ValidationHelper.validate_glob_pattern(file_pattern)
if error:
raise ValueError(f"Invalid file pattern: {error}")
- # Get search strategy from settings
if not self.settings:
raise ValueError("Settings not available")
@@ -79,7 +52,7 @@ def search_code( # pylint: disable=too-many-arguments
if not strategy:
raise ValueError("No search strategies available")
-
+ self._configure_strategy(strategy)
try:
results = strategy.search(
@@ -89,25 +62,16 @@ def search_code( # pylint: disable=too-many-arguments
context_lines=context_lines,
file_pattern=file_pattern,
fuzzy=fuzzy,
- regex=regex
+ regex=regex,
+ max_line_length=max_line_length
)
- return ResponseFormatter.search_results_response(results)
- except Exception as e:
- raise ValueError(f"Search failed using '{strategy.name}': {e}") from e
-
+ filtered = self._filter_results(results)
+ return ResponseFormatter.search_results_response(filtered)
+ except Exception as exc:
+ raise ValueError(f"Search failed using '{strategy.name}': {exc}") from exc
def refresh_search_tools(self) -> str:
- """
- Refresh the available search tools.
-
- Handles the logic for refresh_search_tools MCP tool.
-
- Returns:
- Success message with available tools information
-
- Raises:
- ValueError: If refresh operation fails
- """
+ """Refresh the available search tools."""
if not self.settings:
raise ValueError("Settings not available")
@@ -118,14 +82,8 @@ def refresh_search_tools(self) -> str:
preferred = config['preferred_tool']
return f"Search tools refreshed. Available: {available}. Preferred: {preferred}."
-
def get_search_capabilities(self) -> Dict[str, Any]:
- """
- Get information about search capabilities and available tools.
-
- Returns:
- Dictionary with search tool information and capabilities
- """
+ """Get information about search capabilities and available tools."""
if not self.settings:
return {"error": "Settings not available"}
@@ -142,3 +100,73 @@ def get_search_capabilities(self) -> Dict[str, Any]:
}
return capabilities
+
+ def _configure_strategy(self, strategy) -> None:
+ """Apply shared exclusion configuration to the strategy if supported."""
+ configure = getattr(strategy, 'configure_excludes', None)
+ if not configure:
+ return
+
+ try:
+ configure(self.file_filter)
+ except Exception: # pragma: no cover - defensive fallback
+ pass
+
+ def _create_file_filter(self) -> FileFilter:
+ """Build a shared file filter drawing from project settings."""
+ additional_dirs: List[str] = []
+ additional_file_patterns: List[str] = []
+
+ settings = self.settings
+ if settings:
+ try:
+ config = settings.get_file_watcher_config()
+ except Exception: # pragma: no cover - fallback if config fails
+ config = {}
+
+ for key in ('exclude_patterns', 'additional_exclude_patterns'):
+ patterns = config.get(key) or []
+ for pattern in patterns:
+ if not isinstance(pattern, str):
+ continue
+ normalized = pattern.strip()
+ if not normalized:
+ continue
+ additional_dirs.append(normalized)
+ additional_file_patterns.append(normalized)
+
+ file_filter = FileFilter(additional_dirs or None)
+
+ if additional_file_patterns:
+ file_filter.exclude_files.update(additional_file_patterns)
+
+ return file_filter
+
+ def _filter_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
+ """Filter out matches that reside under excluded paths."""
+ if not isinstance(results, dict) or not results:
+ return results
+
+ if 'error' in results or not self.file_filter or not self.base_path:
+ return results
+
+ base_path = Path(self.base_path)
+ filtered: Dict[str, Any] = {}
+
+ for rel_path, matches in results.items():
+ if not isinstance(rel_path, str):
+ continue
+
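+ # Normalize Windows-style separators before resolving the match path against the project base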
+ normalized = Path(rel_path.replace('\\', '/'))
+ try:
+ absolute = (base_path / normalized).resolve()
+ except Exception: # pragma: no cover - invalid path safety
+ continue
+
+ try:
+ if self.file_filter.should_process_path(absolute, base_path):
+ filtered[rel_path] = matches
+ except Exception: # pragma: no cover - defensive fallback
+ continue
+
+ return filtered
diff --git a/tests/search/test_search_filters.py b/tests/search/test_search_filters.py
new file mode 100644
index 0000000..787461d
--- /dev/null
+++ b/tests/search/test_search_filters.py
@@ -0,0 +1,52 @@
+"""Tests covering shared search filtering behaviour."""
+import os
+from types import SimpleNamespace
+from unittest.mock import patch
+from pathlib import Path as _TestPath
+import sys
+
+ROOT = _TestPath(__file__).resolve().parents[2]
+SRC_PATH = ROOT / 'src'
+if str(SRC_PATH) not in sys.path:
+ sys.path.insert(0, str(SRC_PATH))
+
+from code_index_mcp.search.basic import BasicSearchStrategy
+from code_index_mcp.search.ripgrep import RipgrepStrategy
+from code_index_mcp.utils.file_filter import FileFilter
+
+
+def test_basic_strategy_skips_excluded_directories(tmp_path):
+ base = tmp_path
+ src_dir = base / "src"
+ src_dir.mkdir()
+ (src_dir / 'app.js').write_text("const db = 'mongo';\n")
+
+ node_modules_dir = base / "node_modules" / "pkg"
+ node_modules_dir.mkdir(parents=True)
+ (node_modules_dir / 'index.js').write_text("// mongo dependency\n")
+
+ strategy = BasicSearchStrategy()
+ strategy.configure_excludes(FileFilter())
+
+ results = strategy.search("mongo", str(base), case_sensitive=False)
+
+ included_path = os.path.join("src", "app.js")
+ excluded_path = os.path.join("node_modules", "pkg", "index.js")
+
+ assert included_path in results
+ assert excluded_path not in results
+
+
+@patch("code_index_mcp.search.ripgrep.subprocess.run")
+def test_ripgrep_strategy_adds_exclude_globs(mock_run, tmp_path):
+ mock_run.return_value = SimpleNamespace(returncode=0, stdout="", stderr="")
+
+ strategy = RipgrepStrategy()
+ strategy.configure_excludes(FileFilter())
+
+ strategy.search("mongo", str(tmp_path))
+
+ cmd = mock_run.call_args[0][0]
+ glob_args = [cmd[i + 1] for i, arg in enumerate(cmd) if arg == '--glob' and i + 1 < len(cmd)]
+
+ assert any(value.startswith('!**/node_modules/') for value in glob_args)
diff --git a/uv.lock b/uv.lock
index 6642d2e..08294cf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -49,7 +49,7 @@ wheels = [
[[package]]
name = "code-index-mcp"
-version = "2.1.2"
+version = "2.4.1"
source = { editable = "." }
dependencies = [
{ name = "mcp" },
@@ -527,3 +527,4 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070 },
{ url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067 },
]
+