diff --git a/.gitignore b/.gitignore index 367a552..9539f72 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,4 @@ COMMIT_MESSAGE.txt RELEASE_NOTE.txt .llm-context/ -.kiro/ \ No newline at end of file +AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..886f335 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,25 @@ +# Repository Guidelines + +## Project Structure & Module Organization +Code Index MCP lives in `src/code_index_mcp/`, with `indexing/` managing builders, `services/` exposing MCP tool implementations, `search/` coordinating query utilities, and `utils/` housing cross-cutting helpers. The lightweight CLI bootstrapper is `run.py`, which adds `src/` to `PYTHONPATH` before invoking `code_index_mcp.server`. Sample corpora for language regression reside under `test/sample-projects/` (for example `python/user_management/`). Reserve `tests/` for runnable suites and avoid checking in generated `__pycache__` artifacts. + +## Build, Test, and Development Commands +Install dependencies with `uv sync` after cloning. Use `uv run code-index-mcp` to launch the MCP server directly, or `uv run python run.py` when you need the local sys.path shim. During development, `uv run code-index-mcp --help` will list available CLI flags, and `uv run python -m code_index_mcp.server` mirrors the published entry point for debugging. + +## Coding Style & Naming Conventions +Target Python 3.10+ and follow the `.pylintrc` configuration: 4-space indentation, 100-character line limit, and restrained function signatures (<= 7 parameters). Modules and functions stay `snake_case`, classes use `PascalCase`, and constants remain uppercase with underscores. Prefer explicit imports from sibling packages (`from .services import ...`) and keep logging to stderr as implemented in `server.py`. + +## Testing Guidelines +Automated tests should live under `tests/`, mirroring the package hierarchy (`tests/indexing/test_shallow_index.py`, etc.). Use `uv run pytest` (with optional `-k` selectors) for unit and integration coverage, and stage representative fixtures inside `test/sample-projects/` when exercising new language strategies. Document expected behaviors in fixtures' README files or inline comments, and fail fast if tree-sitter support is not available for a language you add. + +## Commit & Pull Request Guidelines +Follow the Conventional Commits style seen in history (`feat`, `fix`, `refactor(scope): summary`). Reference issue numbers when relevant and keep subjects under 72 characters. Pull requests should include: 1) a concise problem statement, 2) before/after behavior or performance notes, 3) instructions for reproducing test runs (`uv run pytest`, `uv run code-index-mcp`). Attach updated screenshots or logs when touching developer experience flows, and confirm the file watcher still transitions to "active" in manual smoke tests. + +## Agent Workflow Tips +Always call `set_project_path` before invoking other tools, and prefer `search_code_advanced` with targeted `file_pattern` filters to minimize noise. When editing indexing strategies, run `refresh_index` in between changes to confirm cache rebuilds. Clean up temporary directories via `clear_settings` if you notice stale metadata, and document any new tooling you introduce in this guide. + +## Release Preparation Checklist +- Update the project version everywhere it lives: `pyproject.toml`, `src/code_index_mcp/__init__.py`, and `uv.lock`. +- Add a release note entry to `RELEASE_NOTE.txt` for the new version. 
+- Commit the version bump (plus any release artifacts) and push the branch to `origin`. +- Create a git tag for the new version and push the tag to `origin`. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md deleted file mode 100644 index f3b2d5b..0000000 --- a/ARCHITECTURE.md +++ /dev/null @@ -1,233 +0,0 @@ -# Code Index MCP System Architecture - -## Overview - -Code Index MCP is a Model Context Protocol (MCP) server that provides intelligent code indexing and analysis capabilities. The system follows SCIP (Source Code Intelligence Protocol) standards and uses a service-oriented architecture with clear separation of concerns. - -## High-Level Architecture - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ MCP Interface Layer │ -├─────────────────────────────────────────────────────────────────┤ -│ Service Layer │ -├─────────────────────────────────────────────────────────────────┤ -│ SCIP Core Layer │ -├─────────────────────────────────────────────────────────────────┤ -│ Language Strategies │ -├─────────────────────────────────────────────────────────────────┤ -│ Technical Tools Layer │ -└─────────────────────────────────────────────────────────────────┘ -``` - -## Layer Responsibilities - -### 1. MCP Interface Layer (`server.py`) -**Purpose**: Exposes MCP tools and handles protocol communication - -**Key Components**: -- MCP tool definitions (`@mcp.tool()`) -- Error handling and response formatting -- User interaction and guidance - -**MCP Tools**: -- `set_project_path` - Initialize project indexing -- `find_files` - File discovery with patterns -- `get_file_summary` - File analysis and metadata -- `search_code_advanced` - Content search across files -- `refresh_index` - Manual index rebuilding -- `get_file_watcher_status` - File monitoring status -- `configure_file_watcher` - File watcher settings - -### 2. Service Layer (`services/`) -**Purpose**: Business logic orchestration and workflow management - -**Key Services**: -- `ProjectManagementService` - Project lifecycle and initialization -- `FileWatcherService` - Real-time file monitoring and auto-refresh -- `IndexManagementService` - Index rebuild operations -- `CodeIntelligenceService` - File analysis and symbol intelligence -- `FileDiscoveryService` - File pattern matching and discovery -- `SearchService` - Advanced code search capabilities - -**Architecture Pattern**: Service delegation with clear business boundaries - -### 3. SCIP Core Layer (`scip/core/`) -**Purpose**: Language-agnostic SCIP protocol implementation - -**Core Components**: -- `SCIPSymbolManager` - Standard SCIP symbol ID generation -- `LocalReferenceResolver` - Cross-file reference resolution -- `PositionCalculator` - AST/Tree-sitter position conversion -- `MonikerManager` - External package dependency handling - -**Standards Compliance**: Full SCIP protocol buffer implementation - -### 4. Language Strategies (`scip/strategies/`) -**Purpose**: Language-specific code analysis using two-phase processing - -**Strategy Pattern Implementation**: -- `BaseStrategy` - Abstract interface and common functionality -- `PythonStrategy` - Python AST analysis -- `JavaScriptStrategy` - JavaScript/TypeScript Tree-sitter analysis -- `JavaStrategy` - Java Tree-sitter analysis -- `ObjectiveCStrategy` - Objective-C Tree-sitter analysis -- `FallbackStrategy` - Generic text-based analysis - -**Two-Phase Analysis**: -1. **Phase 1**: Symbol definition collection -2. **Phase 2**: Reference resolution and SCIP document generation - -### 5. 
Technical Tools Layer (`tools/`) -**Purpose**: Low-level technical capabilities - -**Tool Categories**: -- `filesystem/` - File system operations and pattern matching -- `scip/` - SCIP index operations and symbol analysis -- `config/` - Configuration and settings management -- `monitoring/` - File watching and system monitoring - -## Data Flow Architecture - -### File Analysis Workflow -``` -User Request → Service Layer → SCIP Strategy → Core Components → SCIP Documents -``` - -### Index Management Workflow -``` -File Changes → File Watcher → Index Management Service → Strategy Factory → Updated Index -``` - -### Search Workflow -``` -Search Query → Search Service → Advanced Search Tools → Filtered Results -``` - -## SCIP Implementation Details - -### Symbol ID Format -``` -scip-{language} {manager} {package} [version] {descriptors} -``` - -**Examples**: -- Local: `scip-python local myproject src/main.py/MyClass#method().` -- External: `scip-python pip requests 2.31.0 sessions/Session#get().` - -### Language Support Strategy - -**Parsing Approaches**: -- **Python**: Native AST module -- **JavaScript/TypeScript**: Tree-sitter -- **Java**: Tree-sitter -- **Objective-C**: Tree-sitter -- **Others**: Fallback text analysis - -**Supported Code Intelligence**: -- Symbol definitions (functions, classes, variables) -- Import/export tracking -- Cross-file reference resolution -- External dependency management -- Position-accurate symbol ranges - -## Configuration and Extensibility - -### Package Manager Integration -- **Python**: pip, conda, poetry detection -- **JavaScript**: npm, yarn package.json parsing -- **Java**: Maven pom.xml, Gradle build files -- **Configuration-driven**: Easy addition of new package managers - -### File Watcher System -- **Real-time monitoring**: Watchdog-based file system events -- **Debounced rebuilds**: 4-6 second batching of rapid changes -- **Configurable patterns**: Customizable include/exclude rules -- **Thread-safe**: ThreadPoolExecutor for concurrent rebuilds - -## Performance Characteristics - -### Indexing Performance -- **Incremental updates**: File-level granular rebuilds -- **Parallel processing**: Concurrent file analysis -- **Memory efficient**: Streaming SCIP document generation -- **Cache optimization**: Symbol table reuse across phases - -### Search Performance -- **Advanced tools**: ripgrep, ugrep, ag integration -- **Pattern optimization**: Glob-based file filtering -- **Result streaming**: Large result set handling - -## Error Handling and Reliability - -### Fault Tolerance -- **Graceful degradation**: Continue indexing on individual file failures -- **Error isolation**: Per-file error boundaries -- **Recovery mechanisms**: Automatic retry on transient failures -- **Comprehensive logging**: Debug and audit trail support - -### Validation -- **Input sanitization**: Path traversal protection -- **Range validation**: SCIP position boundary checking -- **Schema validation**: Protocol buffer structure verification - -## Future Architecture Considerations - -### Planned Enhancements -1. **Function Call Relationships**: Complete call graph analysis -2. **Type Information**: Enhanced semantic analysis -3. **Cross-repository Navigation**: Multi-project symbol resolution -4. **Language Server Protocol**: LSP compatibility layer -5. 
**Distributed Indexing**: Horizontal scaling support - -### Extension Points -- **Custom strategies**: Plugin architecture for new languages -- **Analysis plugins**: Custom symbol analyzers -- **Export formats**: Multiple output format support -- **Integration APIs**: External tool connectivity - -## Directory Structure - -``` -src/code_index_mcp/ -├── server.py # MCP interface layer -├── services/ # Business logic services -│ ├── project_management_service.py -│ ├── file_watcher_service.py -│ ├── index_management_service.py -│ ├── code_intelligence_service.py -│ └── ... -├── scip/ # SCIP implementation -│ ├── core/ # Language-agnostic core -│ │ ├── symbol_manager.py -│ │ ├── local_reference_resolver.py -│ │ ├── position_calculator.py -│ │ └── moniker_manager.py -│ ├── strategies/ # Language-specific strategies -│ │ ├── base_strategy.py -│ │ ├── python_strategy.py -│ │ ├── javascript_strategy.py -│ │ └── ... -│ └── factory.py # Strategy selection -├── tools/ # Technical capabilities -│ ├── filesystem/ -│ ├── scip/ -│ ├── config/ -│ └── monitoring/ -├── indexing/ # Index management -└── utils/ # Shared utilities -``` - -## Key Design Principles - -1. **Standards Compliance**: Full SCIP protocol adherence -2. **Language Agnostic**: Core components independent of specific languages -3. **Extensible**: Easy addition of new languages and features -4. **Performance**: Efficient indexing and search operations -5. **Reliability**: Fault-tolerant with comprehensive error handling -6. **Maintainability**: Clear separation of concerns and modular design - ---- - -*Last updated: 2025-01-14* -*Architecture version: 2.1.0* \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index c3f9006..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,162 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -## [2.1.1] - 2025-01-15 - -### Fixed -- **SCIP Java Strategy**: Simplified Java symbol analysis implementation - - Refactored JavaStrategy to use streamlined symbol registration methods - - Removed complex JavaAnalyzer and JavaRelationshipExtractor classes - - Fixed symbol creation with basic identifier extraction - - Removed relationships summary calculation that was causing issues - - Added back to_scip_relationships method for compatibility - - Streamlined Java AST processing to focus on core symbol definitions - -### Improved -- **Code Maintainability**: Significantly reduced complexity in Java SCIP processing -- **Performance**: Faster Java file analysis with simplified approach -- **Reliability**: More stable symbol extraction without complex relationship tracking - -## [2.1.0] - 2025-01-13 - -### Major SCIP Architecture Enhancement - -This release completes the migration to SCIP-based code indexing with significant improvements to the core infrastructure and API simplification. 
- -#### Core SCIP Infrastructure -- **Complete SCIP core components**: Added symbol_manager, position_calculator, reference_resolver, moniker_manager -- **Two-phase SCIP analysis**: Implemented symbol collection → reference resolution workflow -- **Unified index management**: New index_provider and unified_index_manager for seamless index operations -- **SCIP-compliant symbol IDs**: Standard symbol ID generation with cross-file reference support - -#### Enhanced Strategy System -- **All language strategies SCIP-compliant**: Refactored Python, Java, JavaScript, Objective-C strategies -- **External symbol extraction**: Added dependency tracking and external symbol resolution -- **Proper SCIP classifications**: Implemented symbol roles and syntax kind detection -- **Robust file handling**: Enhanced encoding detection and error recovery - -#### API Improvements -- **Simplified find_files response**: Returns clean file path lists instead of complex metadata objects -- **Enhanced SCIPSymbolAnalyzer**: Replaced legacy query tools with accurate symbol analysis -- **Improved logging**: Comprehensive logging throughout SCIP indexing pipeline - -#### Dependency Updates -- **pathspec integration**: Better .gitignore parsing and file filtering -- **Updated requirements**: Added comprehensive dependency list for cross-platform support - -#### Technical Improvements -- **Symbol analysis tools**: New inspection scripts for debugging and development -- **Enhanced error handling**: Better fallback strategies and error recovery -- **Testing improvements**: Updated sample projects for multilingual testing - -#### Breaking Changes -- **find_files API**: Now returns `List[str]` instead of complex metadata dictionary -- **Internal architecture**: Significant refactoring of internal components (no user-facing impact) - -## [2.0.0] - 2025-08-11 - -### 🚀 MAJOR RELEASE - SCIP Architecture Migration - -This release represents a **complete architectural overhaul** of the code indexing system, migrating from language-specific analyzers to a unified SCIP-based approach. 
- -#### ✨ New Architecture -- **Three-layer service architecture**: Service → Tool → Technical Components -- **Unified SCIP indexing**: Replace 8 language-specific analyzers with single SCIP protobuf system -- **Service-oriented design**: Clear separation of business logic, technical tools, and low-level operations -- **Composable components**: Modular design enabling easier testing and maintenance - -#### 🔧 Technical Improvements -- **Tree-sitter AST parsing**: Replace regex-based analysis with proper AST parsing -- **SCIP protobuf format**: Industry-standard code intelligence format -- **Reduced complexity**: Simplified from 40K+ lines to ~1K lines of core logic -- **Better error handling**: Improved exception handling and validation -- **Enhanced logging**: Better debugging and monitoring capabilities - -#### 📦 Backward Compatibility -- **MCP API unchanged**: All existing MCP tools work without modification -- **Automatic migration**: Legacy indexes automatically migrated to SCIP format -- **Same functionality**: All user-facing features preserved and enhanced -- **No breaking changes**: Seamless upgrade experience - -#### 🗑️ Removed Components -- Language-specific analyzers (C, C++, C#, Go, Java, JavaScript, Objective-C, Python) -- Legacy indexing models and relationship management -- Complex duplicate detection and qualified name systems -- Obsolete builder and scanner components -- Demo files and temporary utilities - -#### 🆕 New Services -- **ProjectManagementService**: Project lifecycle and configuration management -- **IndexManagementService**: Index building, rebuilding, and status monitoring -- **FileDiscoveryService**: Intelligent file discovery with pattern matching -- **CodeIntelligenceService**: Code analysis and summary generation -- **SystemManagementService**: File watcher and system configuration - -#### 🛠️ New Tool Layer -- **SCIPIndexTool & SCIPQueryTool**: SCIP operations and querying -- **FileMatchingTool & FileSystemTool**: File system operations -- **ProjectConfigTool & SettingsTool**: Configuration management -- **FileWatcherTool**: Enhanced file monitoring capabilities - -#### 📊 Performance Benefits -- **Faster indexing**: Tree-sitter parsing significantly faster than regex -- **Lower memory usage**: Streamlined data structures and processing -- **Better accuracy**: SCIP provides more precise code intelligence -- **Improved scalability**: Cleaner architecture supports larger codebases - -#### 🔄 Migration Guide -Existing users can upgrade seamlessly: -1. System automatically detects legacy index format -2. Migrates to new SCIP format on first run -3. All existing functionality preserved -4. No manual intervention required - -This release establishes a solid foundation for future enhancements while dramatically simplifying the codebase and improving performance. - -## [1.2.1] - 2024-08-06 - -### Fixed -- **File Watcher**: Enhanced move event handling for modern editors (VS Code, etc.) 
- - Fixed issue where files created via temp-then-move pattern weren't being detected - - Improved event processing logic to exclusively check destination path for move events - - Eliminated ambiguous fallback behavior that could cause inconsistent results - -### Improved -- **Code Quality**: Comprehensive Pylint compliance improvements - - Fixed all f-string logging warnings using lazy % formatting - - Added proper docstrings to fallback classes - - Fixed multiple-statements warnings - - Moved imports to top-level following PEP 8 conventions - - Added appropriate pylint disables for stub methods - -### Technical Details -- Unified path checking logic across all event types -- Reduced code complexity in `should_process_event()` method -- Better error handling with consistent exception management -- Enhanced debugging capabilities with improved logging - -## [1.2.0] - Previous Release - -### Added -- Enhanced find_files functionality with filename search -- Performance improvements to file discovery -- Auto-refresh troubleshooting documentation - -## [1.1.1] - Previous Release - -### Fixed -- Various bug fixes and stability improvements - -## [1.1.0] - Previous Release - -### Added -- Initial file watcher functionality -- Cross-platform file system monitoring - -## [1.0.0] - Initial Release - -### Added -- Core MCP server implementation -- Code indexing and analysis capabilities -- Multi-language support \ No newline at end of file diff --git a/README.md b/README.md index f51ea87..5cabcbe 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ The easiest way to get started with any MCP-compatible application: - **Direct Tree-sitter Integration**: No regex fallbacks for specialized languages - fail fast with clear errors - **Advanced Search**: Auto-detects and uses the best available tool (ugrep, ripgrep, ag, or grep) - **Universal File Support**: Comprehensive coverage from advanced AST parsing to basic file indexing -- **File Analysis**: Deep insights into structure, imports, classes, methods, and complexity metrics +- **File Analysis**: Deep insights into structure, imports, classes, methods, and complexity metrics after running `build_deep_index` ### 🗂️ **Multi-Language Support** - **7 Languages with Tree-sitter AST Parsing**: Python, JavaScript, TypeScript, Java, Go, Objective-C, Zig @@ -81,7 +81,7 @@ The easiest way to get started with any MCP-compatible application: - **File Watcher**: Automatic index updates when files change - **Cross-platform**: Native OS file system monitoring - **Smart Processing**: Batches rapid changes to prevent excessive rebuilds -- **Rich Metadata**: Captures symbols, references, definitions, and relationships +- **Shallow Index Refresh**: Watches file changes and keeps the file list current; run a deep rebuild when you need symbol metadata ### ⚡ **Performance & Efficiency** - **Tree-sitter AST Parsing**: Native syntax parsing for accurate symbol extraction @@ -218,15 +218,18 @@ Then configure: | Tool | Description | |------|-------------| | **`set_project_path`** | Initialize indexing for a project directory | -| **`refresh_index`** | Rebuild the project index after file changes | +| **`refresh_index`** | Rebuild the shallow file index after file changes | +| **`build_deep_index`** | Generate the full symbol index used by deep analysis | | **`get_settings_info`** | View current project configuration and status | +*Run `build_deep_index` when you need symbol-level data; the default shallow index powers quick file discovery.* + ### 🔍 **Search & Discovery** 
| Tool | Description | |------|-------------| | **`search_code_advanced`** | Smart search with regex, fuzzy matching, and file filtering | | **`find_files`** | Locate files using glob patterns (e.g., `**/*.py`) | -| **`get_file_summary`** | Analyze file structure, functions, imports, and complexity | +| **`get_file_summary`** | Analyze file structure, functions, imports, and complexity (requires deep index) | ### 🔄 **Monitoring & Auto-refresh** | Tool | Description | @@ -263,6 +266,7 @@ Find all TypeScript component files in src/components Give me a summary of src/api/userService.ts ``` *Uses: `get_file_summary` to show functions, imports, and complexity* +*Tip: run `build_deep_index` first if you get a `needs_deep_index` response.* ### 🔍 **Advanced Search Examples** diff --git a/README_ja.md b/README_ja.md index 76c419a..79059b1 100644 --- a/README_ja.md +++ b/README_ja.md @@ -66,7 +66,7 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす - **直接Tree-sitter統合**:特化言語で正規表現フォールバックなし - 明確なエラーメッセージで高速フェイル - **高度な検索**:最適なツール(ugrep、ripgrep、ag、grep)を自動検出・使用 - **汎用ファイルサポート**:高度なAST解析から基本ファイルインデックスまでの包括的カバレッジ -- **ファイル解析**:構造、インポート、クラス、メソッド、複雑度メトリクスへの深い洞察 +- **ファイル解析**:`build_deep_index` 実行後に構造、インポート、クラス、メソッド、複雑度メトリクスを深く把握 ### 🗂️ **多言語サポート** - **7言語でTree-sitter AST解析**:Python、JavaScript、TypeScript、Java、Go、Objective-C、Zig @@ -81,7 +81,7 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす - **ファイルウォッチャー**:ファイル変更時の自動インデックス更新 - **クロスプラットフォーム**:ネイティブOSファイルシステム監視 - **スマート処理**:急速な変更をバッチ処理して過度な再構築を防止 -- **豊富なメタデータ**:シンボル、参照、定義、関連性をキャプチャ +- **浅いインデックス更新**:ファイル変更を監視して最新のファイル一覧を維持し、シンボルが必要な場合は `build_deep_index` を実行 ### ⚡ **パフォーマンス・効率性** - **Tree-sitter AST解析**:正確なシンボル抽出のためのネイティブ構文解析 @@ -240,15 +240,18 @@ pip install code-index-mcp | ツール | 説明 | |--------|------| | **`set_project_path`** | プロジェクトディレクトリのインデックス作成を初期化 | -| **`refresh_index`** | ファイル変更後にプロジェクトインデックスを再構築 | +| **`refresh_index`** | ファイル変更後に浅いファイルインデックスを再構築 | +| **`build_deep_index`** | 深い解析で使う完全なシンボルインデックスを生成 | | **`get_settings_info`** | 現在のプロジェクト設定と状態を表示 | +*シンボルレベルのデータが必要な場合は `build_deep_index` を実行してください。デフォルトの浅いインデックスは高速なファイル探索を担います。* + ### 🔍 **検索・発見** | ツール | 説明 | |--------|------| | **`search_code_advanced`** | 正規表現、ファジーマッチング、ファイルフィルタリング対応のスマート検索 | | **`find_files`** | globパターンを使用したファイル検索(例:`**/*.py`) | -| **`get_file_summary`** | ファイル構造、関数、インポート、複雑度の解析 | +| **`get_file_summary`** | ファイル構造、関数、インポート、複雑度の解析(深いインデックスが必要) | ### 🔄 **監視・自動更新** | ツール | 説明 | @@ -285,6 +288,7 @@ src/components で全てのTypeScriptコンポーネントファイルを見つ src/api/userService.ts の要約を教えてください ``` *使用ツール:`get_file_summary` で関数、インポート、複雑度を表示* +*ヒント:`needs_deep_index` が返った場合は `build_deep_index` を先に実行してください。* ### 🔍 **高度な検索例** diff --git a/README_ko.md b/README_ko.md new file mode 100644 index 0000000..6995b6a --- /dev/null +++ b/README_ko.md @@ -0,0 +1,284 @@ +# 코드 인덱스 MCP + +
+ +[![MCP Server](https://img.shields.io/badge/MCP-Server-blue)](https://modelcontextprotocol.io) +[![Python](https://img.shields.io/badge/Python-3.10%2B-green)](https://www.python.org/) +[![License](https://img.shields.io/badge/License-MIT-yellow)](LICENSE) + +**대규모 언어 모델을 위한 지능형 코드 인덱싱과 분석** + +고급 검색, 정밀 분석, 유연한 탐색 기능으로 AI가 코드베이스를 이해하고 활용하는 방식을 혁신하세요. + +
+ + + code-index-mcp MCP server + + +## 개요 + +Code Index MCP는 [Model Context Protocol](https://modelcontextprotocol.io) 기반 MCP 서버로, AI 어시스턴트와 복잡한 코드베이스 사이를 연결합니다. 빠른 인덱싱, 강력한 검색, 정밀한 코드 분석을 제공하여 AI가 프로젝트 구조를 정확히 파악하고 효과적으로 지원하도록 돕습니다. + +**이럴 때 안성맞춤:** 코드 리뷰, 리팩터링, 문서화, 디버깅 지원, 아키텍처 분석 + +## 빠른 시작 + +### 🚀 **권장 설정 (대부분의 사용자)** + +어떤 MCP 호환 애플리케이션에서도 몇 단계만으로 시작할 수 있습니다. + +**사전 준비:** Python 3.10+ 및 [uv](https://github.com/astral-sh/uv) + +1. **MCP 설정에 서버 추가** (예: `claude_desktop_config.json` 또는 `~/.claude.json`) + ```json + { + "mcpServers": { + "code-index": { + "command": "uvx", + "args": ["code-index-mcp"] + } + } + } + ``` + +2. **애플리케이션 재시작** – `uvx`가 설치와 실행을 자동으로 처리합니다. + +3. **사용 시작** (AI 어시스턴트에게 아래 프롬프트를 전달) + ``` + 프로젝트 경로를 /Users/dev/my-react-app 으로 설정해줘 + 이 프로젝트에서 모든 TypeScript 파일을 찾아줘 + "authentication" 관련 함수를 검색해줘 + src/App.tsx 파일을 분석해줘 + ``` + +## 대표 사용 사례 + +**코드 리뷰:** "예전 API를 사용하는 부분을 모두 찾아줘" +**리팩터링 지원:** "이 함수는 어디에서 호출되나요?" +**프로젝트 학습:** "이 React 프로젝트의 핵심 컴포넌트를 보여줘" +**디버깅:** "에러 처리 로직이 있는 파일을 찾아줘" + +## 주요 기능 + +### 🧠 **지능형 검색과 분석** +- **듀얼 전략 아키텍처:** 7개 핵심 언어는 전용 tree-sitter 파서를 사용하고, 그 외 50+ 파일 형식은 폴백 전략으로 처리 +- **직접 Tree-sitter 통합:** 특화 언어에 정규식 폴백 없음 – 문제 시 즉시 실패하고 명확한 오류 메시지 제공 +- **고급 검색:** ugrep, ripgrep, ag, grep 중 최적의 도구를 자동 선택해 활용 +- **범용 파일 지원:** 정교한 AST 분석부터 기본 파일 인덱싱까지 폭넓게 커버 +- **파일 분석:** `build_deep_index` 실행 후 구조, 임포트, 클래스, 메서드, 복잡도 지표를 심층적으로 파악 + +### 🗂️ **다중 언어 지원** +- **Tree-sitter AST 분석(7종):** Python, JavaScript, TypeScript, Java, Go, Objective-C, Zig +- **폴백 전략(50+ 형식):** C/C++, Rust, Ruby, PHP 등 대부분의 프로그래밍 언어 지원 +- **문서 및 설정 파일:** Markdown, JSON, YAML, XML 등 상황에 맞는 처리 +- **웹 프론트엔드:** Vue, React, Svelte, HTML, CSS, SCSS +- **데이터 계층:** SQL, NoSQL, 스토어드 프로시저, 마이그레이션 스크립트 +- **구성 파일:** JSON, YAML, XML, Markdown +- **[지원 파일 전체 목록 보기](#지원-파일-형식)** + +### 🔄 **실시간 모니터링 & 자동 새로고침** +- **파일 워처:** 파일 변경 시 자동으로 얕은 인덱스(파일 목록) 갱신 +- **크로스 플랫폼:** 운영체제 기본 파일시스템 이벤트 활용 +- **스마트 처리:** 빠른 변경을 묶어 과도한 재빌드를 방지 +- **얕은 인덱스 갱신:** 파일 목록을 최신 상태로 유지하며, 심볼 데이터가 필요하면 `build_deep_index`를 실행 + +### ⚡ **성능 & 효율성** +- **Tree-sitter AST 파싱:** 정확한 심볼 추출을 위한 네이티브 구문 분석 +- **지속 캐싱:** 인덱스를 저장해 이후 응답 속도를 극대화 +- **스마트 필터링:** 빌드 디렉터리·임시 파일을 자동 제외 +- **메모리 효율:** 대규모 코드베이스를 염두에 둔 설계 +- **직접 의존성:** 불필요한 폴백 없이 명확한 오류 메시지 제공 + +## 지원 파일 형식 + +
+<details>
+<summary>💻 프로그래밍 언어 (클릭하여 확장)</summary>
+
+**전용 Tree-sitter 전략 언어:**
+- **Python** (`.py`, `.pyw`) – 클래스/메서드 추출 및 호출 추적이 포함된 완전 AST 분석
+- **JavaScript** (`.js`, `.jsx`, `.mjs`, `.cjs`) – ES6+ 클래스와 함수를 tree-sitter로 파싱
+- **TypeScript** (`.ts`, `.tsx`) – 인터페이스를 포함한 타입 인지 심볼 추출
+- **Java** (`.java`) – 클래스 계층, 메서드 시그니처, 호출 관계 분석
+- **Go** (`.go`) – 구조체 메서드, 리시버 타입, 함수 분석
+- **Objective-C** (`.m`, `.mm`) – 클래스/인스턴스 메서드를 +/- 표기로 구분
+- **Zig** (`.zig`, `.zon`) – 함수와 구조체를 tree-sitter AST로 분석
+
+**기타 모든 프로그래밍 언어:**
+나머지 언어는 **폴백 파싱 전략**으로 기본 메타데이터와 파일 인덱싱을 제공합니다. 예:
+- **시스템/저수준:** C/C++ (`.c`, `.cpp`, `.h`, `.hpp`), Rust (`.rs`)
+- **객체지향:** C# (`.cs`), Kotlin (`.kt`), Scala (`.scala`), Swift (`.swift`)
+- **스크립트:** Ruby (`.rb`), PHP (`.php`), Shell (`.sh`, `.bash`)
+- **그 외 40+ 형식** – 폴백 전략으로 빠른 탐색 가능
+
+</details>
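+위 7개 언어에 적용되는 "완전 AST 분석"이 어떤 작업인지 보여주는 최소 스케치입니다. `tree-sitter`와 `tree-sitter-python` 패키지(0.22+ 바인딩 기준)가 설치되어 있다고 가정한 예시일 뿐, 실제 인덱서 내부 구현을 그대로 옮긴 것은 아닙니다.
+
+```python
+# 가정 예시: tree-sitter로 Python 소스에서 클래스/함수 정의를 추출
+import tree_sitter_python
+from tree_sitter import Language, Parser
+
+parser = Parser(Language(tree_sitter_python.language()))
+source = b"class Greeter:\n    def hello(self, name):\n        return name\n"
+tree = parser.parse(source)
+
+def walk(node):
+    # 클래스/함수 정의 노드에서 이름 필드와 시작 줄 번호를 읽는다
+    if node.type in ("class_definition", "function_definition"):
+        name = node.child_by_field_name("name")
+        print(node.type, name.text.decode(), "line", node.start_point[0] + 1)
+    for child in node.children:
+        walk(child)
+
+walk(tree.root_node)
+```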
+ +
+<details>
+<summary>🌐 웹 프론트엔드 & UI</summary>
+
+- 프레임워크: Vue (`.vue`), Svelte (`.svelte`), Astro (`.astro`)
+- 스타일링: CSS (`.css`, `.scss`, `.less`, `.sass`, `.stylus`, `.styl`), HTML (`.html`)
+- 템플릿: Handlebars (`.hbs`, `.handlebars`), EJS (`.ejs`), Pug (`.pug`)
+
+</details>
+ +
+🗄️ 데이터 계층 & SQL + +- **SQL 변형:** 표준 SQL (`.sql`, `.ddl`, `.dml`), 데이터베이스별 방언 (`.mysql`, `.postgresql`, `.psql`, `.sqlite`, `.mssql`, `.oracle`, `.ora`, `.db2`) +- **DB 객체:** 프로시저/함수 (`.proc`, `.procedure`, `.func`, `.function`), 뷰/트리거/인덱스 (`.view`, `.trigger`, `.index`) +- **마이그레이션 도구:** 마이그레이션 파일 (`.migration`, `.seed`, `.fixture`, `.schema`), 도구 구성 (`.liquibase`, `.flyway`) +- **NoSQL & 그래프:** 질의 언어 (`.cql`, `.cypher`, `.sparql`, `.gql`) + +
+ +
+<details>
+<summary>📄 문서 & 설정 파일</summary>
+
+- Markdown (`.md`, `.mdx`)
+- 구성 파일 (`.json`, `.xml`, `.yml`, `.yaml`)
+
+</details>
+ +## 사용 가능한 도구 + +### 🏗️ **프로젝트 관리** +| 도구 | 설명 | +|------|------| +| **`set_project_path`** | 프로젝트 디렉터리의 인덱스를 초기화 | +| **`refresh_index`** | 파일 변경 후 얕은 파일 인덱스를 재생성 | +| **`build_deep_index`** | 심층 분석에 사용하는 전체 심볼 인덱스를 생성 | +| **`get_settings_info`** | 현재 프로젝트 설정과 상태를 확인 | + +*심볼 레벨 데이터가 필요하면 `build_deep_index`를 실행하세요. 기본 얕은 인덱스는 빠른 파일 탐색을 담당합니다.* + +### 🔍 **검색 & 탐색** +| 도구 | 설명 | +|------|------| +| **`search_code_advanced`** | 정규식, 퍼지 매칭, 파일 필터링을 지원하는 스마트 검색 | +| **`find_files`** | 글롭 패턴으로 파일 찾기 (예: `**/*.py`) | +| **`get_file_summary`** | 파일 구조, 함수, 임포트, 복잡도를 분석 (심층 인덱스 필요) | + +### 🔄 **모니터링 & 자동 새로고침** +| 도구 | 설명 | +|------|------| +| **`get_file_watcher_status`** | 파일 워처 상태와 구성을 확인 | +| **`configure_file_watcher`** | 자동 새로고침 설정 (활성/비활성, 지연 시간, 추가 제외 패턴) | + +### 🛠️ **시스템 & 유지 관리** +| 도구 | 설명 | +|------|------| +| **`create_temp_directory`** | 인덱스 저장용 임시 디렉터리를 생성 | +| **`check_temp_directory`** | 인덱스 저장 위치와 권한을 확인 | +| **`clear_settings`** | 모든 설정과 캐시 데이터를 초기화 | +| **`refresh_search_tools`** | 사용 가능한 검색 도구를 재검색 (ugrep, ripgrep 등) | + +## 사용 예시 + +### 🧭 **빠른 시작 워크플로** + +**1. 프로젝트 초기화** +``` +프로젝트 경로를 /Users/dev/my-react-app 으로 설정해줘 +``` +*프로젝트를 설정하고 얕은 인덱스를 생성합니다.* + +**2. 프로젝트 구조 탐색** +``` +src/components 안의 TypeScript 컴포넌트 파일을 모두 찾아줘 +``` +*사용 도구: `find_files` (`src/components/**/*.tsx`)* + +**3. 핵심 파일 분석** +``` +src/api/userService.ts 요약을 알려줘 +``` +*사용 도구: `get_file_summary` (함수, 임포트, 복잡도 표시)* +*팁: `needs_deep_index` 응답이 나오면 먼저 `build_deep_index`를 실행하세요.* + +### 🔍 **고급 검색 예시** + +
+<details>
+<summary>코드 패턴 검색</summary>
+
+```
+"get.*Data"에 해당하는 함수 호출을 정규식으로 찾아줘
+```
+*예: `getData()`, `getUserData()`, `getFormData()`*
+*(세 검색 예시의 실제 도구 호출 형태는 아래 스케치 참고)*
+
+</details>
+ +
+<details>
+<summary>퍼지 함수 검색</summary>
+
+```
+'authUser'와 유사한 인증 관련 함수를 찾아줘
+```
+*예: `authenticateUser`, `authUserToken`, `userAuthCheck`*
+
+</details>
+ +
+<details>
+<summary>언어별 검색</summary>
+
+```
+Python 파일에서만 "API_ENDPOINT"를 찾아줘
+```
+*`search_code_advanced` + `file_pattern="*.py"`*
+
+</details>
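+앞의 세 검색 예시를 MCP 클라이언트 코드로 직접 호출하면 대략 아래와 같은 모양이 됩니다. 공식 MCP Python SDK(`mcp` 패키지)를 사용한다고 가정한 스케치이며, `path`·`regex`·`fuzzy` 같은 인자 이름은 문서 설명에서 유추한 가정이므로 실제 도구 스키마를 확인하세요.
+
+```python
+# 가정 예시: MCP Python SDK로 code-index-mcp 도구를 호출
+import asyncio
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+async def main():
+    params = StdioServerParameters(command="uvx", args=["code-index-mcp"])
+    async with stdio_client(params) as (read, write):
+        async with ClientSession(read, write) as session:
+            await session.initialize()
+            # 다른 도구보다 먼저 프로젝트 경로를 설정
+            await session.call_tool("set_project_path", {"path": "/Users/dev/my-react-app"})
+            # 정규식 / 퍼지 / 파일 패턴 검색 (인자 이름은 가정)
+            await session.call_tool("search_code_advanced", {"pattern": "get.*Data", "regex": True})
+            await session.call_tool("search_code_advanced", {"pattern": "authUser", "fuzzy": True})
+            await session.call_tool("search_code_advanced", {"pattern": "API_ENDPOINT", "file_pattern": "*.py"})
+
+asyncio.run(main())
+```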
+ +
+<details>
+<summary>자동 새로고침 설정</summary>
+
+```
+파일 변경 시 자동으로 인덱스를 새로고침하도록 설정해줘
+```
+*`configure_file_watcher`로 활성화 및 지연 시간 설정*
+
+</details>
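+표의 설명(활성/비활성, 지연 시간, 추가 제외 패턴)을 코드로 옮겨 본 호출 스케치입니다. 인자 이름은 모두 설명에서 유추한 가정이며, `session`은 앞 스케치에서 초기화한 MCP 세션을 재사용한다고 가정합니다.
+
+```python
+# 가정 예시: 자동 새로고침을 켜고 디바운스와 제외 패턴을 지정 (인자 이름은 가정)
+async def enable_auto_refresh(session):
+    await session.call_tool("configure_file_watcher", {
+        "enabled": True,                     # 자동 새로고침 활성화
+        "debounce_seconds": 5,               # 연속 변경을 묶어 처리할 지연 시간
+        "additional_excludes": ["dist/**"],  # 추가 제외 패턴
+    })
+```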
+ +
+<details>
+<summary>프로젝트 유지 관리</summary>
+
+```
+새 컴포넌트를 추가했어. 프로젝트 인덱스를 다시 빌드해줘
+```
+*`refresh_index`로 빠르게 얕은 인덱스를 업데이트*
+
+</details>
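+얕은 인덱스와 깊은 인덱스의 역할 분담을 한눈에 보여주는 워크플로 스케치입니다. 역시 앞 스케치의 `session`을 재사용하며, `file_path` 인자 이름은 가정입니다.
+
+```python
+# 가정 예시: 얕은 인덱스 갱신 → 심볼 데이터가 필요하면 깊은 인덱스 생성
+async def summarize_with_deep_index(session, file_path="src/api/userService.ts"):
+    await session.call_tool("refresh_index")  # 파일 목록만 빠르게 갱신
+    result = await session.call_tool("get_file_summary", {"file_path": file_path})
+    if "needs_deep_index" in str(result.content):    # 심볼 인덱스가 아직 없으면
+        await session.call_tool("build_deep_index")  # 전체 심볼 인덱스를 생성하고 재시도
+        result = await session.call_tool("get_file_summary", {"file_path": file_path})
+    return result
+```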
+ +## 문제 해결 + +### 🔄 **자동 새로고침이 동작하지 않을 때** +- 환경 문제로 `watchdog`가 빠졌다면 설치: `pip install watchdog` +- 수동 새로고침: 변경 후 `refresh_index` 도구 실행 +- 워처 상태 확인: `get_file_watcher_status` 도구로 활성 여부 점검 + +## 개발 & 기여 + +### 🛠️ **소스에서 실행하기** +```bash +git clone https://github.com/johnhuang316/code-index-mcp.git +cd code-index-mcp +uv sync +uv run code-index-mcp +``` + +### 🧪 **디버깅 도구** +```bash +npx @modelcontextprotocol/inspector uvx code-index-mcp +``` + +### 🤝 **기여 안내** +Pull Request를 언제든 환영합니다. 변경 사항과 테스트 방법을 함께 공유해주세요. + +--- + +### 📄 **라이선스** +[MIT License](LICENSE) + +### 🌍 **번역본** +- [English](README.md) +- [繁體中文](README_zh.md) +- [日本語](README_ja.md) diff --git a/README_zh.md b/README_zh.md index 5a61fbb..1e9c5ae 100644 --- a/README_zh.md +++ b/README_zh.md @@ -66,7 +66,7 @@ - **直接 Tree-sitter 整合**:專業化語言無正則表達式備用 - 快速失敗並提供清晰錯誤訊息 - **進階搜尋**:自動偵測並使用最佳工具(ugrep、ripgrep、ag 或 grep) - **通用檔案支援**:從進階 AST 解析到基本檔案索引的全面覆蓋 -- **檔案分析**:深入了解結構、匯入、類別、方法和複雜度指標 +- **檔案分析**:執行 `build_deep_index` 後深入了解結構、匯入、類別、方法和複雜度指標 ### 🗂️ **多語言支援** - **7 種語言使用 Tree-sitter AST 解析**:Python、JavaScript、TypeScript、Java、Go、Objective-C、Zig @@ -81,7 +81,7 @@ - **檔案監控器**:檔案變更時自動更新索引 - **跨平台**:原生作業系統檔案系統監控 - **智慧處理**:批次處理快速變更以防止過度重建 -- **豐富元資料**:捕獲符號、引用、定義和關聯性 +- **淺層索引更新**:監控檔案變更並維持檔案清單最新;需要符號資料時請執行 `build_deep_index` ### ⚡ **效能與效率** - **Tree-sitter AST 解析**:原生語法解析以實現準確的符號提取 @@ -240,15 +240,18 @@ pip install code-index-mcp | 工具 | 描述 | |------|------| | **`set_project_path`** | 為專案目錄初始化索引 | -| **`refresh_index`** | 在檔案變更後重建專案索引 | +| **`refresh_index`** | 在檔案變更後重建淺層檔案索引 | +| **`build_deep_index`** | 產生供深度分析使用的完整符號索引 | | **`get_settings_info`** | 檢視目前專案配置和狀態 | +*需要符號層級資料時,請執行 `build_deep_index`;預設的淺層索引提供快速檔案探索。* + ### 🔍 **搜尋與探索** | 工具 | 描述 | |------|------| | **`search_code_advanced`** | 智慧搜尋,支援正規表達式、模糊匹配和檔案篩選 | | **`find_files`** | 使用萬用字元模式尋找檔案(例如 `**/*.py`) | -| **`get_file_summary`** | 分析檔案結構、函式、匯入和複雜度 | +| **`get_file_summary`** | 分析檔案結構、函式、匯入和複雜度(需要深度索引) | ### 🔄 **監控與自動刷新** | 工具 | 描述 | @@ -285,6 +288,7 @@ pip install code-index-mcp 給我 src/api/userService.ts 的摘要 ``` *使用:`get_file_summary` 顯示函式、匯入和複雜度* +*提示:若收到 `needs_deep_index` 回應,請先執行 `build_deep_index`。* ### 🔍 **進階搜尋範例** diff --git a/RELEASE_NOTE.txt b/RELEASE_NOTE.txt new file mode 100644 index 0000000..8a744bb --- /dev/null +++ b/RELEASE_NOTE.txt @@ -0,0 +1,7 @@ +## 2.4.1 - Search Filtering Alignment + +### Highlights +- Code search now shares the central FileFilter blacklist, keeping results consistent with indexing (no more `node_modules` noise). +- CLI search strategies emit the appropriate exclusion flags automatically (ripgrep, ugrep, ag, grep). +- Basic fallback search prunes excluded directories during traversal, avoiding unnecessary IO. +- Added regression coverage for the new filtering behaviour (`tests/search/test_search_filters.py`). diff --git a/SCIP_OFFICIAL_STANDARDS.md b/SCIP_OFFICIAL_STANDARDS.md deleted file mode 100644 index 763b56c..0000000 --- a/SCIP_OFFICIAL_STANDARDS.md +++ /dev/null @@ -1,337 +0,0 @@ -# SCIP (Source Code Intelligence Protocol) Official Standards - -*This document contains only the official SCIP standards as defined by Sourcegraph, without any project-specific implementations.* - -## Overview - -SCIP (pronounced "skip") is a language-agnostic protocol for indexing source code to power code navigation functionality such as Go to definition, Find references, and Find implementations. It is a recursive acronym that stands for "SCIP Code Intelligence Protocol." 
- -**Official Repository**: https://github.com/sourcegraph/scip - -## Core Design Principles (Official) - -### Primary Goals -1. **Support code navigation at IDE-level fidelity** - Provide excellent code navigation experience -2. **Make indexer creation easy** by: - - Enabling cross-repository navigation - - Supporting file-level incremental indexing - - Facilitating parallel indexing - - Supporting multi-language indexer development - -### Design Philosophy -> "SCIP is meant to be a transmission format for sending data from some producers to some consumers -- it is not meant as a storage format for querying." - -### Technical Design Decisions -1. **Protobuf Schema** - - Relatively compact binary format - - Supports easy code generation - - Enables streaming reads/writes - - Maintains forward/backward compatibility - -2. **String-based Identifiers** - - Prefer human-readable string IDs for symbols - - Avoid integer ID mapping tables - - Improve debuggability - - Limit potential bug impact - -3. **Data Encoding Approach** - - Avoid direct graph encoding - - Use document and array-based approaches - - Enable streaming capabilities - - Minimize memory consumption during indexing - -### Non-Goals -- Not focused on code modification tools -- Not optimizing for consumer-side tooling -- Not prioritizing uncompressed data compactness -- Not serving as a standalone query engine - -## Protocol Buffer Schema (Official) - -### Main Message Types - -```protobuf -syntax = "proto3"; -package scip; - -message Index { - Metadata metadata = 1; - repeated Document documents = 2; - repeated SymbolInformation external_symbols = 3; -} - -message Metadata { - ProtocolVersion version = 1; - ToolInfo tool_info = 2; - string project_root = 3; - TextEncoding text_encoding = 4; -} - -message Document { - string language = 4; - string relative_path = 1; - repeated Occurrence occurrences = 2; - repeated SymbolInformation symbols = 3; - string text = 5; -} - -message Symbol { - string scheme = 1; - Package package = 2; - repeated Descriptor descriptors = 3; -} - -message SymbolInformation { - string symbol = 1; - repeated string documentation = 3; - repeated Relationship relationships = 4; - SymbolKind kind = 5; - string display_name = 6; - Signature signature_documentation = 7; - repeated string enclosing_symbol = 8; -} - -message Occurrence { - Range range = 1; - string symbol = 2; - int32 symbol_roles = 3; - repeated Diagnostic override_documentation = 4; - SyntaxKind syntax_kind = 5; -} - -message Range { - repeated int32 start = 1; // [line, column] - repeated int32 end = 2; // [line, column] -} -``` - -## Official Symbol Format Specification - -### Symbol Grammar (Official) -``` - ::= ' ' ' ' ()+ | 'local ' - ::= ' ' ' ' - ::= UTF-8 string (escape spaces with double space) - ::= | | | | | | | -``` - -### Symbol Components - -**Scheme**: Identifies the symbol's origin/context -- UTF-8 string -- Escape spaces with double space - -**Package**: Includes manager, name, and version -- Manager: Package manager identifier -- Package name: Unique package identifier -- Version: Package version - -**Descriptors**: Represent nested/hierarchical symbol structure -- Form a fully qualified name -- Support various symbol types - -**Local Symbols**: Only for entities within a single Document -- Format: `local ` -- Used for file-scoped symbols - -### Encoding Rules (Official) -- Descriptors form a fully qualified name -- Local symbols are only for entities within a single Document -- Symbols must uniquely identify an entity across 
a package -- Supports escaping special characters in identifiers - -## Enumerations (Official) - -### ProtocolVersion -```protobuf -enum ProtocolVersion { - UnspecifiedProtocolVersion = 0; -} -``` - -### TextEncoding -```protobuf -enum TextEncoding { - UnspecifiedTextEncoding = 0; - UTF8 = 1; - UTF16 = 2; -} -``` - -### SymbolRole -```protobuf -enum SymbolRole { - UnspecifiedSymbolRole = 0; - Definition = 1; - Import = 2; - WriteAccess = 4; - ReadAccess = 8; - Generated = 16; - Test = 32; -} -``` - -### SymbolKind -```protobuf -enum SymbolKind { - UnspecifiedSymbolKind = 0; - Array = 1; - Boolean = 2; - Class = 3; - Constant = 4; - Constructor = 5; - Enum = 6; - EnumMember = 7; - Event = 8; - Field = 9; - File = 10; - Function = 11; - Interface = 12; - Key = 13; - Method = 14; - Module = 15; - Namespace = 16; - Null = 17; - Number = 18; - Object = 19; - Operator = 20; - Package = 21; - Property = 22; - String = 23; - Struct = 24; - TypeParameter = 25; - Variable = 26; - Macro = 27; -} -``` - -### SyntaxKind -```protobuf -enum SyntaxKind { - UnspecifiedSyntaxKind = 0; - Comment = 1; - PunctuationDelimiter = 2; - PunctuationBracket = 3; - Keyword = 4; - // ... (additional syntax kinds) - IdentifierKeyword = 13; - IdentifierOperator = 14; - IdentifierBuiltin = 15; - IdentifierNull = 16; - IdentifierConstant = 17; - IdentifierMutableGlobal = 18; - IdentifierParameter = 19; - IdentifierLocal = 20; - IdentifierShadowed = 21; - IdentifierNamespace = 22; - IdentifierFunction = 23; - IdentifierFunctionDefinition = 24; - IdentifierMacro = 25; - IdentifierMacroDefinition = 26; - IdentifierType = 27; - IdentifierBuiltinType = 28; - IdentifierAttribute = 29; -} -``` - -## Official Position and Range Specification - -### Coordinate System -- **Line numbers**: 0-indexed -- **Column numbers**: 0-indexed character positions -- **UTF-8/UTF-16 aware**: Proper Unicode handling - -### Range Format -```protobuf -message Range { - repeated int32 start = 1; // [line, column] - repeated int32 end = 2; // [line, column] -} -``` - -### Requirements -- Start position must be <= end position -- Ranges must be within document boundaries -- Character-level precision required - -## Official Language Support - -### Currently Supported (Official Implementations) -- **TypeScript/JavaScript**: scip-typescript -- **Java**: scip-java (also supports Scala, Kotlin) -- **Python**: In development - -### Language Bindings Available -- **Rich bindings**: Go, Rust -- **Auto-generated bindings**: TypeScript, Haskell -- **CLI tools**: scip CLI for index manipulation - -## Performance Characteristics (Official Claims) - -### Compared to LSIF -- **10x speedup** in CI environments -- **4x smaller** compressed payload size -- **Better streaming**: Enables processing without loading entire index -- **Lower memory usage**: Document-based processing - -### Design Benefits -- Static typing from Protobuf schema -- More ergonomic debugging -- Reduced runtime errors -- Smaller index files - -## Official Tools and Ecosystem - -### SCIP CLI -- Index manipulation and conversion -- LSIF compatibility support -- Debugging and inspection tools - -### Official Indexers -- **scip-typescript**: `npm install -g @sourcegraph/scip-typescript` -- **scip-java**: Available as Docker image, Java launcher, fat jar - -### Integration Support -- GitLab Code Intelligence (via LSIF conversion) -- Sourcegraph native support -- VS Code extensions (community) - -## Standards Compliance Requirements - -### For SCIP Index Producers -1. 
Must generate valid Protocol Buffer format -2. Must follow symbol ID format specification -3. Must provide accurate position information -4. Should support streaming output -5. Must handle UTF-8/UTF-16 encoding correctly - -### For SCIP Index Consumers -1. Must handle streaming input -2. Should support all standard symbol kinds -3. Must respect symbol role classifications -4. Should provide graceful error handling -5. Must support position range validation - -## Official Documentation Sources - -### Primary Sources -- **Main Repository**: https://github.com/sourcegraph/scip -- **Protocol Schema**: https://github.com/sourcegraph/scip/blob/main/scip.proto -- **Design Document**: https://github.com/sourcegraph/scip/blob/main/DESIGN.md -- **Announcement Blog**: https://sourcegraph.com/blog/announcing-scip - -### Language-Specific Documentation -- **Java**: https://github.com/sourcegraph/scip-java -- **TypeScript**: https://github.com/sourcegraph/scip-typescript - -### Community Resources -- **Bindings**: Available for Go, Rust, TypeScript, Haskell -- **Examples**: Implementation examples in official repositories -- **Issues**: Bug reports and feature requests on GitHub - ---- - -*This document contains only official SCIP standards as defined by Sourcegraph.* -*Last updated: 2025-01-14* -*SCIP Version: Compatible with official v0.3.x specification* -*Source: Official Sourcegraph SCIP repositories and documentation* \ No newline at end of file diff --git a/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md b/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md deleted file mode 100644 index 25d4e8c..0000000 --- a/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md +++ /dev/null @@ -1,372 +0,0 @@ -# SCIPSymbolAnalyzer Refactoring Plan - -## 🎯 Overview - -This document outlines a comprehensive refactoring plan for the `SCIPSymbolAnalyzer` class to transform it from a monolithic architecture into a modular, extensible, and maintainable system that supports multiple programming languages with proper separation of concerns. - -## 🔍 Current Architecture Problems - -### 1. **Monolithic Design Issues** -- All language-specific logic is mixed within a single class -- The `_extract_imports` method contains Python, Objective-C, and Zig-specific logic -- Lack of extensibility - adding new languages requires modifying the core class -- Violation of Single Responsibility Principle - -### 2. **Dependency Processing Chaos** -- Methods like `_classify_zig_import`, `_categorize_import` are scattered throughout the codebase -- No unified dependency classification standard -- Language-specific standard library lists are hardcoded -- Inconsistent dependency type mapping - -### 3. **Symbol Resolution Complexity** -- Position detection logic is complex and error-prone -- Three-layer position detection strategy is difficult to maintain -- Symbol ID parsing logic lacks flexibility -- Mixed concerns between symbol extraction and position calculation - -### 4. 
**Poor Language Support Scalability** -- Each new language requires core class modifications -- No clear plugin architecture -- Language-specific logic embedded in generic methods -- Difficult to test language-specific features in isolation - -## 🏗️ Proposed Refactoring Architecture - -### Phase 1: Language Plugin System - -```python -# New architecture design -class LanguageAnalyzer(ABC): - """Language-specific analyzer interface""" - - @abstractmethod - def extract_imports(self, document, imports: ImportGroup) -> None: - """Extract import information from SCIP document""" - - @abstractmethod - def classify_dependency(self, module_name: str) -> str: - """Classify dependency as standard_library, third_party, or local""" - - @abstractmethod - def extract_symbol_metadata(self, symbol_info) -> Dict[str, Any]: - """Extract language-specific symbol metadata""" - - @abstractmethod - def get_standard_library_modules(self) -> Set[str]: - """Return set of standard library module names""" - -class ZigAnalyzer(LanguageAnalyzer): - """Zig language-specific analyzer""" - -class PythonAnalyzer(LanguageAnalyzer): - """Python language-specific analyzer""" - -class ObjectiveCAnalyzer(LanguageAnalyzer): - """Objective-C language-specific analyzer""" - -class LanguageAnalyzerFactory: - """Factory for creating language-specific analyzers""" - - def get_analyzer(self, language: str) -> LanguageAnalyzer: - """Get appropriate analyzer for language""" -``` - -### Phase 2: Dependency Management System - -```python -class DependencyClassifier: - """Unified dependency classification system""" - - def __init__(self): - self.language_configs = { - 'python': PythonDependencyConfig(), - 'zig': ZigDependencyConfig(), - 'javascript': JavaScriptDependencyConfig() - } - - def classify_import(self, import_path: str, language: str) -> str: - """Classify import based on language-specific rules""" - -class DependencyConfig(ABC): - """Language-specific dependency configuration""" - - @abstractmethod - def get_stdlib_modules(self) -> Set[str]: - """Return standard library modules for this language""" - - @abstractmethod - def classify_import(self, import_path: str) -> str: - """Classify import path for this language""" - - @abstractmethod - def normalize_import_path(self, raw_path: str) -> str: - """Normalize import path for consistent processing""" -``` - -### Phase 3: Position Resolution System - -```python -class PositionResolver: - """Unified symbol position resolution system""" - - def __init__(self): - self.strategies = [ - SCIPOccurrenceStrategy(), # High confidence - TreeSitterStrategy(), # Medium confidence - HeuristicStrategy() # Fallback - ] - - def resolve_position(self, symbol, document) -> LocationInfo: - """Resolve symbol position using strategy pattern""" - -class PositionStrategy(ABC): - """Base class for position resolution strategies""" - - @abstractmethod - def try_resolve(self, symbol, document) -> Optional[LocationInfo]: - """Attempt to resolve symbol position""" - - @abstractmethod - def get_confidence_level(self) -> str: - """Return confidence level: 'high', 'medium', 'low'""" -``` - -## 📋 Detailed Implementation Plan - -### **Phase 1: Architecture Separation (Week 1)** - -#### 1.1 Create Language Analyzer Interface -``` -src/code_index_mcp/tools/scip/analyzers/ -├── base.py # Base interfaces and common utilities -├── python_analyzer.py # Python-specific analysis logic -├── zig_analyzer.py # Zig-specific analysis logic -├── objc_analyzer.py # Objective-C-specific analysis logic -├── javascript_analyzer.py 
# JavaScript/TypeScript analysis logic -└── factory.py # Analyzer factory and registry -``` - -**Tasks:** -- [ ] Define `LanguageAnalyzer` abstract base class -- [ ] Extract Python-specific logic to `PythonAnalyzer` -- [ ] Move Zig logic from current implementation to `ZigAnalyzer` -- [ ] Migrate Objective-C logic to `ObjectiveCAnalyzer` -- [ ] Create factory pattern for analyzer instantiation - -#### 1.2 Extract Language-Specific Logic -- [ ] Move `_classify_zig_import` to `ZigAnalyzer` -- [ ] Move Python stdlib detection to `PythonAnalyzer` -- [ ] Move Objective-C framework detection to `ObjectiveCAnalyzer` -- [ ] Create language-specific symbol metadata extraction - -### **Phase 2: Dependency Processing Refactoring (Week 2)** - -#### 2.1 Create Dependency Management Module -``` -src/code_index_mcp/tools/scip/dependencies/ -├── classifier.py # Main dependency classifier -├── configs/ # Language-specific configurations -│ ├── __init__.py -│ ├── python.py # Python dependency rules -│ ├── zig.py # Zig dependency rules -│ ├── javascript.py # JavaScript dependency rules -│ └── base.py # Base configuration class -├── registry.py # Dependency registry and caching -└── normalizer.py # Import path normalization -``` - -**Tasks:** -- [ ] Create unified `DependencyClassifier` class -- [ ] Implement language-specific configuration classes -- [ ] Standardize dependency type constants -- [ ] Add configurable standard library lists -- [ ] Implement caching for dependency classification results - -#### 2.2 Standardize Dependency Classification -- [ ] Define consistent classification types: `standard_library`, `third_party`, `local` -- [ ] Create configurable standard library lists per language -- [ ] Support custom classification rules -- [ ] Implement dependency version detection where applicable - -### **Phase 3: Symbol Resolution Refactoring (Week 3)** - -#### 3.1 Modularize Position Detection -``` -src/code_index_mcp/tools/scip/position/ -├── resolver.py # Main position resolver -├── strategies/ # Position detection strategies -│ ├── __init__.py -│ ├── scip_occurrence.py # SCIP occurrence-based detection -│ ├── tree_sitter.py # Tree-sitter AST-based detection -│ ├── heuristic.py # Heuristic fallback detection -│ └── base.py # Base strategy interface -├── calculator.py # Position calculation utilities -└── confidence.py # Confidence level management -``` - -**Tasks:** -- [ ] Implement strategy pattern for position resolution -- [ ] Separate SCIP occurrence processing logic -- [ ] Extract tree-sitter position calculation -- [ ] Create heuristic fallback mechanisms -- [ ] Add confidence level tracking - -#### 3.2 Improve Symbol Parsing -- [ ] Refactor `_extract_name_from_scip_symbol` method -- [ ] Unify Symbol ID format processing -- [ ] Support additional SCIP symbol formats -- [ ] Add robust error handling for malformed symbols - -### **Phase 4: Relationship Analysis Refactoring (Week 4)** - -#### 4.1 Separate Relationship Analysis Logic -``` -src/code_index_mcp/tools/scip/relationships/ -├── analyzer.py # Main relationship analyzer -├── types.py # Relationship type definitions -├── builder.py # Relationship construction logic -├── extractors/ # Relationship extraction strategies -│ ├── __init__.py -│ ├── call_extractor.py # Function call relationships -│ ├── inheritance_extractor.py # Class inheritance -│ └── reference_extractor.py # Symbol references -└── formatter.py # Relationship output formatting -``` - -**Tasks:** -- [ ] Extract relationship analysis from main analyzer -- [ ] Implement 
relationship type system -- [ ] Create relationship builders for different types -- [ ] Add relationship validation logic - -#### 4.2 Optimize Relationship Detection -- [ ] Improve function call detection accuracy -- [ ] Support additional relationship types (inheritance, interfaces, etc.) -- [ ] Add cross-file relationship resolution -- [ ] Implement relationship confidence scoring - -### **Phase 5: Integration and Testing (Week 5)** - -#### 5.1 Integrate New Architecture -- [ ] Update `SCIPSymbolAnalyzer` to use new plugin system -- [ ] Create adapter layer for backward compatibility -- [ ] Update configuration and initialization logic -- [ ] Add performance monitoring - -#### 5.2 Comprehensive Testing -- [ ] Unit tests for each language analyzer -- [ ] Integration tests for dependency classification -- [ ] Position resolution accuracy tests -- [ ] Performance benchmark tests -- [ ] Memory usage optimization tests - -## 🎯 Refactoring Goals - -### **Maintainability Improvements** -- ✅ **Single Responsibility**: Each class focuses on specific functionality -- ✅ **Open/Closed Principle**: Easy to add new language support without modifying existing code -- ✅ **Dependency Injection**: Components are replaceable and testable -- ✅ **Clear Separation of Concerns**: Position detection, dependency classification, and symbol analysis are separate - -### **Performance Optimizations** -- ✅ **Lazy Loading**: Only load required language analyzers -- ✅ **Caching Mechanisms**: Cache symbol resolution and dependency classification results -- ✅ **Parallel Processing**: Support multi-file parallel analysis -- ✅ **Memory Efficiency**: Reduce memory footprint through better data structures - -### **Extensibility Features** -- ✅ **Plugin System**: Third-party language support through plugins -- ✅ **Configuration-Driven**: Configurable analysis rules and standards -- ✅ **Stable API**: Backward-compatible interfaces -- ✅ **Language Agnostic Core**: Core logic independent of specific languages - -## 🧪 Testing Strategy - -### **Unit Testing Coverage** -- [ ] Each language analyzer tested independently -- [ ] Dependency classifier comprehensive test suite -- [ ] Position resolver strategy tests -- [ ] Symbol parsing edge case tests -- [ ] Relationship extraction validation tests - -### **Integration Testing** -- [ ] Cross-language analysis scenarios -- [ ] End-to-end file analysis workflows -- [ ] SCIP compliance validation -- [ ] Performance regression testing - -### **Regression Testing** -- [ ] Existing functionality preservation -- [ ] Zig dependency processing validation -- [ ] Python analysis accuracy maintenance -- [ ] Objective-C framework detection consistency - -## 📈 Success Metrics - -### **Code Quality Improvements** -- **Cyclomatic Complexity**: Reduce from current >50 to <10 per method -- **Test Coverage**: Achieve >90% code coverage -- **Maintainability Index**: Improve from current score to >80 - -### **Performance Targets** -- **Analysis Speed**: <500ms per file (currently ~2s) -- **Memory Usage**: <50MB for 1000-file project (currently ~200MB) -- **Accuracy**: >95% symbol position accuracy - -### **Extensibility Goals** -- **New Language Addition**: <2 hours to add basic support -- **Plugin Development**: Third-party plugin support -- **Configuration Flexibility**: Runtime configuration changes - -## 🚀 Migration Plan - -### **Phase 1: Preparation (Week 1)** -- Create new module structure -- Implement base interfaces -- Set up testing framework - -### **Phase 2: Gradual Migration (Weeks 2-4)** -- 
Migrate one language at a time -- Maintain backward compatibility -- Add comprehensive tests for each component - -### **Phase 3: Integration (Week 5)** -- Integrate all components -- Performance optimization -- Final testing and validation - -### **Phase 4: Documentation and Cleanup (Week 6)** -- Update documentation -- Remove deprecated code -- Finalize API documentation - -## 🔧 Implementation Notes - -### **Backward Compatibility** -- Maintain existing public API during transition -- Create adapter layer for legacy code -- Gradual deprecation of old methods - -### **Configuration Management** -- Use dependency injection for configurability -- Support runtime configuration updates -- Provide sensible defaults for all languages - -### **Error Handling** -- Implement comprehensive error handling at each layer -- Provide detailed error messages for debugging -- Graceful degradation when analyzers fail - -### **Logging and Monitoring** -- Add structured logging throughout the system -- Implement performance metrics collection -- Create debugging tools for complex analysis scenarios - ---- - -**Status**: 📋 Planning Phase -**Priority**: 🔥 High -**Estimated Effort**: 6 weeks -**Dependencies**: None - -This refactoring will establish a solid foundation for supporting additional programming languages and maintaining high code quality as the system grows. \ No newline at end of file diff --git a/benchmark_scip_framework.py b/benchmark_scip_framework.py deleted file mode 100644 index 88d05f5..0000000 --- a/benchmark_scip_framework.py +++ /dev/null @@ -1,1017 +0,0 @@ -"""SCIP Framework Performance Benchmark Suite - Comprehensive performance testing and analysis.""" - -import os -import time -import tempfile -import statistics -import gc -import psutil -import threading -from pathlib import Path -from typing import Dict, List, Any, Tuple, Optional -from dataclasses import dataclass, asdict -from concurrent.futures import ThreadPoolExecutor, as_completed - -from src.code_index_mcp.scip.framework import ( - SCIPFrameworkAPI, SCIPConfig, create_scip_framework, - PythonSCIPIndexFactory, JavaScriptSCIPIndexFactory, JavaSCIPIndexFactory, - SCIPCacheManager, StreamingIndexer -) - - -@dataclass -class BenchmarkResult: - """Benchmark result data structure.""" - test_name: str - file_count: int - total_time: float - memory_usage_mb: float - symbols_generated: int - occurrences_generated: int - cache_hit_rate: float - throughput_files_per_sec: float - throughput_symbols_per_sec: float - error_count: int - additional_metrics: Dict[str, Any] - - -@dataclass -class SystemMetrics: - """System resource metrics.""" - cpu_percent: float - memory_percent: float - memory_available_mb: float - disk_io_read_mb: float - disk_io_write_mb: float - - -class PerformanceMonitor: - """Real-time performance monitoring during benchmarks.""" - - def __init__(self): - self.monitoring = False - self.metrics_history: List[SystemMetrics] = [] - self.monitor_thread: Optional[threading.Thread] = None - self.process = psutil.Process() - - def start_monitoring(self, interval: float = 0.5): - """Start performance monitoring.""" - self.monitoring = True - self.metrics_history.clear() - self.monitor_thread = threading.Thread(target=self._monitor_loop, args=(interval,)) - self.monitor_thread.daemon = True - self.monitor_thread.start() - - def stop_monitoring(self) -> List[SystemMetrics]: - """Stop monitoring and return collected metrics.""" - self.monitoring = False - if self.monitor_thread: - self.monitor_thread.join(timeout=2.0) - return 
self.metrics_history.copy() - - def _monitor_loop(self, interval: float): - """Monitor system metrics in a loop.""" - while self.monitoring: - try: - # Get current metrics - memory_info = self.process.memory_info() - - metrics = SystemMetrics( - cpu_percent=self.process.cpu_percent(), - memory_percent=self.process.memory_percent(), - memory_available_mb=memory_info.rss / 1024 / 1024, - disk_io_read_mb=0.0, # Simplified for demo - disk_io_write_mb=0.0 - ) - - self.metrics_history.append(metrics) - time.sleep(interval) - - except Exception as e: - print(f"Monitoring error: {e}") - break - - -class SCIPFrameworkBenchmark: - """Comprehensive benchmark suite for SCIP framework.""" - - def __init__(self): - self.results: List[BenchmarkResult] = [] - self.monitor = PerformanceMonitor() - - def run_all_benchmarks(self) -> Dict[str, Any]: - """Run complete benchmark suite.""" - print("=== SCIP Framework Performance Benchmark Suite ===") - print(f"System: {psutil.cpu_count()} CPUs, {psutil.virtual_memory().total // 1024**3} GB RAM") - - with tempfile.TemporaryDirectory() as temp_dir: - # Create test projects of various sizes - small_project = self.create_test_project(temp_dir, "small", 50) - medium_project = self.create_test_project(temp_dir, "medium", 200) - large_project = self.create_test_project(temp_dir, "large", 1000) - - # Run benchmarks - benchmark_suite = [ - ("Small Project (50 files)", small_project, {'max_workers': 2, 'batch_size': 10}), - ("Medium Project (200 files)", medium_project, {'max_workers': 4, 'batch_size': 50}), - ("Large Project (1000 files)", large_project, {'max_workers': 8, 'batch_size': 100}), - ] - - for test_name, project_path, config_overrides in benchmark_suite: - print(f"\n🏃 Running: {test_name}") - - # Basic index generation benchmark - result = self.benchmark_index_generation(test_name, project_path, config_overrides) - self.results.append(result) - - # Caching performance benchmark - cache_result = self.benchmark_caching_performance(f"{test_name} - Caching", project_path, config_overrides) - self.results.append(cache_result) - - # Streaming performance benchmark - streaming_result = self.benchmark_streaming_performance(f"{test_name} - Streaming", project_path, config_overrides) - self.results.append(streaming_result) - - # Multi-language benchmark - multi_lang_project = self.create_multi_language_project(temp_dir) - multi_result = self.benchmark_multi_language(multi_lang_project) - self.results.append(multi_result) - - # Memory stress test - memory_result = self.benchmark_memory_usage(large_project) - self.results.append(memory_result) - - # Concurrent processing benchmark - concurrent_result = self.benchmark_concurrent_processing(medium_project) - self.results.append(concurrent_result) - - # Generate comprehensive report - return self.generate_benchmark_report() - - def create_test_project(self, base_dir: str, project_name: str, file_count: int) -> str: - """Create test project with specified number of files.""" - project_dir = os.path.join(base_dir, project_name) - os.makedirs(project_dir, exist_ok=True) - - # Generate Python files with varying complexity - for i in range(file_count): - file_path = os.path.join(project_dir, f"module_{i:04d}.py") - content = self.generate_python_file_content(i, file_count) - - with open(file_path, 'w', encoding='utf-8') as f: - f.write(content) - - return project_dir - - def create_multi_language_project(self, base_dir: str) -> str: - """Create project with multiple programming languages.""" - project_dir = 
os.path.join(base_dir, "multi_language") - os.makedirs(project_dir, exist_ok=True) - - # Python files - for i in range(30): - file_path = os.path.join(project_dir, f"python_module_{i}.py") - with open(file_path, 'w') as f: - f.write(self.generate_python_file_content(i, 30)) - - # JavaScript files - for i in range(20): - file_path = os.path.join(project_dir, f"js_module_{i}.js") - with open(file_path, 'w') as f: - f.write(self.generate_javascript_file_content(i)) - - # Java files - for i in range(15): - file_path = os.path.join(project_dir, f"JavaClass_{i}.java") - with open(file_path, 'w') as f: - f.write(self.generate_java_file_content(i)) - - return project_dir - - def generate_python_file_content(self, file_index: int, total_files: int) -> str: - """Generate Python file content with realistic complexity.""" - imports_count = min(5, file_index % 8 + 1) - classes_count = file_index % 3 + 1 - functions_count = file_index % 5 + 2 - - content = f'"""Module {file_index} - Generated for performance testing."""\n\n' - - # Add imports - for i in range(imports_count): - import_target = f"module_{(file_index + i) % total_files:04d}" - content += f"from {import_target} import Class{i}, function_{i}\n" - - content += "\nimport os\nimport sys\nfrom typing import List, Dict, Optional\n\n" - - # Add classes - for class_i in range(classes_count): - content += f''' -class Class{file_index}_{class_i}: - """Test class {class_i} in module {file_index}.""" - - def __init__(self, value: int = 0): - self.value = value - self.data: Dict[str, int] = {{}} - self.items: List[str] = [] - - def process_data(self, input_data: List[int]) -> Dict[str, int]: - """Process input data and return results.""" - result = {{}} - for i, item in enumerate(input_data): - key = f"item_{{i}}" - result[key] = item * self.value - return result - - def calculate_total(self, multiplier: float = 1.0) -> float: - """Calculate total value.""" - return sum(self.data.values()) * multiplier - - def add_item(self, item: str) -> None: - """Add item to collection.""" - if item not in self.items: - self.items.append(item) - - @property - def item_count(self) -> int: - """Get number of items.""" - return len(self.items) -''' - - # Add functions - for func_i in range(functions_count): - content += f''' -def function_{file_index}_{func_i}(param1: int, param2: str = "default") -> Tuple[int, str]: - """Function {func_i} in module {file_index}.""" - processed_value = param1 * {func_i + 1} - processed_string = f"{{param2}}_{{processed_value}}" - - # Some processing logic - if processed_value > 100: - processed_value = processed_value // 2 - - return processed_value, processed_string - -def helper_function_{file_index}_{func_i}(data: List[Any]) -> Optional[Any]: - """Helper function for function_{func_i}.""" - if not data: - return None - - return data[0] if len(data) == 1 else data -''' - - # Add module-level variables - content += f''' -# Module-level variables -MODULE_ID = {file_index} -MODULE_NAME = "module_{file_index:04d}" -DEFAULT_CONFIG = {{ - "enabled": True, - "max_items": {file_index * 10 + 100}, - "timeout": {file_index * 2 + 30} -}} -''' - - return content - - def generate_javascript_file_content(self, file_index: int) -> str: - """Generate JavaScript file content.""" - return f''' -// JavaScript module {file_index} for performance testing -const express = require('express'); -const {{ EventEmitter }} = require('events'); - -class Service{file_index} extends EventEmitter {{ - constructor(config = {{}}) {{ - super(); - this.config = config; 
- this.data = new Map(); - this.active = false; - }} - - async initialize() {{ - this.active = true; - this.emit('initialized', {{ serviceId: {file_index} }}); - }} - - processData(input) {{ - const result = []; - for (const item of input) {{ - result.push({{ - id: item.id, - value: item.value * {file_index}, - timestamp: Date.now() - }}); - }} - return result; - }} - - async asyncOperation(delay = 100) {{ - return new Promise(resolve => {{ - setTimeout(() => {{ - resolve({{ result: 'completed', serviceId: {file_index} }}); - }}, delay); - }}); - }} -}} - -function helper{file_index}(data) {{ - return data.map(item => ({{ - ...item, - processed: true, - serviceId: {file_index} - }})); -}} - -const config{file_index} = {{ - serviceId: {file_index}, - enabled: true, - maxConnections: {file_index * 10 + 50} -}}; - -module.exports = {{ - Service{file_index}, - helper{file_index}, - config{file_index} -}}; -''' - - def generate_java_file_content(self, file_index: int) -> str: - """Generate Java file content.""" - return f''' -package com.benchmark.test; - -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.time.LocalDateTime; - -/** - * Test class {file_index} for performance benchmarking. - * Demonstrates various Java language features. - */ -public class JavaClass_{file_index} {{ - private final int classId; - private final Map<String, Object> data; - private final List<String> items; - private boolean active; - - /** - * Constructor for JavaClass_{file_index}. - * - * @param classId Unique identifier for this class - */ - public JavaClass_{file_index}(int classId) {{ - this.classId = classId; - this.data = new ConcurrentHashMap<>(); - this.items = new ArrayList<>(); - this.active = false; - }} - - /** - * Initialize the class with default values. - */ - public void initialize() {{ - this.active = true; - this.data.put("initialized", LocalDateTime.now()); - this.data.put("classId", this.classId); - }} - - /** - * Process a list of integers and return results. - * - * @param input List of integers to process - * @return Map of processed results - */ - public Map<String, Integer> processNumbers(List<Integer> input) {{ - Map<String, Integer> results = new HashMap<>(); - - for (int i = 0; i < input.size(); i++) {{ - String key = "result_" + i; - Integer value = input.get(i) * {file_index} + i; - results.put(key, value); - }} - - return results; - }} - - /** - * Add item to the collection. - * - * @param item Item to add - * @return true if item was added, false if it already exists - */ - public boolean addItem(String item) {{ - if (item == null || item.trim().isEmpty()) {{ - return false; - }} - - if (!items.contains(item)) {{ - items.add(item); - return true; - }} - - return false; - }} - - /** - * Get total count of items. - * - * @return Number of items in collection - */ - public int getItemCount() {{ - return items.size(); - }} - - /** - * Check if class is active. - * - * @return true if active, false otherwise - */ - public boolean isActive() {{ - return active; - }} - - /** - * Set active status. 
- * - * @param active New active status - */ - public void setActive(boolean active) {{ - this.active = active; - if (active) {{ - data.put("lastActivated", LocalDateTime.now()); - }} - }} - - @Override - public String toString() {{ - return String.format("JavaClass_%d{{classId=%d, active=%s, items=%d}}", - {file_index}, classId, active, items.size()); - }} - - @Override - public boolean equals(Object obj) {{ - if (this == obj) return true; - if (obj == null || getClass() != obj.getClass()) return false; - JavaClass_{file_index} other = (JavaClass_{file_index}) obj; - return classId == other.classId; - }} - - @Override - public int hashCode() {{ - return Objects.hash(classId); - }} -}} -''' - - def benchmark_index_generation(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult: - """Benchmark basic index generation performance.""" - print(f" 📊 Index generation benchmark...") - - # Configure framework - config = SCIPConfig( - project_root=project_path, - cache_enabled=False, # Disable cache for pure generation benchmark - validate_compliance=True, - **config_overrides - ) - - framework = SCIPFrameworkAPI(config) - - # Count files - file_count = len(list(Path(project_path).rglob("*.py"))) - - # Start monitoring - self.monitor.start_monitoring() - - # Run benchmark - start_time = time.time() - start_memory = psutil.Process().memory_info().rss / 1024 / 1024 - - try: - index = framework.create_complete_index() - - end_time = time.time() - end_memory = psutil.Process().memory_info().rss / 1024 / 1024 - - # Stop monitoring - metrics_history = self.monitor.stop_monitoring() - - # Calculate metrics - total_time = end_time - start_time - memory_usage = end_memory - start_memory - - symbols_count = sum(len(doc.symbols) for doc in index.documents) - occurrences_count = sum(len(doc.occurrences) for doc in index.occurrences) - - throughput_files = file_count / total_time if total_time > 0 else 0 - throughput_symbols = symbols_count / total_time if total_time > 0 else 0 - - # Additional metrics - avg_cpu = statistics.mean([m.cpu_percent for m in metrics_history]) if metrics_history else 0 - peak_memory = max([m.memory_available_mb for m in metrics_history]) if metrics_history else end_memory - - result = BenchmarkResult( - test_name=test_name, - file_count=file_count, - total_time=total_time, - memory_usage_mb=memory_usage, - symbols_generated=symbols_count, - occurrences_generated=occurrences_count, - cache_hit_rate=0.0, # No cache in this test - throughput_files_per_sec=throughput_files, - throughput_symbols_per_sec=throughput_symbols, - error_count=0, - additional_metrics={ - 'avg_cpu_percent': avg_cpu, - 'peak_memory_mb': peak_memory, - 'documents_generated': len(index.documents), - 'external_symbols': len(index.external_symbols) - } - ) - - print(f" ✓ {file_count} files, {symbols_count} symbols in {total_time:.2f}s") - print(f" ✓ {throughput_files:.1f} files/sec, {throughput_symbols:.1f} symbols/sec") - - return result - - except Exception as e: - self.monitor.stop_monitoring() - print(f" ❌ Benchmark failed: {e}") - - return BenchmarkResult( - test_name=f"{test_name} (FAILED)", - file_count=file_count, - total_time=0, - memory_usage_mb=0, - symbols_generated=0, - occurrences_generated=0, - cache_hit_rate=0.0, - throughput_files_per_sec=0, - throughput_symbols_per_sec=0, - error_count=1, - additional_metrics={'error': str(e)} - ) - - def benchmark_caching_performance(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult: - """Benchmark 
caching system performance.""" - print(f" 🗂️ Caching performance benchmark...") - - config = SCIPConfig( - project_root=project_path, - cache_enabled=True, - **config_overrides - ) - - framework = SCIPFrameworkAPI(config) - file_count = len(list(Path(project_path).rglob("*.py"))) - - # First run to populate cache - start_time = time.time() - index1 = framework.create_complete_index() - first_run_time = time.time() - start_time - - # Second run with cache - start_time = time.time() - index2 = framework.create_complete_index() - second_run_time = time.time() - start_time - - # Get cache statistics - cache_stats = framework.get_cache_statistics() - hit_rate = float(cache_stats.get('hit_rate', '0%').rstrip('%')) / 100.0 - - symbols_count = sum(len(doc.symbols) for doc in index2.documents) - - result = BenchmarkResult( - test_name=test_name, - file_count=file_count, - total_time=second_run_time, - memory_usage_mb=0, # Not measured in this test - symbols_generated=symbols_count, - occurrences_generated=0, - cache_hit_rate=hit_rate, - throughput_files_per_sec=file_count / second_run_time if second_run_time > 0 else 0, - throughput_symbols_per_sec=symbols_count / second_run_time if second_run_time > 0 else 0, - error_count=0, - additional_metrics={ - 'first_run_time': first_run_time, - 'second_run_time': second_run_time, - 'cache_speedup': first_run_time / second_run_time if second_run_time > 0 else 0, - 'cache_entries': cache_stats.get('memory_entries', 0) - } - ) - - speedup = first_run_time / second_run_time if second_run_time > 0 else 0 - print(f" ✓ Cache hit rate: {hit_rate:.1%}, speedup: {speedup:.1f}x") - - return result - - def benchmark_streaming_performance(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult: - """Benchmark streaming indexer performance.""" - print(f" 🌊 Streaming performance benchmark...") - - config = SCIPConfig( - project_root=project_path, - cache_enabled=True, - **config_overrides - ) - - framework = SCIPFrameworkAPI(config) - python_files = list(Path(project_path).rglob("*.py")) - file_paths = [str(f) for f in python_files] - - # Create streaming indexer - python_factory = PythonSCIPIndexFactory(project_path) - cache_manager = SCIPCacheManager() - streaming_indexer = StreamingIndexer( - factory=python_factory, - cache_manager=cache_manager, - max_workers=config_overrides.get('max_workers', 4), - chunk_size=config_overrides.get('batch_size', 50) // 2 - ) - - # Track progress - progress_updates = [] - def track_progress(progress): - progress_updates.append({ - 'percentage': progress.progress_percentage, - 'elapsed': progress.elapsed_time - }) - - streaming_indexer.add_progress_callback(track_progress) - - # Run streaming benchmark - start_time = time.time() - - documents = [] - for doc in streaming_indexer.index_files_streaming(file_paths): - documents.append(doc) - - total_time = time.time() - start_time - - symbols_count = sum(len(doc.symbols) for doc in documents) - occurrences_count = sum(len(doc.occurrences) for doc in documents) - - result = BenchmarkResult( - test_name=test_name, - file_count=len(file_paths), - total_time=total_time, - memory_usage_mb=0, - symbols_generated=symbols_count, - occurrences_generated=occurrences_count, - cache_hit_rate=0.0, - throughput_files_per_sec=len(file_paths) / total_time if total_time > 0 else 0, - throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0, - error_count=0, - additional_metrics={ - 'progress_updates': len(progress_updates), - 'avg_chunk_time': total_time / 
max(1, len(progress_updates)), - 'documents_streamed': len(documents) - } - ) - - print(f" ✓ Streamed {len(documents)} documents in {total_time:.2f}s") - - return result - - def benchmark_multi_language(self, project_path: str) -> BenchmarkResult: - """Benchmark multi-language processing.""" - print(f" 🌐 Multi-language performance benchmark...") - - config = SCIPConfig( - project_root=project_path, - max_workers=6, - supported_languages={'python', 'javascript', 'java'} - ) - - framework = SCIPFrameworkAPI(config) - - # Count files by language - python_files = len(list(Path(project_path).rglob("*.py"))) - js_files = len(list(Path(project_path).rglob("*.js"))) - java_files = len(list(Path(project_path).rglob("*.java"))) - total_files = python_files + js_files + java_files - - # Run benchmark - start_time = time.time() - index = framework.create_complete_index() - total_time = time.time() - start_time - - symbols_count = sum(len(doc.symbols) for doc in index.documents) - - result = BenchmarkResult( - test_name="Multi-Language Processing", - file_count=total_files, - total_time=total_time, - memory_usage_mb=0, - symbols_generated=symbols_count, - occurrences_generated=0, - cache_hit_rate=0.0, - throughput_files_per_sec=total_files / total_time if total_time > 0 else 0, - throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0, - error_count=0, - additional_metrics={ - 'python_files': python_files, - 'javascript_files': js_files, - 'java_files': java_files, - 'languages_processed': 3, - 'documents_generated': len(index.documents) - } - ) - - print(f" ✓ {total_files} files ({python_files} Python, {js_files} JS, {java_files} Java)") - print(f" ✓ {symbols_count} symbols in {total_time:.2f}s") - - return result - - def benchmark_memory_usage(self, project_path: str) -> BenchmarkResult: - """Benchmark memory usage under load.""" - print(f" 🧠 Memory usage benchmark...") - - # Configure for memory stress testing - config = SCIPConfig( - project_root=project_path, - max_workers=1, # Single worker to control memory usage - batch_size=10, # Small batches - cache_enabled=True - ) - - framework = SCIPFrameworkAPI(config) - file_count = len(list(Path(project_path).rglob("*.py"))) - - # Monitor memory throughout the process - self.monitor.start_monitoring(interval=0.1) # High frequency monitoring - - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 - - start_time = time.time() - - # Process with memory monitoring - index = framework.create_complete_index() - - total_time = time.time() - start_time - final_memory = process.memory_info().rss / 1024 / 1024 - - # Stop monitoring and analyze - metrics_history = self.monitor.stop_monitoring() - - if metrics_history: - peak_memory = max(m.memory_available_mb for m in metrics_history) - avg_memory = statistics.mean(m.memory_available_mb for m in metrics_history) - else: - peak_memory = final_memory - avg_memory = final_memory - - memory_growth = final_memory - initial_memory - symbols_count = sum(len(doc.symbols) for doc in index.documents) - - result = BenchmarkResult( - test_name="Memory Usage Analysis", - file_count=file_count, - total_time=total_time, - memory_usage_mb=memory_growth, - symbols_generated=symbols_count, - occurrences_generated=0, - cache_hit_rate=0.0, - throughput_files_per_sec=file_count / total_time if total_time > 0 else 0, - throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0, - error_count=0, - additional_metrics={ - 'initial_memory_mb': initial_memory, - 
'final_memory_mb': final_memory, - 'peak_memory_mb': peak_memory, - 'avg_memory_mb': avg_memory, - 'memory_efficiency_mb_per_symbol': memory_growth / symbols_count if symbols_count > 0 else 0, - 'monitoring_samples': len(metrics_history) - } - ) - - print(f" ✓ Memory growth: {memory_growth:.1f} MB (peak: {peak_memory:.1f} MB)") - print(f" ✓ {memory_growth/symbols_count:.3f} MB per symbol") - - return result - - def benchmark_concurrent_processing(self, project_path: str) -> BenchmarkResult: - """Benchmark concurrent processing capabilities.""" - print(f" ⚡ Concurrent processing benchmark...") - - python_files = list(Path(project_path).rglob("*.py")) - file_paths = [str(f) for f in python_files] - - # Test different worker counts - worker_counts = [1, 2, 4, 8] - results = {} - - for workers in worker_counts: - config = SCIPConfig( - project_root=project_path, - max_workers=workers, - batch_size=50 - ) - - framework = SCIPFrameworkAPI(config) - - start_time = time.time() - index = framework.create_complete_index() - elapsed_time = time.time() - start_time - - results[workers] = { - 'time': elapsed_time, - 'symbols': sum(len(doc.symbols) for doc in index.documents) - } - - # Find optimal worker count - best_workers = min(results.keys(), key=lambda w: results[w]['time']) - best_time = results[best_workers]['time'] - sequential_time = results[1]['time'] - - speedup = sequential_time / best_time if best_time > 0 else 0 - efficiency = speedup / best_workers if best_workers > 0 else 0 - - result = BenchmarkResult( - test_name="Concurrent Processing Analysis", - file_count=len(file_paths), - total_time=best_time, - memory_usage_mb=0, - symbols_generated=results[best_workers]['symbols'], - occurrences_generated=0, - cache_hit_rate=0.0, - throughput_files_per_sec=len(file_paths) / best_time if best_time > 0 else 0, - throughput_symbols_per_sec=results[best_workers]['symbols'] / best_time if best_time > 0 else 0, - error_count=0, - additional_metrics={ - 'optimal_workers': best_workers, - 'speedup': speedup, - 'efficiency': efficiency, - 'worker_results': results, - 'parallel_efficiency_percent': efficiency * 100 - } - ) - - print(f" ✓ Optimal workers: {best_workers}, speedup: {speedup:.1f}x") - print(f" ✓ Parallel efficiency: {efficiency:.1%}") - - return result - - def generate_benchmark_report(self) -> Dict[str, Any]: - """Generate comprehensive benchmark report.""" - if not self.results: - return {"error": "No benchmark results available"} - - # Calculate aggregate statistics - total_files = sum(r.file_count for r in self.results) - total_symbols = sum(r.symbols_generated for r in self.results) - total_time = sum(r.total_time for r in self.results) - - # Performance metrics - avg_throughput_files = statistics.mean([r.throughput_files_per_sec for r in self.results if r.throughput_files_per_sec > 0]) - avg_throughput_symbols = statistics.mean([r.throughput_symbols_per_sec for r in self.results if r.throughput_symbols_per_sec > 0]) - - # Memory analysis - memory_results = [r for r in self.results if r.memory_usage_mb > 0] - avg_memory_usage = statistics.mean([r.memory_usage_mb for r in memory_results]) if memory_results else 0 - - # Cache performance - cache_results = [r for r in self.results if r.cache_hit_rate > 0] - avg_cache_hit_rate = statistics.mean([r.cache_hit_rate for r in cache_results]) if cache_results else 0 - - # System information - system_info = { - 'cpu_count': psutil.cpu_count(), - 'cpu_freq_mhz': psutil.cpu_freq().current if psutil.cpu_freq() else 0, - 'memory_total_gb': 
psutil.virtual_memory().total / 1024**3, - 'memory_available_gb': psutil.virtual_memory().available / 1024**3, - 'disk_usage_percent': psutil.disk_usage('/').percent if os.name != 'nt' else psutil.disk_usage('C:\\').percent - } - - # Performance summary - performance_summary = { - 'total_benchmarks': len(self.results), - 'total_files_processed': total_files, - 'total_symbols_generated': total_symbols, - 'total_processing_time': total_time, - 'average_throughput_files_per_sec': avg_throughput_files, - 'average_throughput_symbols_per_sec': avg_throughput_symbols, - 'average_memory_usage_mb': avg_memory_usage, - 'average_cache_hit_rate': avg_cache_hit_rate, - 'failed_benchmarks': len([r for r in self.results if r.error_count > 0]) - } - - # Detailed results - detailed_results = [] - for result in self.results: - detailed_results.append(asdict(result)) - - # Performance recommendations - recommendations = self.generate_performance_recommendations() - - report = { - 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), - 'system_info': system_info, - 'performance_summary': performance_summary, - 'detailed_results': detailed_results, - 'recommendations': recommendations - } - - # Print summary - print("\n" + "="*60) - print("📊 BENCHMARK RESULTS SUMMARY") - print("="*60) - print(f"Total benchmarks: {len(self.results)}") - print(f"Files processed: {total_files:,}") - print(f"Symbols generated: {total_symbols:,}") - print(f"Total time: {total_time:.2f} seconds") - print(f"Average throughput: {avg_throughput_files:.1f} files/sec, {avg_throughput_symbols:.1f} symbols/sec") - print(f"Average memory usage: {avg_memory_usage:.1f} MB") - if avg_cache_hit_rate > 0: - print(f"Average cache hit rate: {avg_cache_hit_rate:.1%}") - print() - - # Print individual results - for result in self.results: - status = "✓" if result.error_count == 0 else "❌" - print(f"{status} {result.test_name}") - print(f" {result.file_count} files → {result.symbols_generated} symbols in {result.total_time:.2f}s") - print(f" {result.throughput_files_per_sec:.1f} files/sec, {result.throughput_symbols_per_sec:.1f} symbols/sec") - if result.cache_hit_rate > 0: - print(f" Cache hit rate: {result.cache_hit_rate:.1%}") - print() - - return report - - def generate_performance_recommendations(self) -> List[str]: - """Generate performance recommendations based on benchmark results.""" - recommendations = [] - - # Analyze results for recommendations - memory_results = [r for r in self.results if r.memory_usage_mb > 0] - if memory_results: - avg_memory = statistics.mean([r.memory_usage_mb for r in memory_results]) - if avg_memory > 500: # More than 500 MB - recommendations.append("Consider reducing batch_size or max_workers to control memory usage") - - # Cache performance - cache_results = [r for r in self.results if r.cache_hit_rate > 0] - if cache_results: - avg_cache_rate = statistics.mean([r.cache_hit_rate for r in cache_results]) - if avg_cache_rate < 0.7: # Less than 70% hit rate - recommendations.append("Cache performance is suboptimal. 
Consider increasing cache size or optimizing file change detection") - - # Throughput analysis - throughput_results = [r.throughput_files_per_sec for r in self.results if r.throughput_files_per_sec > 0] - if throughput_results: - avg_throughput = statistics.mean(throughput_results) - if avg_throughput < 10: # Less than 10 files per second - recommendations.append("Consider increasing max_workers or batch_size to improve throughput") - - # Concurrent processing - concurrent_results = [r for r in self.results if 'speedup' in r.additional_metrics] - if concurrent_results: - for result in concurrent_results: - efficiency = result.additional_metrics.get('efficiency', 0) - if efficiency < 0.5: # Less than 50% efficiency - recommendations.append("Parallel processing efficiency is low. Consider reducing worker count or optimizing workload distribution") - - # General recommendations - recommendations.extend([ - "Enable caching for repeated operations to improve performance", - "Use SSD storage for cache directory to reduce I/O latency", - "Monitor memory usage during large project processing", - "Consider streaming processing for very large codebases", - "Validate SCIP compliance only when necessary for better performance" - ]) - - return recommendations - - -def run_benchmark_suite(): - """Main function to run the complete benchmark suite.""" - benchmark = SCIPFrameworkBenchmark() - - try: - report = benchmark.run_all_benchmarks() - - # Save report to file - import json - report_path = "scip_framework_benchmark_report.json" - with open(report_path, 'w', encoding='utf-8') as f: - json.dump(report, f, indent=2, ensure_ascii=False) - - print(f"📄 Detailed benchmark report saved to: {report_path}") - - # Print recommendations - print("\n🎯 PERFORMANCE RECOMMENDATIONS:") - for i, rec in enumerate(report['recommendations'], 1): - print(f"{i}. {rec}") - - return report - - except Exception as e: - print(f"❌ Benchmark suite failed: {e}") - import traceback - traceback.print_exc() - return None - - -if __name__ == "__main__": - run_benchmark_suite() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 548c91d..428e2d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "code-index-mcp" -version = "2.2.0" +version = "2.4.1" description = "Code indexing and analysis tools for LLMs using MCP" readme = "README.md" requires-python = ">=3.10" diff --git a/src/code_index_mcp/__init__.py b/src/code_index_mcp/__init__.py index 3ac3936..f47ee02 100644 --- a/src/code_index_mcp/__init__.py +++ b/src/code_index_mcp/__init__.py @@ -3,4 +3,5 @@ A Model Context Protocol server for code indexing, searching, and analysis. 
""" -__version__ = "2.2.0" +__version__ = "2.4.1" + diff --git a/src/code_index_mcp/constants.py b/src/code_index_mcp/constants.py index d1d4235..159e31a 100644 --- a/src/code_index_mcp/constants.py +++ b/src/code_index_mcp/constants.py @@ -5,7 +5,8 @@ # Directory and file names SETTINGS_DIR = "code_indexer" CONFIG_FILE = "config.json" -INDEX_FILE = "index.json" # JSON index file +INDEX_FILE = "index.json" # JSON index file (deep index) +INDEX_FILE_SHALLOW = "index.shallow.json" # Minimal shallow index (file list) # Supported file extensions for code analysis # This is the authoritative list used by both old and new indexing systems diff --git a/src/code_index_mcp/indexing/__init__.py b/src/code_index_mcp/indexing/__init__.py index 512ad3f..e779911 100644 --- a/src/code_index_mcp/indexing/__init__.py +++ b/src/code_index_mcp/indexing/__init__.py @@ -13,6 +13,8 @@ # New JSON-based indexing system from .json_index_builder import JSONIndexBuilder, IndexMetadata from .json_index_manager import JSONIndexManager, get_index_manager +from .shallow_index_manager import ShallowIndexManager, get_shallow_index_manager +from .deep_index_manager import DeepIndexManager from .models import SymbolInfo, FileInfo __all__ = [ @@ -21,6 +23,9 @@ 'JSONIndexBuilder', 'JSONIndexManager', 'get_index_manager', + 'ShallowIndexManager', + 'get_shallow_index_manager', + 'DeepIndexManager', 'SymbolInfo', 'FileInfo', 'IndexMetadata' diff --git a/src/code_index_mcp/indexing/deep_index_manager.py b/src/code_index_mcp/indexing/deep_index_manager.py new file mode 100644 index 0000000..6558703 --- /dev/null +++ b/src/code_index_mcp/indexing/deep_index_manager.py @@ -0,0 +1,46 @@ +""" +Deep Index Manager - Wrapper around JSONIndexManager for deep indexing. + +This class provides a clear semantic separation from the shallow manager. +It delegates to the existing JSONIndexManager (symbols + files JSON index). 
+""" + +from __future__ import annotations + +from typing import Optional, Dict, Any, List + +from .json_index_manager import JSONIndexManager + + +class DeepIndexManager: + """Thin wrapper over JSONIndexManager to expose deep-index API.""" + + def __init__(self) -> None: + self._mgr = JSONIndexManager() + + # Expose a subset of API to keep callers simple + def set_project_path(self, project_path: str) -> bool: + return self._mgr.set_project_path(project_path) + + def build_index(self, force_rebuild: bool = False) -> bool: + return self._mgr.build_index(force_rebuild=force_rebuild) + + def load_index(self) -> bool: + return self._mgr.load_index() + + def refresh_index(self) -> bool: + return self._mgr.refresh_index() + + def find_files(self, pattern: str = "*") -> List[str]: + return self._mgr.find_files(pattern) + + def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: + return self._mgr.get_file_summary(file_path) + + def get_index_stats(self) -> Dict[str, Any]: + return self._mgr.get_index_stats() + + def cleanup(self) -> None: + self._mgr.cleanup() + + diff --git a/src/code_index_mcp/indexing/json_index_builder.py b/src/code_index_mcp/indexing/json_index_builder.py index 0f95c5b..c12d694 100644 --- a/src/code_index_mcp/indexing/json_index_builder.py +++ b/src/code_index_mcp/indexing/json_index_builder.py @@ -8,9 +8,10 @@ import logging import os import time +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed from dataclasses import dataclass, asdict from pathlib import Path -from typing import Dict, List, Optional, Any +from typing import Dict, List, Optional, Any, Tuple from .strategies import StrategyFactory from .models import SymbolInfo, FileInfo @@ -44,18 +45,18 @@ class JSONIndexBuilder: def __init__(self, project_path: str, additional_excludes: Optional[List[str]] = None): from ..utils import FileFilter - + # Input validation if not isinstance(project_path, str): raise ValueError(f"Project path must be a string, got {type(project_path)}") - + project_path = project_path.strip() if not project_path: raise ValueError("Project path cannot be empty") - + if not os.path.isdir(project_path): raise ValueError(f"Project path does not exist: {project_path}") - + self.project_path = project_path self.in_memory_index: Optional[Dict[str, Any]] = None self.strategy_factory = StrategyFactory() @@ -70,14 +71,53 @@ def __init__(self, project_path: str, additional_excludes: Optional[List[str]] = fallback = len(self.strategy_factory.get_fallback_extensions()) logger.info(f"Specialized parsers: {specialized} extensions, Fallback coverage: {fallback} extensions") - def build_index(self) -> Dict[str, Any]: + def _process_file(self, file_path: str, specialized_extensions: set) -> Optional[Tuple[Dict, Dict, str, bool]]: + """ + Process a single file - designed for parallel execution. 
+ + Args: + file_path: Path to the file to process + specialized_extensions: Set of extensions with specialized parsers + + Returns: + Tuple of (symbols, file_info, language, is_specialized) or None on error + """ + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + ext = Path(file_path).suffix.lower() + rel_path = os.path.relpath(file_path, self.project_path).replace('\\', '/') + + # Get appropriate strategy + strategy = self.strategy_factory.get_strategy(ext) + + # Track strategy usage + is_specialized = ext in specialized_extensions + + # Parse file using strategy + symbols, file_info = strategy.parse_file(rel_path, content) + + logger.debug(f"Parsed {rel_path}: {len(symbols)} symbols ({file_info.language})") + + return (symbols, {rel_path: file_info}, file_info.language, is_specialized) + + except Exception as e: + logger.warning(f"Error processing {file_path}: {e}") + return None + + def build_index(self, parallel: bool = True, max_workers: Optional[int] = None) -> Dict[str, Any]: """ - Build the complete index using Strategy pattern. + Build the complete index using Strategy pattern with parallel processing. + + Args: + parallel: Whether to use parallel processing (default: True) + max_workers: Maximum number of worker processes/threads (default: CPU count) Returns: Complete JSON index with metadata, symbols, and file information """ - logger.info("Building JSON index using Strategy pattern...") + logger.info(f"Building JSON index using Strategy pattern (parallel={parallel})...") start_time = time.time() all_symbols = {} @@ -89,38 +129,66 @@ def build_index(self) -> Dict[str, Any]: # Get specialized extensions for tracking specialized_extensions = set(self.strategy_factory.get_specialized_extensions()) - # Traverse project files - for file_path in self._get_supported_files(): - try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - content = f.read() - - ext = Path(file_path).suffix.lower() - - # Convert to relative path first - rel_path = os.path.relpath(file_path, self.project_path).replace('\\', '/') - - # Get appropriate strategy - strategy = self.strategy_factory.get_strategy(ext) - - # Track strategy usage - if ext in specialized_extensions: - specialized_count += 1 - else: - fallback_count += 1 - - # Parse file using strategy with relative path - symbols, file_info = strategy.parse_file(rel_path, content) - - # Add to index - all_symbols.update(symbols) - all_files[rel_path] = file_info - languages.add(file_info.language) - - logger.debug(f"Parsed {rel_path}: {len(symbols)} symbols ({file_info.language})") - - except Exception as e: - logger.warning(f"Error processing {file_path}: {e}") + # Get list of files to process + files_to_process = self._get_supported_files() + total_files = len(files_to_process) + + if total_files == 0: + logger.warning("No files to process") + return self._create_empty_index() + + logger.info(f"Processing {total_files} files...") + + if parallel and total_files > 1: + # Use ThreadPoolExecutor for I/O-bound file reading + # ProcessPoolExecutor has issues with strategy sharing + if max_workers is None: + max_workers = min(os.cpu_count() or 4, total_files) + + logger.info(f"Using parallel processing with {max_workers} workers") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_file = { + executor.submit(self._process_file, file_path, specialized_extensions): file_path + for file_path in files_to_process + } + + # Process completed tasks 
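+                # NOTE: _process_file returns None for files that fail to read or parse; such files are skipped below and counted in neither specialized_count nor fallback_count.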
+ processed = 0 + for future in as_completed(future_to_file): + file_path = future_to_file[future] + result = future.result() + + if result: + symbols, file_info_dict, language, is_specialized = result + all_symbols.update(symbols) + all_files.update(file_info_dict) + languages.add(language) + + if is_specialized: + specialized_count += 1 + else: + fallback_count += 1 + + processed += 1 + if processed % 100 == 0: + logger.debug(f"Processed {processed}/{total_files} files") + else: + # Sequential processing + logger.info("Using sequential processing") + for file_path in files_to_process: + result = self._process_file(file_path, specialized_extensions) + if result: + symbols, file_info_dict, language, is_specialized = result + all_symbols.update(symbols) + all_files.update(file_info_dict) + languages.add(language) + + if is_specialized: + specialized_count += 1 + else: + fallback_count += 1 # Build index metadata metadata = IndexMetadata( @@ -151,6 +219,25 @@ def build_index(self) -> Dict[str, Any]: return index + def _create_empty_index(self) -> Dict[str, Any]: + """Create an empty index structure.""" + metadata = IndexMetadata( + project_path=self.project_path, + indexed_files=0, + index_version="2.0.0-strategy", + timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + languages=[], + total_symbols=0, + specialized_parsers=0, + fallback_files=0 + ) + + return { + "metadata": asdict(metadata), + "symbols": {}, + "files": {} + } + def get_index(self) -> Optional[Dict[str, Any]]: """Get the current in-memory index.""" return self.in_memory_index @@ -187,6 +274,31 @@ def _get_supported_files(self) -> List[str]: logger.debug(f"Found {len(supported_files)} supported files") return supported_files + def build_shallow_file_list(self) -> List[str]: + """ + Build a minimal shallow index consisting of relative file paths only. + + This method does not read file contents. It enumerates supported files + using centralized filtering and returns normalized relative paths with + forward slashes for cross-platform consistency. + + Returns: + List of relative file paths (using '/'). + """ + try: + absolute_files = self._get_supported_files() + result: List[str] = [] + for abs_path in absolute_files: + rel_path = os.path.relpath(abs_path, self.project_path).replace('\\', '/') + # Normalize leading './' + if rel_path.startswith('./'): + rel_path = rel_path[2:] + result.append(rel_path) + return result + except Exception as e: + logger.error(f"Failed to build shallow file list: {e}") + return [] + def save_index(self, index: Dict[str, Any], index_path: str) -> bool: """ Save index to disk. 
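For orientation, a minimal usage sketch of the builder API added in the hunks above (the project path and worker count are placeholders, and the prints are illustrative only):

```python
from code_index_mcp.indexing import JSONIndexBuilder

# Placeholder path; point this at a real checkout.
builder = JSONIndexBuilder("/path/to/project")

# Deep index: reads and parses every supported file, in parallel by default.
index = builder.build_index(parallel=True, max_workers=4)
print(index["metadata"]["total_symbols"])

# Shallow index: relative file paths only; file contents are never read.
file_list = builder.build_shallow_file_list()
print(len(file_list), file_list[:3])
```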
@@ -284,16 +396,16 @@ def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]: # Work directly with global symbols for this file global_symbols = self.in_memory_index.get("symbols", {}) result = [] - + # Find all symbols for this file directly from global symbols for symbol_id, symbol_data in global_symbols.items(): symbol_file = symbol_data.get("file", "").replace("\\", "/") - + # Check if this symbol belongs to our file if symbol_file == file_path: symbol_type = symbol_data.get("type", "unknown") symbol_name = symbol_id.split("::")[-1] # Extract symbol name from ID - + # Create symbol info symbol_info = { "name": symbol_name, @@ -301,7 +413,7 @@ def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]: "line": symbol_data.get("line"), "signature": symbol_data.get("signature") } - + # Categorize by type if symbol_type in ["function", "method"]: result.append(symbol_info) @@ -310,7 +422,7 @@ def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]: # Sort by line number for consistent ordering result.sort(key=lambda x: x.get("line", 0)) - + return result except Exception as e: diff --git a/src/code_index_mcp/indexing/json_index_manager.py b/src/code_index_mcp/indexing/json_index_manager.py index d4564f3..ec320e4 100644 --- a/src/code_index_mcp/indexing/json_index_manager.py +++ b/src/code_index_mcp/indexing/json_index_manager.py @@ -9,28 +9,32 @@ import json import logging import os +import re import tempfile import threading +import fnmatch from pathlib import Path from typing import Dict, List, Optional, Any from .json_index_builder import JSONIndexBuilder -from ..constants import SETTINGS_DIR, INDEX_FILE +from ..constants import SETTINGS_DIR, INDEX_FILE, INDEX_FILE_SHALLOW logger = logging.getLogger(__name__) class JSONIndexManager: """Manages JSON-based code index lifecycle and storage.""" - + def __init__(self): self.project_path: Optional[str] = None self.index_builder: Optional[JSONIndexBuilder] = None self.temp_dir: Optional[str] = None self.index_path: Optional[str] = None + self.shallow_index_path: Optional[str] = None + self._shallow_file_list: Optional[List[str]] = None self._lock = threading.RLock() logger.info("Initialized JSON Index Manager") - + def set_project_path(self, project_path: str) -> bool: """Set the project path and initialize index storage.""" with self._lock: @@ -39,67 +43,68 @@ def set_project_path(self, project_path: str) -> bool: if not project_path or not isinstance(project_path, str): logger.error(f"Invalid project path: {project_path}") return False - + project_path = project_path.strip() if not project_path: logger.error("Project path cannot be empty") return False - + if not os.path.isdir(project_path): logger.error(f"Project path does not exist: {project_path}") return False - + self.project_path = project_path self.index_builder = JSONIndexBuilder(project_path) - + # Create temp directory for index storage project_hash = hashlib.md5(project_path.encode()).hexdigest()[:12] self.temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash) os.makedirs(self.temp_dir, exist_ok=True) - + self.index_path = os.path.join(self.temp_dir, INDEX_FILE) - + self.shallow_index_path = os.path.join(self.temp_dir, INDEX_FILE_SHALLOW) + logger.info(f"Set project path: {project_path}") logger.info(f"Index storage: {self.index_path}") return True - + except Exception as e: logger.error(f"Failed to set project path: {e}") return False - + def build_index(self, force_rebuild: bool = False) -> bool: """Build or rebuild the 
index.""" with self._lock: if not self.index_builder or not self.project_path: logger.error("Index builder not initialized") return False - + try: # Check if we need to rebuild if not force_rebuild and self._is_index_fresh(): logger.info("Index is fresh, skipping rebuild") return True - + logger.info("Building JSON index...") index = self.index_builder.build_index() - + # Save to disk self.index_builder.save_index(index, self.index_path) - + logger.info(f"Successfully built index with {len(index['symbols'])} symbols") return True - + except Exception as e: logger.error(f"Failed to build index: {e}") return False - + def load_index(self) -> bool: """Load existing index from disk.""" with self._lock: if not self.index_builder or not self.index_path: logger.error("Index manager not initialized") return False - + try: index = self.index_builder.load_index(self.index_path) if index: @@ -108,11 +113,57 @@ def load_index(self) -> bool: else: logger.warning("No existing index found") return False - + except Exception as e: logger.error(f"Failed to load index: {e}") return False - + + def build_shallow_index(self) -> bool: + """Build and save the minimal shallow index (file list).""" + with self._lock: + if not self.index_builder or not self.project_path or not self.shallow_index_path: + logger.error("Index builder not initialized for shallow index") + return False + + try: + file_list = self.index_builder.build_shallow_file_list() + # Persist as a JSON array for minimal overhead + with open(self.shallow_index_path, 'w', encoding='utf-8') as f: + json.dump(file_list, f, ensure_ascii=False) + self._shallow_file_list = file_list + logger.info(f"Saved shallow index with {len(file_list)} files to {self.shallow_index_path}") + return True + except Exception as e: + logger.error(f"Failed to build shallow index: {e}") + return False + + def load_shallow_index(self) -> bool: + """Load shallow index (file list) from disk into memory.""" + with self._lock: + try: + if not self.shallow_index_path or not os.path.exists(self.shallow_index_path): + logger.warning("No existing shallow index found") + return False + with open(self.shallow_index_path, 'r', encoding='utf-8') as f: + data = json.load(f) + if not isinstance(data, list): + logger.error("Shallow index format invalid (expected list)") + return False + # Normalize paths + normalized = [] + for p in data: + if isinstance(p, str): + q = p.replace('\\\\', '/').replace('\\', '/') + if q.startswith('./'): + q = q[2:] + normalized.append(q) + self._shallow_file_list = normalized + logger.info(f"Loaded shallow index with {len(normalized)} files") + return True + except Exception as e: + logger.error(f"Failed to load shallow index: {e}") + return False + def refresh_index(self) -> bool: """Refresh the index (rebuild and reload).""" with self._lock: @@ -120,49 +171,64 @@ def refresh_index(self) -> bool: if self.build_index(force_rebuild=True): return self.load_index() return False - + def find_files(self, pattern: str = "*") -> List[str]: - """Find files matching a pattern.""" + """ + Find files matching a glob pattern using the SHALLOW file list only. 
+ + Notes: + - '*' does not cross '/' + - '**' matches across directories + - Always sources from the shallow index for consistency and speed + """ with self._lock: # Input validation if not isinstance(pattern, str): logger.error(f"Pattern must be a string, got {type(pattern)}") return [] - + pattern = pattern.strip() if not pattern: pattern = "*" - - if not self.index_builder or not self.index_builder.in_memory_index: - logger.warning("Index not loaded") - return [] - + + # Normalize to forward slashes + norm_pattern = pattern.replace('\\\\', '/').replace('\\', '/') + + # Build glob regex: '*' does not cross '/', '**' crosses directories + regex = self._compile_glob_regex(norm_pattern) + + # Always use shallow index for file discovery try: - files = list(self.index_builder.in_memory_index["files"].keys()) - - if pattern == "*": + if self._shallow_file_list is None: + # Try load existing shallow index; if missing, build then load + if not self.load_shallow_index(): + # If still not available, attempt to build + if self.build_shallow_index(): + self.load_shallow_index() + + files = list(self._shallow_file_list or []) + + if norm_pattern == "*": return files - - # Simple pattern matching - import fnmatch - return [f for f in files if fnmatch.fnmatch(f, pattern)] - + + return [f for f in files if regex.match(f) is not None] + except Exception as e: logger.error(f"Error finding files: {e}") return [] - + def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: """ Get summary information for a file. - + This method attempts to retrieve comprehensive file information including symbol counts, functions, classes, methods, and imports. If the index is not loaded, it will attempt auto-initialization to restore from the most recent index state. 
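+        The summary categorizes symbols into functions, classes, and methods based on their stored signatures.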
- + Args: file_path: Relative path to the file - + Returns: Dictionary containing file summary information, or None if not found """ @@ -171,38 +237,38 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: if not isinstance(file_path, str): logger.error(f"File path must be a string, got {type(file_path)}") return None - + file_path = file_path.strip() if not file_path: logger.error("File path cannot be empty") return None - + # Try to load cached index if not ready if not self.index_builder or not self.index_builder.in_memory_index: if not self._try_load_cached_index(): logger.warning("Index not loaded and no cached index available") return None - + try: # Normalize file path file_path = file_path.replace('\\', '/') if file_path.startswith('./'): file_path = file_path[2:] - + # Get file info file_info = self.index_builder.in_memory_index["files"].get(file_path) if not file_info: logger.warning(f"File not found in index: {file_path}") return None - + # Get symbols in file symbols = self.index_builder.get_file_symbols(file_path) - + # Categorize symbols by signature functions = [] classes = [] methods = [] - + for s in symbols: signature = s.get("signature", "") if signature: @@ -210,7 +276,7 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: # Method: contains class context methods.append(s) elif signature.startswith("def "): - # Function: starts with def but no class context + # Function: starts with def but no class context functions.append(s) elif signature.startswith("class ") or signature is None: # Class: starts with class or has no signature @@ -227,7 +293,7 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: else: # Default to function functions.append(s) - + return { "file_path": file_path, "language": file_info["language"], @@ -239,63 +305,26 @@ def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: "imports": file_info.get("imports", []), "exports": file_info.get("exports", []) } - + except Exception as e: logger.error(f"Error getting file summary: {e}") return None - - def search_symbols(self, query: str, symbol_type: Optional[str] = None) -> List[Dict[str, Any]]: - """Search for symbols by name.""" - with self._lock: - if not self.index_builder or not self.index_builder.in_memory_index: - logger.warning("Index not loaded") - return [] - - try: - results = [] - query_lower = query.lower() - - for symbol_id, symbol_data in self.index_builder.in_memory_index["symbols"].items(): - # Filter by type if specified - if symbol_type and symbol_data.get("type") != symbol_type: - continue - - # Check if query matches symbol name - if query_lower in symbol_id.lower(): - results.append({ - "id": symbol_id, - **symbol_data - }) - - return results[:50] # Limit results - - except Exception as e: - logger.error(f"Error searching symbols: {e}") - return [] - - def get_symbol_callers(self, symbol_name: str) -> List[str]: - """Get all symbols that call the given symbol.""" - with self._lock: - if not self.index_builder: - return [] - - return self.index_builder.get_callers(symbol_name) - + def get_index_stats(self) -> Dict[str, Any]: """Get statistics about the current index.""" with self._lock: if not self.index_builder or not self.index_builder.in_memory_index: return {"status": "not_loaded"} - + try: index = self.index_builder.in_memory_index metadata = index["metadata"] - + symbol_counts = {} for symbol_data in index["symbols"].values(): symbol_type = symbol_data.get("type", "unknown") 
symbol_counts[symbol_type] = symbol_counts.get(symbol_type, 0) + 1 - + return { "status": "loaded", "project_path": metadata["project_path"], @@ -306,51 +335,51 @@ def get_index_stats(self) -> Dict[str, Any]: "index_version": metadata["index_version"], "timestamp": metadata["timestamp"] } - + except Exception as e: logger.error(f"Error getting index stats: {e}") return {"status": "error", "error": str(e)} - + def _is_index_fresh(self) -> bool: """Check if the current index is fresh.""" if not self.index_path or not os.path.exists(self.index_path): return False - + try: - from ..utils import FileFilter - file_filter = FileFilter() - + from code_index_mcp.utils.file_filter import FileFilter as _FileFilter # pylint: disable=C0415 + file_filter = _FileFilter() + # Simple freshness check - index exists and is recent index_mtime = os.path.getmtime(self.index_path) base_path = Path(self.project_path) - + # Check if any source files are newer than index for root, dirs, files in os.walk(self.project_path): # Filter directories using centralized logic dirs[:] = [d for d in dirs if not file_filter.should_exclude_directory(d)] - + for file in files: file_path = Path(root) / file if file_filter.should_process_path(file_path, base_path): if os.path.getmtime(str(file_path)) > index_mtime: return False - + return True - + except Exception as e: logger.warning(f"Error checking index freshness: {e}") return False - + def _try_load_cached_index(self, expected_project_path: Optional[str] = None) -> bool: """ Try to load a cached index file if available. - + This is a simplified version of auto-initialization that only loads a cached index if we can verify it matches the expected project. - + Args: expected_project_path: Optional path to verify against cached index - + Returns: True if cached index was loaded successfully, False otherwise. """ @@ -358,28 +387,28 @@ def _try_load_cached_index(self, expected_project_path: Optional[str] = None) -> # First try to load from current index_path if set if self.index_path and os.path.exists(self.index_path): return self.load_index() - + # If expected project path provided, try to find its cache if expected_project_path: project_hash = hashlib.md5(expected_project_path.encode()).hexdigest()[:12] temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash) index_path = os.path.join(temp_dir, INDEX_FILE) - + if os.path.exists(index_path): # Verify the cached index matches the expected project with open(index_path, 'r', encoding='utf-8') as f: index_data = json.load(f) cached_project = index_data.get('metadata', {}).get('project_path') - + if cached_project == expected_project_path: self.temp_dir = temp_dir self.index_path = index_path return self.load_index() else: logger.warning(f"Cached index project mismatch: {cached_project} != {expected_project_path}") - + return False - + except Exception as e: logger.debug(f"Failed to load cached index: {e}") return False @@ -393,6 +422,39 @@ def cleanup(self): self.index_path = None logger.info("Cleaned up JSON Index Manager") + @staticmethod + def _compile_glob_regex(pattern: str) -> re.Pattern: + """ + Compile a glob pattern where '*' does not match '/', and '**' matches across directories. 
+ + Examples: + src/*.py -> direct children .py under src + **/*.py -> .py at any depth + """ + # Translate glob to regex + i = 0 + out = [] + special = ".^$+{}[]|()" + while i < len(pattern): + c = pattern[i] + if c == '*': + if i + 1 < len(pattern) and pattern[i + 1] == '*': + # '**' -> match across directories + out.append('.*') + i += 2 + continue + else: + out.append('[^/]*') + elif c == '?': + out.append('[^/]') + elif c in special: + out.append('\\' + c) + else: + out.append(c) + i += 1 + regex_str = '^' + ''.join(out) + '$' + return re.compile(regex_str) + # Global instance _index_manager = JSONIndexManager() @@ -400,4 +462,4 @@ def cleanup(self): def get_index_manager() -> JSONIndexManager: """Get the global index manager instance.""" - return _index_manager \ No newline at end of file + return _index_manager diff --git a/src/code_index_mcp/indexing/shallow_index_manager.py b/src/code_index_mcp/indexing/shallow_index_manager.py new file mode 100644 index 0000000..530c593 --- /dev/null +++ b/src/code_index_mcp/indexing/shallow_index_manager.py @@ -0,0 +1,155 @@ +""" +Shallow Index Manager - Manages a minimal file-list-only index. + +This manager builds and loads a shallow index consisting of relative file +paths only. It is optimized for fast initialization and filename-based +search/browsing. Content parsing and symbol extraction are not performed. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +import tempfile +import threading +from typing import List, Optional +import re + +from .json_index_builder import JSONIndexBuilder +from ..constants import SETTINGS_DIR, INDEX_FILE_SHALLOW + +logger = logging.getLogger(__name__) + + +class ShallowIndexManager: + """Manage shallow (file-list) index lifecycle and storage.""" + + def __init__(self) -> None: + self.project_path: Optional[str] = None + self.index_builder: Optional[JSONIndexBuilder] = None + self.temp_dir: Optional[str] = None + self.index_path: Optional[str] = None + self._file_list: Optional[List[str]] = None + self._lock = threading.RLock() + + def set_project_path(self, project_path: str) -> bool: + with self._lock: + try: + if not isinstance(project_path, str) or not project_path.strip(): + logger.error("Invalid project path for shallow index") + return False + project_path = project_path.strip() + if not os.path.isdir(project_path): + logger.error(f"Project path does not exist: {project_path}") + return False + + self.project_path = project_path + self.index_builder = JSONIndexBuilder(project_path) + + project_hash = hashlib.md5(project_path.encode()).hexdigest()[:12] + self.temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash) + os.makedirs(self.temp_dir, exist_ok=True) + self.index_path = os.path.join(self.temp_dir, INDEX_FILE_SHALLOW) + return True + except Exception as e: # noqa: BLE001 - centralized logging + logger.error(f"Failed to set project path (shallow): {e}") + return False + + def build_index(self) -> bool: + """Build and persist the shallow file list index.""" + with self._lock: + if not self.index_builder or not self.index_path: + logger.error("ShallowIndexManager not initialized") + return False + try: + file_list = self.index_builder.build_shallow_file_list() + with open(self.index_path, 'w', encoding='utf-8') as f: + json.dump(file_list, f, ensure_ascii=False) + self._file_list = file_list + logger.info(f"Built shallow index with {len(file_list)} files") + return True + except Exception as e: # noqa: BLE001 + 
logger.error(f"Failed to build shallow index: {e}") + return False + + def load_index(self) -> bool: + """Load shallow index from disk to memory.""" + with self._lock: + try: + if not self.index_path or not os.path.exists(self.index_path): + return False + with open(self.index_path, 'r', encoding='utf-8') as f: + data = json.load(f) + if isinstance(data, list): + # Normalize slashes/prefix + normalized: List[str] = [] + for p in data: + if isinstance(p, str): + q = p.replace('\\\\', '/').replace('\\', '/') + if q.startswith('./'): + q = q[2:] + normalized.append(q) + self._file_list = normalized + return True + return False + except Exception as e: # noqa: BLE001 + logger.error(f"Failed to load shallow index: {e}") + return False + + def get_file_list(self) -> List[str]: + with self._lock: + return list(self._file_list or []) + + def find_files(self, pattern: str = "*") -> List[str]: + with self._lock: + if not isinstance(pattern, str): + return [] + norm = (pattern.strip() or "*").replace('\\\\','/').replace('\\','/') + regex = self._compile_glob_regex(norm) + files = self._file_list or [] + if norm == "*": + return list(files) + return [f for f in files if regex.match(f) is not None] + + @staticmethod + def _compile_glob_regex(pattern: str) -> re.Pattern: + i = 0 + out = [] + special = ".^$+{}[]|()" + while i < len(pattern): + c = pattern[i] + if c == '*': + if i + 1 < len(pattern) and pattern[i + 1] == '*': + out.append('.*') + i += 2 + continue + else: + out.append('[^/]*') + elif c == '?': + out.append('[^/]') + elif c in special: + out.append('\\' + c) + else: + out.append(c) + i += 1 + return re.compile('^' + ''.join(out) + '$') + + def cleanup(self) -> None: + with self._lock: + self.project_path = None + self.index_builder = None + self.temp_dir = None + self.index_path = None + self._file_list = None + + +# Global singleton +_shallow_manager = ShallowIndexManager() + + +def get_shallow_index_manager() -> ShallowIndexManager: + return _shallow_manager + + diff --git a/src/code_index_mcp/indexing/strategies/java_strategy.py b/src/code_index_mcp/indexing/strategies/java_strategy.py index b1c9845..af2ff8e 100644 --- a/src/code_index_mcp/indexing/strategies/java_strategy.py +++ b/src/code_index_mcp/indexing/strategies/java_strategy.py @@ -1,10 +1,9 @@ """ -Java parsing strategy using tree-sitter. +Java parsing strategy using tree-sitter - Optimized single-pass version. 
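(For orientation, a hedged usage sketch of the shallow manager above; the project path is hypothetical, and the call sequence simply mirrors the methods defined in this new file.)

```python
from code_index_mcp.indexing.shallow_index_manager import get_shallow_index_manager

mgr = get_shallow_index_manager()
if mgr.set_project_path("/home/user/projects/demo"):  # hypothetical path
    mgr.build_index()                     # persists the JSON file list to the temp settings dir
    py_files = mgr.find_files("**/*.py")  # glob filtering over the in-memory list
    total = len(mgr.get_file_list())      # normalized, '/'-separated relative paths
```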
""" import logging -import re -from typing import Dict, List, Tuple, Optional +from typing import Dict, List, Tuple, Optional, Set from .base_strategy import ParsingStrategy from ..models import SymbolInfo, FileInfo @@ -15,7 +14,7 @@ class JavaParsingStrategy(ParsingStrategy): - """Java-specific parsing strategy.""" + """Java-specific parsing strategy - Single Pass Optimized.""" def __init__(self): self.java_language = tree_sitter.Language(language()) @@ -27,31 +26,40 @@ def get_supported_extensions(self) -> List[str]: return ['.java'] def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse Java file using tree-sitter.""" - return self._tree_sitter_parse(file_path, content) - - def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse using tree-sitter.""" + """Parse Java file using tree-sitter with single-pass optimization.""" symbols = {} functions = [] classes = [] imports = [] package = None + + # Symbol lookup index for O(1) access + symbol_lookup = {} # name -> symbol_id mapping parser = tree_sitter.Parser(self.java_language) try: tree = parser.parse(content.encode('utf8')) - # Phase 1: Extract symbol definitions - self._traverse_java_node(tree.root_node, content, file_path, symbols, functions, classes, imports) - # Phase 2: Analyze method calls and build relationships - self._analyze_java_calls(tree, content, symbols, file_path) - - # Extract package info + + # Extract package info first for node in tree.root_node.children: if node.type == 'package_declaration': package = self._extract_java_package(node, content) break + + # Single-pass traversal that handles everything + context = TraversalContext( + content=content, + file_path=file_path, + symbols=symbols, + functions=functions, + classes=classes, + imports=imports, + symbol_lookup=symbol_lookup + ) + + self._traverse_node_single_pass(tree.root_node, context) + except Exception as e: logger.warning(f"Error parsing Java file {file_path}: {e}") @@ -65,36 +73,90 @@ def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, Sy return symbols, file_info - - def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], - functions: List[str], classes: List[str], imports: List[str]): - """Traverse Java AST node.""" + def _traverse_node_single_pass(self, node, context: 'TraversalContext', + current_class: Optional[str] = None, + current_method: Optional[str] = None): + """Single-pass traversal that extracts symbols and analyzes calls.""" + + # Handle class declarations if node.type == 'class_declaration': - name = self._get_java_class_name(node, content) + name = self._get_java_class_name(node, context.content) if name: - symbol_id = self._create_symbol_id(file_path, name) - symbols[symbol_id] = SymbolInfo( + symbol_id = self._create_symbol_id(context.file_path, name) + symbol_info = SymbolInfo( type="class", - file=file_path, + file=context.file_path, line=node.start_point[0] + 1 ) - classes.append(name) - + context.symbols[symbol_id] = symbol_info + context.symbol_lookup[name] = symbol_id + context.classes.append(name) + + # Traverse class body with updated context + for child in node.children: + self._traverse_node_single_pass(child, context, current_class=name, current_method=current_method) + return + + # Handle method declarations elif node.type == 'method_declaration': - name = self._get_java_method_name(node, content) + name = self._get_java_method_name(node, 
context.content) if name: - symbol_id = self._create_symbol_id(file_path, name) - symbols[symbol_id] = SymbolInfo( + # Build full method name with class context + if current_class: + full_name = f"{current_class}.{name}" + else: + full_name = name + + symbol_id = self._create_symbol_id(context.file_path, full_name) + symbol_info = SymbolInfo( type="method", - file=file_path, + file=context.file_path, line=node.start_point[0] + 1, - signature=self._get_java_method_signature(node, content) + signature=self._get_java_method_signature(node, context.content) ) - functions.append(name) - - # Continue traversing children + context.symbols[symbol_id] = symbol_info + context.symbol_lookup[full_name] = symbol_id + context.symbol_lookup[name] = symbol_id # Also index by method name alone + context.functions.append(full_name) + + # Traverse method body with updated context + for child in node.children: + self._traverse_node_single_pass(child, context, current_class=current_class, + current_method=symbol_id) + return + + # Handle method invocations (calls) + elif node.type == 'method_invocation': + if current_method: + called_method = self._get_called_method_name(node, context.content) + if called_method: + # Use O(1) lookup instead of O(n) iteration + if called_method in context.symbol_lookup: + symbol_id = context.symbol_lookup[called_method] + symbol_info = context.symbols[symbol_id] + if current_method not in symbol_info.called_by: + symbol_info.called_by.append(current_method) + else: + # Try to find method with class prefix + for name, sid in context.symbol_lookup.items(): + if name.endswith(f".{called_method}"): + symbol_info = context.symbols[sid] + if current_method not in symbol_info.called_by: + symbol_info.called_by.append(current_method) + break + + # Handle import declarations + elif node.type == 'import_declaration': + import_text = context.content[node.start_byte:node.end_byte] + # Extract the import path (remove 'import' keyword and semicolon) + import_path = import_text.replace('import', '').replace(';', '').strip() + if import_path: + context.imports.append(import_path) + + # Continue traversing children for other node types for child in node.children: - self._traverse_java_node(child, content, file_path, symbols, functions, classes, imports) + self._traverse_node_single_pass(child, context, current_class=current_class, + current_method=current_method) def _get_java_class_name(self, node, content: str) -> Optional[str]: for child in node.children: @@ -117,34 +179,31 @@ def _extract_java_package(self, node, content: str) -> Optional[str]: return content[child.start_byte:child.end_byte] return None - def _analyze_java_calls(self, tree, content: str, symbols: Dict[str, SymbolInfo], file_path: str): - """Analyze Java method calls for relationships.""" - self._find_java_calls(tree.root_node, content, symbols, file_path) - - def _find_java_calls(self, node, content: str, symbols: Dict[str, SymbolInfo], file_path: str, current_method: str = None): - """Recursively find Java method calls.""" - if node.type == 'method_declaration': - method_name = self._get_java_method_name(node, content) - if method_name: - current_method = self._create_symbol_id(file_path, method_name) - - elif node.type == 'method_invocation': - if current_method: - called_method = self._get_called_method_name(node, content) - if called_method: - # Find the called method in symbols and add relationship - for symbol_id, symbol_info in symbols.items(): - if called_method in symbol_id.split("::")[-1]: - if current_method not 
in symbol_info.called_by: - symbol_info.called_by.append(current_method) - - # Continue traversing children - for child in node.children: - self._find_java_calls(child, content, symbols, file_path, current_method) - def _get_called_method_name(self, node, content: str) -> Optional[str]: """Extract called method name from method invocation node.""" + # tree-sitter-java exposes the invoked name via the 'name' field, which + # covers both obj.method() chains and bare method() calls + name_node = node.child_by_field_name('name') + if name_node is not None: + return content[name_node.start_byte:name_node.end_byte] + # Fallback: first identifier child (direct call without an object) for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None + + +class TraversalContext: + """Context object to pass state during single-pass traversal.""" + + def __init__(self, content: str, file_path: str, symbols: Dict, + functions: List, classes: List, imports: List, symbol_lookup: Dict): + self.content = content + self.file_path = file_path + self.symbols = symbols + self.functions = functions + self.classes = classes + self.imports = imports + self.symbol_lookup = symbol_lookup \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/python_strategy.py b/src/code_index_mcp/indexing/strategies/python_strategy.py index 2cf62cd..a09d00c 100644 --- a/src/code_index_mcp/indexing/strategies/python_strategy.py +++ b/src/code_index_mcp/indexing/strategies/python_strategy.py @@ -1,10 +1,10 @@ """ -Python parsing strategy using AST. +Python parsing strategy using AST - Optimized single-pass version.
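(To make the single-pass Java traversal concrete, a small driver, assuming the strategy is importable as shown and tree-sitter-java is installed. Note the ordering caveat: in a single pass the callee's definition must precede the call site for the edge to be recorded. The expected edge is inferred from the code above, not captured output.)

```python
from code_index_mcp.indexing.strategies.java_strategy import JavaParsingStrategy

SOURCE = """
class Greeter {
    String name() { return "world"; }
    String greet() { return name(); }
}
"""

symbols, _ = JavaParsingStrategy().parse_file("src/Greeter.java", SOURCE)
# Expected: 'src/Greeter.java::Greeter.name' lists
# 'src/Greeter.java::Greeter.greet' in called_by via the symbol_lookup index.
for symbol_id, info in symbols.items():
    print(symbol_id, info.type, info.called_by)
```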
""" import ast import logging -from typing import Dict, List, Tuple, Optional +from typing import Dict, List, Tuple, Optional, Set from .base_strategy import ParsingStrategy from ..models import SymbolInfo, FileInfo @@ -12,7 +12,7 @@ class PythonParsingStrategy(ParsingStrategy): - """Python-specific parsing strategy using Python's built-in AST.""" + """Python-specific parsing strategy using Python's built-in AST - Single Pass Optimized.""" def get_language_name(self) -> str: return "python" @@ -21,7 +21,7 @@ def get_supported_extensions(self) -> List[str]: return ['.py', '.pyw'] def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse Python file using AST.""" + """Parse Python file using AST with single-pass optimization.""" symbols = {} functions = [] classes = [] @@ -29,10 +29,9 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo try: tree = ast.parse(content) - # Phase 1: Extract symbol definitions - self._visit_ast_node(tree, symbols, functions, classes, imports, file_path, content) - # Phase 2: Analyze function calls and build relationships - self._analyze_calls(tree, symbols, file_path) + # Single-pass visitor that handles everything at once + visitor = SinglePassVisitor(symbols, functions, classes, imports, file_path) + visitor.visit(tree) except SyntaxError as e: logger.warning(f"Syntax error in Python file {file_path}: {e}") except Exception as e: @@ -46,158 +45,161 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo ) return symbols, file_info + + +class SinglePassVisitor(ast.NodeVisitor): + """Single-pass AST visitor that extracts symbols and analyzes calls in one traversal.""" - def _visit_ast_node(self, node: ast.AST, symbols: Dict, functions: List, - classes: List, imports: List, file_path: str, content: str): - """Visit AST nodes and extract symbols.""" - # Track processed nodes to avoid duplicates - processed_nodes = set() - - # First pass: handle classes and mark their methods as processed - for child in ast.walk(node): - if isinstance(child, ast.ClassDef): - self._handle_class(child, symbols, classes, file_path, functions) - # Mark all methods in this class as processed - for class_child in child.body: - if isinstance(class_child, ast.FunctionDef): - processed_nodes.add(id(class_child)) - - # Second pass: handle standalone functions and imports - for child in ast.walk(node): - if isinstance(child, ast.FunctionDef) and id(child) not in processed_nodes: - self._handle_function(child, symbols, functions, file_path) - elif isinstance(child, (ast.Import, ast.ImportFrom)): - self._handle_import(child, imports) - - def _handle_function(self, node: ast.FunctionDef, symbols: Dict, functions: List, file_path: str): - """Handle function definition.""" - func_name = node.name - symbol_id = self._create_symbol_id(file_path, func_name) + def __init__(self, symbols: Dict[str, SymbolInfo], functions: List[str], + classes: List[str], imports: List[str], file_path: str): + self.symbols = symbols + self.functions = functions + self.classes = classes + self.imports = imports + self.file_path = file_path - # Extract function signature - signature = self._extract_function_signature(node) + # Context tracking for call analysis + self.current_function_stack = [] + self.current_class = None - # Extract docstring - docstring = ast.get_docstring(node) + # Symbol lookup index for O(1) access + self.symbol_lookup = {} # name -> symbol_id mapping for fast lookups - symbols[symbol_id] = 
SymbolInfo( - type="function", - file=file_path, - line=node.lineno, - signature=signature, - docstring=docstring - ) - functions.append(func_name) + # Track processed nodes to avoid duplicates + self.processed_nodes: Set[int] = set() - def _handle_class(self, node: ast.ClassDef, symbols: Dict, classes: List, file_path: str, functions: List = None): - """Handle class definition.""" + def visit_ClassDef(self, node: ast.ClassDef): + """Visit class definition - extract symbol and analyze in single pass.""" class_name = node.name - symbol_id = self._create_symbol_id(file_path, class_name) + symbol_id = self._create_symbol_id(self.file_path, class_name) # Extract docstring docstring = ast.get_docstring(node) - symbols[symbol_id] = SymbolInfo( + # Create symbol info + symbol_info = SymbolInfo( type="class", - file=file_path, + file=self.file_path, line=node.lineno, docstring=docstring ) - classes.append(class_name) - # Handle methods within the class + # Store in symbols and lookup index + self.symbols[symbol_id] = symbol_info + self.symbol_lookup[class_name] = symbol_id + self.classes.append(class_name) + + # Track class context for method processing + old_class = self.current_class + self.current_class = class_name + + # Process class body (including methods) for child in node.body: if isinstance(child, ast.FunctionDef): - method_name = f"{class_name}.{child.name}" - method_symbol_id = self._create_symbol_id(file_path, method_name) - - method_signature = self._extract_function_signature(child) - method_docstring = ast.get_docstring(child) - - symbols[method_symbol_id] = SymbolInfo( - type="method", - file=file_path, - line=child.lineno, - signature=method_signature, - docstring=method_docstring - ) - - # Add method to functions list if provided - if functions is not None: - functions.append(method_name) - - def _handle_import(self, node, imports: List): - """Handle import statements.""" - if isinstance(node, ast.Import): - for alias in node.names: - imports.append(alias.name) - elif isinstance(node, ast.ImportFrom): - if node.module: - for alias in node.names: - imports.append(f"{node.module}.{alias.name}") + self._handle_method(child, class_name) + else: + # Visit other nodes in class body + self.visit(child) + + # Restore previous class context + self.current_class = old_class - def _extract_function_signature(self, node: ast.FunctionDef) -> str: - """Extract function signature from AST node.""" - # Build basic signature - args = [] + def visit_FunctionDef(self, node: ast.FunctionDef): + """Visit function definition - extract symbol and track context.""" + # Skip if this is a method (already handled by ClassDef) + if self.current_class: + return - # Regular arguments - for arg in node.args.args: - args.append(arg.arg) + # Skip if already processed + node_id = id(node) + if node_id in self.processed_nodes: + return + self.processed_nodes.add(node_id) - # Varargs (*args) - if node.args.vararg: - args.append(f"*{node.args.vararg.arg}") + func_name = node.name + symbol_id = self._create_symbol_id(self.file_path, func_name) - # Keyword arguments (**kwargs) - if node.args.kwarg: - args.append(f"**{node.args.kwarg.arg}") + # Extract function signature and docstring + signature = self._extract_function_signature(node) + docstring = ast.get_docstring(node) - signature = f"def {node.name}({', '.join(args)}):" - return signature - - def _analyze_calls(self, tree: ast.AST, symbols: Dict[str, SymbolInfo], file_path: str): - """Analyze function calls and build caller-callee relationships.""" - visitor = 
CallAnalysisVisitor(symbols, file_path) - visitor.visit(tree) - - -class CallAnalysisVisitor(ast.NodeVisitor): - """AST visitor to analyze function calls and build caller-callee relationships.""" - - def __init__(self, symbols: Dict[str, SymbolInfo], file_path: str): - self.symbols = symbols - self.file_path = file_path - self.current_function_stack = [] - self.current_class = None - - def visit_ClassDef(self, node: ast.ClassDef): - """Visit class definition and track context.""" - self.current_class = node.name + # Create symbol info + symbol_info = SymbolInfo( + type="function", + file=self.file_path, + line=node.lineno, + signature=signature, + docstring=docstring + ) + + # Store in symbols and lookup index + self.symbols[symbol_id] = symbol_info + self.symbol_lookup[func_name] = symbol_id + self.functions.append(func_name) + + # Track function context for call analysis + function_id = f"{self.file_path}::{func_name}" + self.current_function_stack.append(function_id) + + # Visit function body to analyze calls self.generic_visit(node) - self.current_class = None + + # Pop function from stack + self.current_function_stack.pop() - def visit_FunctionDef(self, node: ast.FunctionDef): - """Visit function definition and track context.""" - # File path is already relative after our fix - relative_path = self.file_path + def _handle_method(self, node: ast.FunctionDef, class_name: str): + """Handle method definition within a class.""" + method_name = f"{class_name}.{node.name}" + method_symbol_id = self._create_symbol_id(self.file_path, method_name) - # Handle methods within classes - if self.current_class: - function_id = f"{relative_path}::{self.current_class}.{node.name}" - else: - function_id = f"{relative_path}::{node.name}" - + method_signature = self._extract_function_signature(node) + method_docstring = ast.get_docstring(node) + + # Create symbol info + symbol_info = SymbolInfo( + type="method", + file=self.file_path, + line=node.lineno, + signature=method_signature, + docstring=method_docstring + ) + + # Store in symbols and lookup index + self.symbols[method_symbol_id] = symbol_info + self.symbol_lookup[method_name] = method_symbol_id + self.symbol_lookup[node.name] = method_symbol_id # Also index by method name alone + self.functions.append(method_name) + + # Track method context for call analysis + function_id = f"{self.file_path}::{method_name}" self.current_function_stack.append(function_id) - # Visit all child nodes within this function - self.generic_visit(node) + # Visit method body to analyze calls + for child in node.body: + self.visit(child) - # Pop the function from stack when done + # Pop method from stack self.current_function_stack.pop() + def visit_Import(self, node: ast.Import): + """Handle import statements.""" + for alias in node.names: + self.imports.append(alias.name) + self.generic_visit(node) + + def visit_ImportFrom(self, node: ast.ImportFrom): + """Handle from...import statements.""" + if node.module: + for alias in node.names: + self.imports.append(f"{node.module}.{alias.name}") + self.generic_visit(node) + def visit_Call(self, node: ast.Call): - """Visit function call and record relationship.""" + """Visit function call and record relationship using O(1) lookup.""" + if not self.current_function_stack: + self.generic_visit(node) + return + try: # Get the function name being called called_function = None @@ -208,28 +210,55 @@ def visit_Call(self, node: ast.Call): elif isinstance(node.func, ast.Attribute): # Method call: obj.method() or module.function() 
called_function = node.func.attr - - if called_function and self.current_function_stack: + + if called_function: # Get the current calling function caller_function = self.current_function_stack[-1] - # Look for the called function in our symbols and add relationship - for symbol_id, symbol_info in self.symbols.items(): + # Use O(1) lookup instead of O(n) iteration + # First try exact match + if called_function in self.symbol_lookup: + symbol_id = self.symbol_lookup[called_function] + symbol_info = self.symbols[symbol_id] if symbol_info.type in ["function", "method"]: - # Extract just the function/method name from the symbol ID - symbol_name = symbol_id.split("::")[-1] - - # Check for exact match or method name match (ClassName.method) - if (symbol_name == called_function or - symbol_name.endswith(f".{called_function}")): - # Add caller to the called function's called_by list - if caller_function not in symbol_info.called_by: - symbol_info.called_by.append(caller_function) - break + if caller_function not in symbol_info.called_by: + symbol_info.called_by.append(caller_function) + else: + # Try method name match for any class + for name, symbol_id in self.symbol_lookup.items(): + if name.endswith(f".{called_function}"): + symbol_info = self.symbols[symbol_id] + if symbol_info.type in ["function", "method"]: + if caller_function not in symbol_info.called_by: + symbol_info.called_by.append(caller_function) + break except Exception: # Silently handle parsing errors for complex call patterns pass - + # Continue visiting child nodes self.generic_visit(node) + def _create_symbol_id(self, file_path: str, symbol_name: str) -> str: + """Create a unique symbol ID.""" + return f"{file_path}::{symbol_name}" + + def _extract_function_signature(self, node: ast.FunctionDef) -> str: + """Extract function signature from AST node.""" + # Build basic signature + args = [] + + # Regular arguments + for arg in node.args.args: + args.append(arg.arg) + + # Varargs (*args) + if node.args.vararg: + args.append(f"*{node.args.vararg.arg}") + + # Keyword arguments (**kwargs) + if node.args.kwarg: + args.append(f"**{node.args.kwarg.arg}") + + signature = f"def {node.name}({', '.join(args)}):" + return signature \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/typescript_strategy.py b/src/code_index_mcp/indexing/strategies/typescript_strategy.py index efd2ec9..05ed04d 100644 --- a/src/code_index_mcp/indexing/strategies/typescript_strategy.py +++ b/src/code_index_mcp/indexing/strategies/typescript_strategy.py @@ -1,9 +1,9 @@ """ -TypeScript parsing strategy using tree-sitter. +TypeScript parsing strategy using tree-sitter - Optimized single-pass version. 
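(Likewise for the Python visitor: a minimal, runnable sketch — import path assumed from this diff — showing the call edge the single pass records.)

```python
from code_index_mcp.indexing.strategies.python_strategy import PythonParsingStrategy

SOURCE = '''
def helper():
    return 1

def main():
    return helper()
'''

symbols, _ = PythonParsingStrategy().parse_file("pkg/mod.py", SOURCE)
# visit_FunctionDef pushes 'pkg/mod.py::main'; visit_Call then finds 'helper'
# in symbol_lookup and appends the caller:
print(symbols["pkg/mod.py::helper"].called_by)  # expected: ['pkg/mod.py::main']
```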
""" import logging -from typing import Dict, List, Tuple, Optional +from typing import Dict, List, Tuple, Optional, Set from .base_strategy import ParsingStrategy from ..models import SymbolInfo, FileInfo @@ -14,7 +14,7 @@ class TypeScriptParsingStrategy(ParsingStrategy): - """TypeScript-specific parsing strategy using tree-sitter.""" + """TypeScript-specific parsing strategy using tree-sitter - Single Pass Optimized.""" def __init__(self): self.ts_language = tree_sitter.Language(language_typescript()) @@ -26,19 +26,32 @@ def get_supported_extensions(self) -> List[str]: return ['.ts', '.tsx'] def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse TypeScript file using tree-sitter.""" + """Parse TypeScript file using tree-sitter with single-pass optimization.""" symbols = {} functions = [] classes = [] imports = [] exports = [] + # Symbol lookup index for O(1) access + symbol_lookup = {} # name -> symbol_id mapping + parser = tree_sitter.Parser(self.ts_language) tree = parser.parse(content.encode('utf8')) - # Phase 1: Extract symbols - self._traverse_ts_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) - # Phase 2: Analyze function calls using tree-sitter - self._analyze_ts_calls_with_tree_sitter(tree.root_node, content, file_path, symbols) + + # Single-pass traversal that handles everything + context = TraversalContext( + content=content, + file_path=file_path, + symbols=symbols, + functions=functions, + classes=classes, + imports=imports, + exports=exports, + symbol_lookup=symbol_lookup + ) + + self._traverse_node_single_pass(tree.root_node, context) file_info = FileInfo( language=self.get_language_name(), @@ -50,63 +63,145 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo return symbols, file_info - def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], - functions: List[str], classes: List[str], imports: List[str], exports: List[str]): - """Traverse TypeScript AST node.""" + def _traverse_node_single_pass(self, node, context: 'TraversalContext', + current_function: Optional[str] = None, + current_class: Optional[str] = None): + """Single-pass traversal that extracts symbols and analyzes calls.""" + + # Handle function declarations if node.type == 'function_declaration': - name = self._get_function_name(node, content) + name = self._get_function_name(node, context.content) if name: - symbol_id = self._create_symbol_id(file_path, name) - signature = self._get_ts_function_signature(node, content) - symbols[symbol_id] = SymbolInfo( + symbol_id = self._create_symbol_id(context.file_path, name) + signature = self._get_ts_function_signature(node, context.content) + symbol_info = SymbolInfo( type="function", - file=file_path, + file=context.file_path, line=node.start_point[0] + 1, signature=signature ) - functions.append(name) - + context.symbols[symbol_id] = symbol_info + context.symbol_lookup[name] = symbol_id + context.functions.append(name) + + # Traverse function body with updated context + func_context = f"{context.file_path}::{name}" + for child in node.children: + self._traverse_node_single_pass(child, context, current_function=func_context, + current_class=current_class) + return + + # Handle class declarations elif node.type == 'class_declaration': - name = self._get_class_name(node, content) + name = self._get_class_name(node, context.content) if name: - symbol_id = self._create_symbol_id(file_path, name) - 
symbols[symbol_id] = SymbolInfo( + symbol_id = self._create_symbol_id(context.file_path, name) + symbol_info = SymbolInfo( type="class", - file=file_path, + file=context.file_path, line=node.start_point[0] + 1 ) - classes.append(name) + context.symbols[symbol_id] = symbol_info + context.symbol_lookup[name] = symbol_id + context.classes.append(name) + # Traverse class body with updated context + for child in node.children: + self._traverse_node_single_pass(child, context, current_function=current_function, + current_class=name) + return + + # Handle interface declarations elif node.type == 'interface_declaration': - name = self._get_interface_name(node, content) + name = self._get_interface_name(node, context.content) if name: - symbol_id = self._create_symbol_id(file_path, name) - symbols[symbol_id] = SymbolInfo( + symbol_id = self._create_symbol_id(context.file_path, name) + symbol_info = SymbolInfo( type="interface", - file=file_path, + file=context.file_path, line=node.start_point[0] + 1 ) - classes.append(name) # Group interfaces with classes for simplicity + context.symbols[symbol_id] = symbol_info + context.symbol_lookup[name] = symbol_id + context.classes.append(name) # Group interfaces with classes + + # Traverse interface body with updated context + for child in node.children: + self._traverse_node_single_pass(child, context, current_function=current_function, + current_class=name) + return + # Handle method definitions elif node.type == 'method_definition': - method_name = self._get_method_name(node, content) - class_name = self._find_parent_class(node, content) - if method_name and class_name: - full_name = f"{class_name}.{method_name}" - symbol_id = self._create_symbol_id(file_path, full_name) - signature = self._get_ts_function_signature(node, content) - symbols[symbol_id] = SymbolInfo( + method_name = self._get_method_name(node, context.content) + if method_name and current_class: + full_name = f"{current_class}.{method_name}" + symbol_id = self._create_symbol_id(context.file_path, full_name) + signature = self._get_ts_function_signature(node, context.content) + symbol_info = SymbolInfo( type="method", - file=file_path, + file=context.file_path, line=node.start_point[0] + 1, signature=signature ) - # Add method to functions list for consistency - functions.append(full_name) + context.symbols[symbol_id] = symbol_info + context.symbol_lookup[full_name] = symbol_id + context.symbol_lookup[method_name] = symbol_id # Also index by method name alone + context.functions.append(full_name) + + # Traverse method body with updated context + method_context = f"{context.file_path}::{full_name}" + for child in node.children: + self._traverse_node_single_pass(child, context, current_function=method_context, + current_class=current_class) + return + + # Handle function calls + elif node.type == 'call_expression' and current_function: + # Extract the function being called + called_function = None + if node.children: + func_node = node.children[0] + if func_node.type == 'identifier': + # Direct function call + called_function = context.content[func_node.start_byte:func_node.end_byte] + elif func_node.type == 'member_expression': + # Method call (obj.method or this.method) + for child in func_node.children: + if child.type == 'property_identifier': + called_function = context.content[child.start_byte:child.end_byte] + break + + # Add relationship using O(1) lookup + if called_function: + if called_function in context.symbol_lookup: + symbol_id = context.symbol_lookup[called_function] + 
symbol_info = context.symbols[symbol_id] + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + else: + # Try to find method with class prefix + for name, sid in context.symbol_lookup.items(): + if name.endswith(f".{called_function}"): + symbol_info = context.symbols[sid] + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + break + + # Handle import declarations + elif node.type == 'import_statement': + import_text = context.content[node.start_byte:node.end_byte] + context.imports.append(import_text) - # Continue traversing children + # Handle export declarations + elif node.type in ['export_statement', 'export_default_declaration']: + export_text = context.content[node.start_byte:node.end_byte] + context.exports.append(export_text) + + # Continue traversing children for other node types for child in node.children: - self._traverse_ts_node(child, content, file_path, symbols, functions, classes, imports, exports) + self._traverse_node_single_pass(child, context, current_function=current_function, + current_class=current_class) def _get_function_name(self, node, content: str) -> Optional[str]: """Extract function name from tree-sitter node.""" @@ -136,65 +231,21 @@ def _get_method_name(self, node, content: str) -> Optional[str]: return content[child.start_byte:child.end_byte] return None - def _find_parent_class(self, node, content: str) -> Optional[str]: - """Find the parent class of a method.""" - parent = node.parent - while parent: - if parent.type in ['class_declaration', 'interface_declaration']: - return self._get_class_name(parent, content) or self._get_interface_name(parent, content) - parent = parent.parent - return None - def _get_ts_function_signature(self, node, content: str) -> str: """Extract TypeScript function signature.""" return content[node.start_byte:node.end_byte].split('\n')[0].strip() - def _analyze_ts_calls_with_tree_sitter(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], - current_function: Optional[str] = None, current_class: Optional[str] = None): - """Analyze TypeScript function calls using tree-sitter AST.""" - # Track function/method context - if node.type == 'function_declaration': - func_name = self._get_function_name(node, content) - if func_name: - current_function = f"{file_path}::{func_name}" - elif node.type == 'method_definition': - method_name = self._get_method_name(node, content) - parent_class = self._find_parent_class(node, content) - if method_name and parent_class: - current_function = f"{file_path}::{parent_class}.{method_name}" - elif node.type == 'class_declaration': - current_class = self._get_class_name(node, content) - - # Detect function calls - if node.type == 'call_expression' and current_function: - # Extract the function being called - called_function = None - if node.children: - func_node = node.children[0] - if func_node.type == 'identifier': - # Direct function call - called_function = content[func_node.start_byte:func_node.end_byte] - elif func_node.type == 'member_expression': - # Method call (obj.method or this.method) - for child in func_node.children: - if child.type == 'property_identifier': - called_function = content[child.start_byte:child.end_byte] - break - - # Add relationship if we found the called function - if called_function: - for symbol_id, symbol_info in symbols.items(): - if symbol_info.type in ["function", "method"]: - symbol_name = symbol_id.split("::")[-1] - # Check for exact match or 
method name match - if (symbol_name == called_function or - symbol_name.endswith(f".{called_function}")): - if current_function not in symbol_info.called_by: - symbol_info.called_by.append(current_function) - break - - # Recursively process children - for child in node.children: - self._analyze_ts_calls_with_tree_sitter(child, content, file_path, symbols, current_function, current_class) +class TraversalContext: + """Context object to pass state during single-pass traversal.""" + def __init__(self, content: str, file_path: str, symbols: Dict, + functions: List, classes: List, imports: List, exports: List, symbol_lookup: Dict): + self.content = content + self.file_path = file_path + self.symbols = symbols + self.functions = functions + self.classes = classes + self.imports = imports + self.exports = exports + self.symbol_lookup = symbol_lookup \ No newline at end of file diff --git a/src/code_index_mcp/search/ag.py b/src/code_index_mcp/search/ag.py index e2506a2..aa3eb33 100644 --- a/src/code_index_mcp/search/ag.py +++ b/src/code_index_mcp/search/ag.py @@ -27,7 +27,8 @@ def search( context_lines: int = 0, file_pattern: Optional[str] = None, fuzzy: bool = False, - regex: bool = False + regex: bool = False, + max_line_length: Optional[int] = None ) -> Dict[str, List[Tuple[int, str]]]: """ Execute a search using The Silver Searcher (ag). @@ -40,6 +41,7 @@ def search( file_pattern: File pattern to filter fuzzy: Enable word boundary matching (not true fuzzy search) regex: Enable regex pattern matching + max_line_length: Optional. Limit the length of lines when context_lines is used """ # ag prints line numbers and groups by file by default, which is good. # --noheading is used to be consistent with other tools' output format. @@ -93,6 +95,26 @@ def search( cmd.extend(['-G', regex_pattern]) + processed_patterns = set() + exclude_dirs = getattr(self, 'exclude_dirs', []) + exclude_file_patterns = getattr(self, 'exclude_file_patterns', []) + + for directory in exclude_dirs: + normalized = directory.strip() + if not normalized or normalized in processed_patterns: + continue + cmd.extend(['--ignore', normalized]) + processed_patterns.add(normalized) + + for pattern in exclude_file_patterns: + normalized = pattern.strip() + if not normalized or normalized in processed_patterns: + continue + if normalized.startswith('!'): + normalized = normalized[1:] + cmd.extend(['--ignore', normalized]) + processed_patterns.add(normalized) + # Add -- to treat pattern as a literal argument, preventing injection cmd.append('--') cmd.append(search_pattern) @@ -116,10 +138,10 @@ def search( if process.returncode > 1: raise RuntimeError(f"ag failed with exit code {process.returncode}: {process.stderr}") - return parse_search_output(process.stdout, base_path) + return parse_search_output(process.stdout, base_path, max_line_length) except FileNotFoundError: raise RuntimeError("'ag' (The Silver Searcher) not found. 
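(For reference, roughly the ag argument vector the exclusion loop above would assemble, assuming exclude_dirs=['node_modules'] and exclude_file_patterns=['*.min.js']; other flags omitted. Illustrative only — the flag usage follows ag's documented --ignore option.)

```python
cmd = [
    'ag', '--noheading',
    '--ignore', 'node_modules',  # one --ignore per excluded directory
    '--ignore', '*.min.js',      # and one per excluded file pattern
    '--', 'TODO',                # '--' stops flag parsing; pattern stays literal
]
```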
Please install it and ensure it's in your PATH.") except Exception as e: # Re-raise other potential exceptions like permission errors - raise RuntimeError(f"An error occurred while running ag: {e}") + raise RuntimeError(f"An error occurred while running ag: {e}") diff --git a/src/code_index_mcp/search/base.py b/src/code_index_mcp/search/base.py index 038e6b5..5e4c63b 100644 --- a/src/code_index_mcp/search/base.py +++ b/src/code_index_mcp/search/base.py @@ -10,17 +10,25 @@ import subprocess import sys from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Tuple, Any +from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING from ..indexing.qualified_names import normalize_file_path -def parse_search_output(output: str, base_path: str) -> Dict[str, List[Tuple[int, str]]]: +if TYPE_CHECKING: # pragma: no cover + from ..utils.file_filter import FileFilter + +def parse_search_output( + output: str, + base_path: str, + max_line_length: Optional[int] = None +) -> Dict[str, List[Tuple[int, str]]]: """ Parse the output of command-line search tools (grep, ag, rg). Args: output: The raw output from the command-line tool. base_path: The base path of the project to make file paths relative. + max_line_length: Optional maximum line length to truncate long lines. Returns: A dictionary where keys are file paths and values are lists of (line_number, line_content) tuples. @@ -33,26 +41,53 @@ def parse_search_output(output: str, base_path: str) -> Dict[str, List[Tuple[int if not line.strip(): continue try: - # Handle Windows paths which might have a drive letter, e.g., C: + # Try to parse as a matched line first (format: path:linenum:content) parts = line.split(':', 2) - if sys.platform == "win32" and len(parts[0]) == 1 and parts[1].startswith('\\'): - # Re-join drive letter with the rest of the path + + # Check if this might be a context line (format: path-linenum-content) + # Context lines use '-' as separator in grep/ag output + if len(parts) < 3 and '-' in line: + # Try to parse as context line + # Match pattern: path-linenum-content or path-linenum-\tcontent + match = re.match(r'^(.*?)-(\d+)[-\t](.*)$', line) + if match: + file_path_abs = match.group(1) + line_number_str = match.group(2) + content = match.group(3) + else: + # If regex doesn't match, skip this line + continue + elif sys.platform == "win32" and len(parts) >= 3 and len(parts[0]) == 1 and parts[1].startswith('\\'): + # Handle Windows paths with drive letter (e.g., C:\path\file.txt) file_path_abs = f"{parts[0]}:{parts[1]}" line_number_str = parts[2].split(':', 1)[0] - content = parts[2].split(':', 1)[1] - else: + content = parts[2].split(':', 1)[1] if ':' in parts[2] else parts[2] + elif len(parts) >= 3: + # Standard format: path:linenum:content file_path_abs = parts[0] line_number_str = parts[1] content = parts[2] + else: + # Line doesn't match any expected format + continue line_number = int(line_number_str) - # Make the file path relative to the base_path - relative_path = os.path.relpath(file_path_abs, normalized_base_path) + # If the path is already relative (doesn't start with /), keep it as is + # Otherwise, make it relative to the base_path + if os.path.isabs(file_path_abs): + relative_path = os.path.relpath(file_path_abs, normalized_base_path) + else: + # Path is already relative, use it as is + relative_path = file_path_abs # Normalize path separators for consistency relative_path = normalize_file_path(relative_path) + # Truncate content if it exceeds max_line_length + if max_line_length and 
len(content) > max_line_length: + content = content[:max_line_length] + '... (truncated)' + if relative_path not in results: results[relative_path] = [] results[relative_path].append((line_number, content)) @@ -150,6 +185,16 @@ class SearchStrategy(ABC): Each strategy is responsible for searching code using a specific tool or method. """ + def configure_excludes(self, file_filter: Optional['FileFilter']) -> None: + """Configure shared exclusion settings for the strategy.""" + self.file_filter = file_filter + if file_filter: + self.exclude_dirs = sorted(set(file_filter.exclude_dirs)) + self.exclude_file_patterns = sorted(set(file_filter.exclude_files)) + else: + self.exclude_dirs = [] + self.exclude_file_patterns = [] + @property @abstractmethod def name(self) -> str: @@ -175,7 +220,8 @@ def search( context_lines: int = 0, file_pattern: Optional[str] = None, fuzzy: bool = False, - regex: bool = False + regex: bool = False, + max_line_length: Optional[int] = None ) -> Dict[str, List[Tuple[int, str]]]: """ Execute a search using the specific strategy. @@ -193,4 +239,3 @@ def search( A dictionary mapping filenames to lists of (line_number, line_content) tuples. """ pass - diff --git a/src/code_index_mcp/search/basic.py b/src/code_index_mcp/search/basic.py index 57aab77..9ef1846 100644 --- a/src/code_index_mcp/search/basic.py +++ b/src/code_index_mcp/search/basic.py @@ -1,9 +1,10 @@ """ Basic, pure-Python search strategy. """ +import fnmatch import os import re -import fnmatch +from pathlib import Path from typing import Dict, List, Optional, Tuple from .base import SearchStrategy, create_word_boundary_pattern, is_safe_regex_pattern @@ -46,7 +47,8 @@ def search( context_lines: int = 0, file_pattern: Optional[str] = None, fuzzy: bool = False, - regex: bool = False + regex: bool = False, + max_line_length: Optional[int] = None ) -> Dict[str, List[Tuple[int, str]]]: """ Execute a basic, line-by-line search. @@ -60,6 +62,7 @@ def search( file_pattern: File pattern to filter fuzzy: Enable word boundary matching regex: Enable regex pattern matching + max_line_length: Optional. Limit the length of lines when context_lines is used """ results: Dict[str, List[Tuple[int, str]]] = {} @@ -81,28 +84,38 @@ def search( except re.error as e: raise ValueError(f"Invalid regex pattern: {pattern}, error: {e}") - for root, _, files in os.walk(base_path): + file_filter = getattr(self, 'file_filter', None) + base = Path(base_path) + + for root, dirs, files in os.walk(base_path): + if file_filter: + dirs[:] = [d for d in dirs if not file_filter.should_exclude_directory(d)] + for file in files: - # Improved file pattern matching with glob support if file_pattern and not self._matches_pattern(file, file_pattern): continue - file_path = os.path.join(root, file) + file_path = Path(root) / file + + if file_filter and not file_filter.should_process_path(file_path, base): + continue + rel_path = os.path.relpath(file_path, base_path) - + try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: for line_num, line in enumerate(f, 1): if search_regex.search(line): + content = line.rstrip('\n') + if max_line_length and len(content) > max_line_length: + content = content[:max_line_length] + '... 
(truncated)' + if rel_path not in results: results[rel_path] = [] - # Strip newline for consistent output - results[rel_path].append((line_num, line.rstrip('\n'))) + results[rel_path].append((line_num, content)) except (UnicodeDecodeError, PermissionError, OSError): - # Ignore files that can't be opened or read due to encoding/permission issues continue except Exception: - # Ignore any other unexpected exceptions to maintain robustness continue - return results \ No newline at end of file + return results diff --git a/src/code_index_mcp/search/grep.py b/src/code_index_mcp/search/grep.py index cd2d18e..f24c469 100644 --- a/src/code_index_mcp/search/grep.py +++ b/src/code_index_mcp/search/grep.py @@ -32,7 +32,8 @@ def search( context_lines: int = 0, file_pattern: Optional[str] = None, fuzzy: bool = False, - regex: bool = False + regex: bool = False, + max_line_length: Optional[int] = None ) -> Dict[str, List[Tuple[int, str]]]: """ Execute a search using standard grep. @@ -45,6 +46,7 @@ def search( file_pattern: File pattern to filter fuzzy: Enable word boundary matching regex: Enable regex pattern matching + max_line_length: Optional. Limit the length of lines when context_lines is used """ # -r: recursive, -n: line number cmd = ['grep', '-r', '-n'] @@ -81,6 +83,27 @@ def search( # Note: grep's --include uses glob patterns, not regex cmd.append(f'--include={file_pattern}') + exclude_dirs = getattr(self, 'exclude_dirs', []) + exclude_file_patterns = getattr(self, 'exclude_file_patterns', []) + + processed_dirs = set() + for directory in exclude_dirs: + normalized = directory.strip() + if not normalized or normalized in processed_dirs: + continue + cmd.append(f'--exclude-dir={normalized}') + processed_dirs.add(normalized) + + processed_files = set() + for pattern in exclude_file_patterns: + normalized = pattern.strip() + if not normalized or normalized in processed_files: + continue + if normalized.startswith('!'): + normalized = normalized[1:] + cmd.append(f'--exclude={normalized}') + processed_files.add(normalized) + # Add -- to treat pattern as a literal argument, preventing injection cmd.append('--') cmd.append(search_pattern) @@ -102,9 +125,9 @@ def search( if process.returncode > 1: raise RuntimeError(f"grep failed with exit code {process.returncode}: {process.stderr}") - return parse_search_output(process.stdout, base_path) + return parse_search_output(process.stdout, base_path, max_line_length) except FileNotFoundError: raise RuntimeError("'grep' not found. Please install it and ensure it's in your PATH.") except Exception as e: - raise RuntimeError(f"An error occurred while running grep: {e}") + raise RuntimeError(f"An error occurred while running grep: {e}") diff --git a/src/code_index_mcp/search/ripgrep.py b/src/code_index_mcp/search/ripgrep.py index 15dd6c0..8a5c325 100644 --- a/src/code_index_mcp/search/ripgrep.py +++ b/src/code_index_mcp/search/ripgrep.py @@ -27,7 +27,8 @@ def search( context_lines: int = 0, file_pattern: Optional[str] = None, fuzzy: bool = False, - regex: bool = False + regex: bool = False, + max_line_length: Optional[int] = None ) -> Dict[str, List[Tuple[int, str]]]: """ Execute a search using ripgrep. @@ -40,6 +41,7 @@ def search( file_pattern: File pattern to filter fuzzy: Enable word boundary matching (not true fuzzy search) regex: Enable regex pattern matching + max_line_length: Optional. 
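(The truncation rule shared by parse_search_output and the basic strategy is easy to state in isolation; a hedged, self-contained restatement:)

```python
from typing import Optional

def truncate(content: str, max_line_length: Optional[int]) -> str:
    # Over-long lines keep the first max_line_length characters plus a marker;
    # None (or 0) disables truncation entirely.
    if max_line_length and len(content) > max_line_length:
        return content[:max_line_length] + '... (truncated)'
    return content

assert truncate('x' * 100, 20) == 'x' * 20 + '... (truncated)'
assert truncate('short', None) == 'short'
```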
Limit the length of lines when context_lines is used """ cmd = ['rg', '--line-number', '--no-heading', '--color=never', '--no-ignore'] @@ -67,6 +69,31 @@ def search( if file_pattern: cmd.extend(['--glob', file_pattern]) + exclude_dirs = getattr(self, 'exclude_dirs', []) + exclude_file_patterns = getattr(self, 'exclude_file_patterns', []) + + processed_patterns = set() + + for directory in exclude_dirs: + normalized = directory.strip() + if not normalized or normalized in processed_patterns: + continue + cmd.extend(['--glob', f'!**/{normalized}/**']) + processed_patterns.add(normalized) + + for pattern in exclude_file_patterns: + normalized = pattern.strip() + if not normalized or normalized in processed_patterns: + continue + if normalized.startswith('!'): + glob_pattern = normalized + elif any(ch in normalized for ch in '*?[') or '/' in normalized: + glob_pattern = f'!{normalized}' + else: + glob_pattern = f'!**/{normalized}' + cmd.extend(['--glob', glob_pattern]) + processed_patterns.add(normalized) + # Add -- to treat pattern as a literal argument, preventing injection cmd.append('--') cmd.append(search_pattern) @@ -87,10 +114,10 @@ def search( if process.returncode > 1: raise RuntimeError(f"ripgrep failed with exit code {process.returncode}: {process.stderr}") - return parse_search_output(process.stdout, base_path) + return parse_search_output(process.stdout, base_path, max_line_length) except FileNotFoundError: raise RuntimeError("ripgrep (rg) not found. Please install it and ensure it's in your PATH.") except Exception as e: # Re-raise other potential exceptions like permission errors - raise RuntimeError(f"An error occurred while running ripgrep: {e}") + raise RuntimeError(f"An error occurred while running ripgrep: {e}") diff --git a/src/code_index_mcp/search/ugrep.py b/src/code_index_mcp/search/ugrep.py index 69f2cc4..d4302c1 100644 --- a/src/code_index_mcp/search/ugrep.py +++ b/src/code_index_mcp/search/ugrep.py @@ -27,7 +27,8 @@ def search( context_lines: int = 0, file_pattern: Optional[str] = None, fuzzy: bool = False, - regex: bool = False + regex: bool = False, + max_line_length: Optional[int] = None ) -> Dict[str, List[Tuple[int, str]]]: """ Execute a search using the 'ug' command-line tool. @@ -40,11 +41,12 @@ def search( file_pattern: File pattern to filter fuzzy: Enable true fuzzy search (ugrep native support) regex: Enable regex pattern matching + max_line_length: Optional. 
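(A self-contained restatement of the ripgrep glob-negation rules in the hunk above; strip/deduplication omitted for brevity, and the expected output is inferred from the loop, not captured.)

```python
def rg_exclude_globs(exclude_dirs, exclude_file_patterns):
    globs = [f'!**/{d}/**' for d in exclude_dirs]      # exclude whole subtrees
    for pattern in exclude_file_patterns:
        if pattern.startswith('!'):
            globs.append(pattern)                      # already a negated glob
        elif any(ch in pattern for ch in '*?[') or '/' in pattern:
            globs.append(f'!{pattern}')                # glob-like or path-qualified
        else:
            globs.append(f'!**/{pattern}')             # bare filename, any depth
    return globs

assert rg_exclude_globs(['node_modules'], ['*.log', 'Thumbs.db']) == [
    '!**/node_modules/**', '!*.log', '!**/Thumbs.db']
```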
Truncate returned lines longer than this many characters """ if not self.is_available(): return {"error": "ugrep (ug) command not found."} - cmd = ['ug', '--line-number', '--no-heading'] + cmd = ['ug', '-r', '--line-number', '--no-heading'] if fuzzy: # ugrep has native fuzzy search support @@ -65,7 +67,31 @@ cmd.extend(['-A', str(context_lines), '-B', str(context_lines)]) if file_pattern: - cmd.extend(['-g', file_pattern]) # Correct parameter for file patterns + cmd.extend(['--include', file_pattern]) + + processed_patterns = set() + exclude_dirs = getattr(self, 'exclude_dirs', []) + exclude_file_patterns = getattr(self, 'exclude_file_patterns', []) + + # ugrep is grep-compatible: use --exclude-dir/--exclude (it has no plain --ignore) + for directory in exclude_dirs: + normalized = directory.strip() + if not normalized or normalized in processed_patterns: + continue + cmd.append(f'--exclude-dir={normalized}') + processed_patterns.add(normalized) + + for pattern in exclude_file_patterns: + normalized = pattern.strip() + if not normalized or normalized in processed_patterns: + continue + if normalized.startswith('!'): + normalized = normalized[1:] + cmd.append(f'--exclude={normalized}') + processed_patterns.add(normalized) # Add '--' to treat pattern as a literal argument, preventing injection cmd.append('--') @@ -89,7 +115,7 @@ error_output = process.stderr.strip() return {"error": f"ugrep execution failed with code {process.returncode}", "details": error_output} - return parse_search_output(process.stdout, base_path) + return parse_search_output(process.stdout, base_path, max_line_length) except FileNotFoundError: return {"error": "ugrep (ug) command not found. 
Please ensure it's installed and in your PATH."} diff --git a/src/code_index_mcp/server.py b/src/code_index_mcp/server.py index 5892c0a..2d1eb80 100644 --- a/src/code_index_mcp/server.py +++ b/src/code_index_mcp/server.py @@ -13,10 +13,9 @@ import logging from contextlib import asynccontextmanager from dataclasses import dataclass -from typing import AsyncIterator, Dict, Any, Optional, List +from typing import AsyncIterator, Dict, Any, List # Third-party imports -from mcp import types from mcp.server.fastmcp import FastMCP, Context # Local imports @@ -60,7 +59,6 @@ class CodeIndexerContext: base_path: str settings: ProjectSettings file_count: int = 0 - index_manager: Optional['UnifiedIndexManager'] = None file_watcher_service: FileWatcherService = None @asynccontextmanager @@ -87,10 +85,6 @@ async def indexer_lifespan(_server: FastMCP) -> AsyncIterator[CodeIndexerContext if context.file_watcher_service: context.file_watcher_service.stop_monitoring() - # Only save index if project path has been set - if context.base_path and context.index_manager: - context.index_manager.save_index() - # Create the MCP server with lifespan manager mcp = FastMCP("CodeIndexer", lifespan=indexer_lifespan, dependencies=["pathlib"]) @@ -111,13 +105,7 @@ def get_file_content(file_path: str) -> str: # Use FileService for simple file reading - this is appropriate for a resource return FileService(ctx).get_file_content(file_path) -@mcp.resource("structure://project") -@handle_mcp_resource_errors -def get_project_structure() -> str: - """Get the structure of the project as a JSON tree.""" - ctx = mcp.get_context() - return ProjectManagementService(ctx).get_project_structure() - +# Removed: structure://project resource - not necessary for most workflows # Removed: settings://stats resource - this information is available via get_settings_info() tool # and is more of a debugging/technical detail rather than context AI needs @@ -138,7 +126,8 @@ def search_code_advanced( context_lines: int = 0, file_pattern: str = None, fuzzy: bool = False, - regex: bool = None + regex: bool = None, + max_line_length: int = None ) -> Dict[str, Any]: """ Search for a code pattern in the project using an advanced, fast tool. @@ -152,6 +141,7 @@ def search_code_advanced( context_lines: Number of lines to show before and after the match. file_pattern: A glob pattern to filter files to search in (e.g., "*.py", "*.js", "test_*.py"). + max_line_length: Optional. Default None (no limit). Limits the length of lines when context_lines is used. All search tools now handle glob patterns consistently: - ugrep: Uses glob patterns (*.py, *.{js,ts}) - ripgrep: Uses glob patterns (*.py, *.{js,ts}) @@ -180,7 +170,8 @@ def search_code_advanced( context_lines=context_lines, file_pattern=file_pattern, fuzzy=fuzzy, - regex=regex + regex=regex, + max_line_length=max_line_length ) @mcp.tool() @@ -246,6 +237,16 @@ def refresh_index(ctx: Context) -> str: """ return IndexManagementService(ctx).rebuild_index() +@mcp.tool() +@handle_mcp_tool_errors(return_type='str') +def build_deep_index(ctx: Context) -> str: + """ + Build the deep index (full symbol extraction) for the current project. + + This performs a complete re-index and loads it into memory. 
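(Taken together with the shallow default, the intended tool sequence looks like this from a client's point of view — schematic comments only; the tool names are the ones registered in this file, and the needs_deep_index behavior is the one added to the code intelligence service later in this diff.)

```python
# set_project_path("/home/user/projects/demo")  -> builds the shallow file list (fast)
# find_files("src/**/*.py")                     -> served from the shallow index
# build_deep_index()                            -> full symbol extraction, on demand
# get_file_summary("src/app.py")                -> backed by the deep index; before
#                                                  build_deep_index it returns
#                                                  status="needs_deep_index"
```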
+ """ + return IndexManagementService(ctx).rebuild_deep_index() + @mcp.tool() @handle_mcp_tool_errors(return_type='dict') def get_settings_info(ctx: Context) -> Dict[str, Any]: @@ -297,62 +298,7 @@ def configure_file_watcher( return SystemManagementService(ctx).configure_file_watcher(enabled, debounce_seconds, additional_exclude_patterns) # ----- PROMPTS ----- - -@mcp.prompt() -def analyze_code(file_path: str = "", query: str = "") -> list[types.PromptMessage]: - """Prompt for analyzing code in the project.""" - messages = [ - types.PromptMessage(role="user", content=types.TextContent(type="text", text=f"""I need you to analyze some code from my project. - -{f'Please analyze the file: {file_path}' if file_path else ''} -{f'I want to understand: {query}' if query else ''} - -First, let me give you some context about the project structure. Then, I'll provide the code to analyze. -""")), - types.PromptMessage( - role="assistant", - content=types.TextContent( - type="text", - text="I'll help you analyze the code. Let me first examine the project structure to get a better understanding of the codebase." - ) - ) - ] - return messages - -@mcp.prompt() -def code_search(query: str = "") -> types.TextContent: - """Prompt for searching code in the project.""" - search_text = "\"query\"" if not query else f"\"{query}\"" - return types.TextContent( - type="text", - text=f"""I need to search through my codebase for {search_text}. - -Please help me find all occurrences of this query and explain what each match means in its context. -Focus on the most relevant files and provide a brief explanation of how each match is used in the code. - -If there are too many results, prioritize the most important ones and summarize the patterns you see.""" - ) - -@mcp.prompt() -def set_project() -> list[types.PromptMessage]: - """Prompt for setting the project path.""" - messages = [ - types.PromptMessage(role="user", content=types.TextContent(type="text", text=""" - I need to analyze code from a project, but I haven't set the project path yet. Please help me set up the project path and index the code. - - First, I need to specify which project directory to analyze. - """)), - types.PromptMessage(role="assistant", content=types.TextContent(type="text", text=""" - Before I can help you analyze any code, we need to set up the project path. This is a required first step. - - Please provide the full path to your project folder. For example: - - Windows: "C:/Users/username/projects/my-project" - - macOS/Linux: "/home/username/projects/my-project" - - Once you provide the path, I'll use the `set_project_path` tool to configure the code analyzer to work with your project. 
- """)) - ] - return messages +# Removed: analyze_code, code_search, set_project prompts def main(): """Main function to run the MCP server.""" diff --git a/src/code_index_mcp/services/code_intelligence_service.py b/src/code_index_mcp/services/code_intelligence_service.py index 77ff894..af0f1a2 100644 --- a/src/code_index_mcp/services/code_intelligence_service.py +++ b/src/code_index_mcp/services/code_intelligence_service.py @@ -9,12 +9,12 @@ import os from typing import Dict, Any -logger = logging.getLogger(__name__) - from .base_service import BaseService from ..tools.filesystem import FileSystemTool from ..indexing import get_index_manager +logger = logging.getLogger(__name__) + class CodeIntelligenceService(BaseService): """ @@ -61,9 +61,14 @@ def analyze_file(self, file_path: str) -> Dict[str, Any]: # Get file summary from JSON index summary = index_manager.get_file_summary(file_path) logger.info(f"Summary result: {summary is not None}") - + + # If deep index isn't available yet, return a helpful hint instead of error if not summary: - raise ValueError(f"File not found in index: {file_path}") + return { + "status": "needs_deep_index", + "message": "Deep index not available. Please run build_deep_index before calling get_file_summary.", + "file_path": file_path + } return summary diff --git a/src/code_index_mcp/services/file_discovery_service.py b/src/code_index_mcp/services/file_discovery_service.py index 478beea..d777511 100644 --- a/src/code_index_mcp/services/file_discovery_service.py +++ b/src/code_index_mcp/services/file_discovery_service.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from .base_service import BaseService -from ..indexing import get_index_manager +from ..indexing import get_shallow_index_manager @dataclass @@ -32,7 +32,7 @@ class FileDiscoveryService(BaseService): def __init__(self, ctx): super().__init__(ctx) - self._index_manager = get_index_manager() + self._index_manager = get_shallow_index_manager() def find_files(self, pattern: str, max_results: Optional[int] = None) -> List[str]: """ diff --git a/src/code_index_mcp/services/file_watcher_service.py b/src/code_index_mcp/services/file_watcher_service.py index cac4dd5..c2ef64c 100644 --- a/src/code_index_mcp/services/file_watcher_service.py +++ b/src/code_index_mcp/services/file_watcher_service.py @@ -50,6 +50,7 @@ def __init__(self): WATCHDOG_AVAILABLE = False from .base_service import BaseService +from ..constants import SUPPORTED_EXTENSIONS class FileWatcherService(BaseService): diff --git a/src/code_index_mcp/services/index_management_service.py b/src/code_index_mcp/services/index_management_service.py index e4714a3..f56c760 100644 --- a/src/code_index_mcp/services/index_management_service.py +++ b/src/code_index_mcp/services/index_management_service.py @@ -6,6 +6,8 @@ """ import time import logging +import os +import json from typing import Dict, Any from dataclasses import dataclass @@ -13,7 +15,7 @@ logger = logging.getLogger(__name__) from .base_service import BaseService -from ..indexing import get_index_manager +from ..indexing import get_index_manager, get_shallow_index_manager, DeepIndexManager @dataclass @@ -35,11 +37,18 @@ class IndexManagementService(BaseService): def __init__(self, ctx): super().__init__(ctx) + # Deep manager (symbols/files, legacy JSON index manager) self._index_manager = get_index_manager() + # Shallow manager (file-list only) for default workflows + self._shallow_manager = get_shallow_index_manager() + # Optional wrapper for explicit deep builds + self._deep_wrapper = 
diff --git a/src/code_index_mcp/services/index_management_service.py b/src/code_index_mcp/services/index_management_service.py
index e4714a3..f56c760 100644
--- a/src/code_index_mcp/services/index_management_service.py
+++ b/src/code_index_mcp/services/index_management_service.py
@@ -6,6 +6,8 @@
 """
 import time
 import logging
+import os
+import json
 from typing import Dict, Any
 from dataclasses import dataclass

@@ -13,7 +15,7 @@
 logger = logging.getLogger(__name__)

 from .base_service import BaseService
-from ..indexing import get_index_manager
+from ..indexing import get_index_manager, get_shallow_index_manager, DeepIndexManager


 @dataclass
@@ -35,11 +37,18 @@ class IndexManagementService(BaseService):

     def __init__(self, ctx):
         super().__init__(ctx)
+        # Deep manager (symbols/files, legacy JSON index manager)
         self._index_manager = get_index_manager()
+        # Shallow manager (file-list only) for default workflows
+        self._shallow_manager = get_shallow_index_manager()
+        # Optional wrapper for explicit deep builds
+        self._deep_wrapper = DeepIndexManager()

     def rebuild_index(self) -> str:
         """
-        Rebuild the project index using the new JSON indexing system.
+        Rebuild the project index (DEFAULT: shallow file list).
+
+        For deep/symbol rebuilds, use the build_deep_index() tool instead.

         Returns:
             Success message with rebuild information
@@ -50,11 +59,17 @@ def rebuild_index(self) -> str:
         # Business validation
         self._validate_rebuild_request()

-        # Business workflow: Execute rebuild
-        result = self._execute_rebuild_workflow()
+        # Shallow rebuild only (fast path)
+        if not self._shallow_manager.set_project_path(self.base_path):
+            raise RuntimeError("Failed to set project path (shallow) in index manager")
+        if not self._shallow_manager.build_index():
+            raise RuntimeError("Failed to rebuild shallow index")

-        # Business result formatting
-        return self._format_rebuild_result(result)
+        try:
+            count = len(self._shallow_manager.get_file_list())
+        except Exception:
+            count = 0
+        return f"Shallow index rebuilt with {count} files."

     def get_rebuild_status(self) -> Dict[str, Any]:
         """
@@ -137,3 +152,47 @@ def _format_rebuild_result(self, result: IndexRebuildResult) -> str:
             Formatted result string for MCP response
         """
         return f"Project re-indexed. Found {result.file_count} files."
+
+    def build_shallow_index(self) -> str:
+        """
+        Build and persist the shallow index (file list only).
+
+        Returns:
+            Success message including file count if available.
+
+        Raises:
+            ValueError/RuntimeError on validation or build failure
+        """
+        # Ensure project is set up
+        self._require_project_setup()
+
+        # Initialize manager with current base path
+        if not self._shallow_manager.set_project_path(self.base_path):
+            raise RuntimeError("Failed to set project path in index manager")
+
+        # Build shallow index
+        if not self._shallow_manager.build_index():
+            raise RuntimeError("Failed to build shallow index")
+
+        # Try to report count
+        count = 0
+        try:
+            shallow_path = getattr(self._shallow_manager, 'index_path', None)
+            if shallow_path and os.path.exists(shallow_path):
+                with open(shallow_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                    if isinstance(data, list):
+                        count = len(data)
+        except Exception as e:  # noqa: BLE001 - safe fallback to zero
+            logger.debug(f"Unable to read shallow index count: {e}")
+
+        return f"Shallow index built{f' with {count} files' if count else ''}."
+
+    def rebuild_deep_index(self) -> str:
+        """Rebuild the deep index using the original workflow."""
+        # Business validation
+        self._validate_rebuild_request()
+
+        # Deep rebuild via existing workflow
+        result = self._execute_rebuild_workflow()
+        return self._format_rebuild_result(result)
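`build_shallow_index` reports its file count by reading the persisted JSON directly and only trusts it when the payload is a flat list. A round-trip sketch under that assumed format (the path below is illustrative, not the real settings location):

```python
import json
import tempfile
from pathlib import Path

# Assumption: the shallow index persists as a flat JSON array of relative paths.
index_path = Path(tempfile.gettempdir()) / "shallow_index_demo.json"
index_path.write_text(json.dumps(["src/app.py", "README.md"]), encoding="utf-8")

data = json.loads(index_path.read_text(encoding="utf-8"))
count = len(data) if isinstance(data, list) else 0
print(f"Shallow index built with {count} files.")  # -> 2 files
```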
""" -import json import logging from typing import Dict, Any from dataclasses import dataclass @@ -13,7 +12,7 @@ from .base_service import BaseService from ..utils.response_formatter import ResponseFormatter from ..constants import SUPPORTED_EXTENSIONS -from ..indexing import get_index_manager +from ..indexing import get_index_manager, get_shallow_index_manager logger = logging.getLogger(__name__) @@ -40,14 +39,16 @@ class ProjectManagementService(BaseService): def __init__(self, ctx): super().__init__(ctx) - # Use the global singleton index manager + # Deep index manager (legacy full index) self._index_manager = get_index_manager() + # Shallow index manager (default for initialization) + self._shallow_manager = get_shallow_index_manager() from ..tools.config import ProjectConfigTool self._config_tool = ProjectConfigTool() # Import FileWatcherTool locally to avoid circular import from ..tools.monitoring import FileWatcherTool self._watcher_tool = FileWatcherTool(ctx) - + @contextmanager def _noop_operation(self, *_args, **_kwargs): @@ -106,15 +107,15 @@ def _execute_initialization_workflow(self, path: str) -> ProjectInitializationRe """ # Business step 1: Initialize config tool self._config_tool.initialize_settings(path) - + # Normalize path for consistent processing normalized_path = self._config_tool.normalize_project_path(path) # Business step 2: Cleanup existing project state self._cleanup_existing_project() - # Business step 3: Initialize JSON index manager - index_result = self._initialize_json_index_manager(normalized_path) + # Business step 3: Initialize shallow index by default (fast path) + index_result = self._initialize_shallow_index_manager(normalized_path) # Business step 3.1: Store index manager in context for other services self.helper.update_index_manager(self._index_manager) @@ -185,6 +186,45 @@ def _initialize_json_index_manager(self, project_path: str) -> Dict[str, Any]: 'languages': stats.get('languages', []) } + def _initialize_shallow_index_manager(self, project_path: str) -> Dict[str, Any]: + """ + Business logic to initialize the shallow index manager by default. 
@@ -217,7 +257,7 @@ def _load_existing_index(self, index_data: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             Dictionary with loading results
         """
-
+
         # Note: Legacy index loading is now handled by UnifiedIndexManager
         # This method is kept for backward compatibility but functionality moved

@@ -225,7 +265,7 @@ def _load_existing_index(self, index_data: Dict[str, Any]) -> Dict[str, Any]:
         # Extract file count from metadata
         file_count = index_data.get('project_metadata', {}).get('total_files', 0)
-
+

         return {
             'file_count': file_count,
@@ -243,22 +283,30 @@ def _setup_file_monitoring(self, project_path: str) -> str:
         Returns:
             String describing monitoring setup result
         """
-
+
         try:
             # Create rebuild callback that uses the JSON index manager
             def rebuild_callback():
                 logger.info("File watcher triggered rebuild callback")
                 try:
-                    logger.debug(f"Starting index rebuild for: {project_path}")
-                    # Business logic: File changed, rebuild using JSON index manager
-                    if self._index_manager.refresh_index():
-                        stats = self._index_manager.get_index_stats()
-                        file_count = stats.get('indexed_files', 0)
-                        logger.info(f"File watcher rebuild completed successfully - indexed {file_count} files")
-                        return True
-                    else:
-                        logger.warning("File watcher rebuild failed")
+                    logger.debug(f"Starting shallow index rebuild for: {project_path}")
+                    # Business logic: File changed, rebuild using SHALLOW index manager
+                    try:
+                        if not self._shallow_manager.set_project_path(project_path):
+                            logger.warning("Shallow manager set_project_path failed")
+                            return False
+                        if self._shallow_manager.build_index():
+                            files = self._shallow_manager.get_file_list()
+                            logger.info(f"File watcher shallow rebuild completed successfully - {len(files)} files")
+                            return True
+                        else:
+                            logger.warning("File watcher shallow rebuild failed")
+                            return False
+                    except Exception as e:
+                        import traceback
+                        logger.error(f"File watcher shallow rebuild failed: {e}")
+                        logger.error(f"Traceback: {traceback.format_exc()}")
                         return False
                 except Exception as e:
                     import traceback
@@ -285,7 +333,7 @@ def rebuild_callback():

     def _update_project_state(self, project_path: str, file_count: int) -> None:
         """Business logic to update system state after project initialization."""
-
+
         # Update context with file count
         self.helper.update_file_count(file_count)
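The rebuild callback handed to `FileWatcherTool` is a plain "rebuild and report success" hook; debouncing and event fan-in stay inside the watcher. A compact sketch of that debounce contract (a `threading.Timer` approximation, not the watchdog wiring used here):

```python
import threading
from typing import Callable, Optional

def debounced(seconds: float, callback: Callable[[], None]) -> Callable[..., None]:
    """Collapse a burst of file events into one trailing callback invocation."""
    timer: Optional[threading.Timer] = None
    lock = threading.Lock()

    def on_event(*_args) -> None:
        nonlocal timer
        with lock:
            if timer is not None:
                timer.cancel()  # a newer event supersedes the pending rebuild
            timer = threading.Timer(seconds, callback)
            timer.daemon = True
            timer.start()

    return on_event

on_change = debounced(2.0, lambda: print("shallow index rebuilt"))
```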
@@ -360,39 +408,4 @@ def get_project_config(self) -> str:

         return ResponseFormatter.config_response(config_data)

-    def get_project_structure(self) -> str:
-        """
-        Get the project directory structure for MCP resource.
-
-        Returns:
-            JSON formatted project structure
-        """
-
-        # Check if project is configured
-        if not self.helper.base_path:
-            structure_data = {
-                "status": "not_configured",
-                "message": ("Project path not set. Please use set_project_path "
-                            "to set a project directory first.")
-            }
-            return json.dumps(structure_data, indent=2)
-
-        # Check if we have index cache with directory tree
-        if (hasattr(self.ctx.request_context.lifespan_context, 'index_cache') and
-                self.ctx.request_context.lifespan_context.index_cache and
-                'directory_tree' in self.ctx.request_context.lifespan_context.index_cache):
-
-            directory_tree = self.ctx.request_context.lifespan_context.index_cache['directory_tree']
-            return json.dumps(directory_tree, indent=2)
-
-        # If no directory tree available, try to build basic structure
-        try:
-            # Use config tool to get basic project structure
-            basic_structure = self._config_tool.get_basic_project_structure(self.helper.base_path)
-            return json.dumps(basic_structure, indent=2)
-        except Exception as e:
-            error_data = {
-                "error": f"Unable to get project structure: {e}",
-                "status": "error"
-            }
-            return json.dumps(error_data, indent=2)
+    # Removed: get_project_structure; the project structure resource is deprecated
diff --git a/src/code_index_mcp/services/search_service.py b/src/code_index_mcp/services/search_service.py
index 7daa3c9..a2c2799 100644
--- a/src/code_index_mcp/services/search_service.py
+++ b/src/code_index_mcp/services/search_service.py
@@ -5,24 +5,20 @@
 and search strategy selection.
 """

-from typing import Dict, Any, Optional
+from pathlib import Path
+from typing import Any, Dict, List, Optional

 from .base_service import BaseService
-from ..utils import ValidationHelper, ResponseFormatter
+from ..utils import FileFilter, ResponseFormatter, ValidationHelper
 from ..search.base import is_safe_regex_pattern


 class SearchService(BaseService):
-    """
-    Service for managing code search operations.
-
-    This service handles:
-    - Code search with various parameters and options
-    - Search tool management and detection
-    - Search strategy selection and optimization
-    - Search capabilities reporting
-    """
+    """Service for managing code search operations."""

+    def __init__(self, ctx):
+        super().__init__(ctx)
+        self.file_filter = self._create_file_filter()

     def search_code(  # pylint: disable=too-many-arguments
         self,
@@ -31,47 +27,24 @@ def search_code(  # pylint: disable=too-many-arguments
         context_lines: int = 0,
         file_pattern: Optional[str] = None,
         fuzzy: bool = False,
-        regex: Optional[bool] = None
+        regex: Optional[bool] = None,
+        max_line_length: Optional[int] = None
     ) -> Dict[str, Any]:
-        """
-        Search for code patterns in the project.
-
-        Handles the logic for search_code_advanced MCP tool.
-
-        Args:
-            pattern: The search pattern
-            case_sensitive: Whether search should be case-sensitive
-            context_lines: Number of context lines to show
-            file_pattern: Glob pattern to filter files
-            fuzzy: Whether to enable fuzzy matching
-            regex: Regex mode - True/False to force, None for auto-detection
-
-        Returns:
-            Dictionary with search results or error information
-
-        Raises:
-            ValueError: If project is not set up or search parameters are invalid
-        """
+        """Search for code patterns in the project."""
         self._require_project_setup()

-        # Smart regex detection if regex parameter is None
         if regex is None:
             regex = is_safe_regex_pattern(pattern)
-            if regex:
-                pass

-        # Validate search pattern
         error = ValidationHelper.validate_search_pattern(pattern, regex)
         if error:
             raise ValueError(error)

-        # Validate file pattern if provided
         if file_pattern:
             error = ValidationHelper.validate_glob_pattern(file_pattern)
             if error:
                 raise ValueError(f"Invalid file pattern: {error}")

-        # Get search strategy from settings
         if not self.settings:
             raise ValueError("Settings not available")

@@ -79,7 +52,7 @@ def search_code(  # pylint: disable=too-many-arguments
         if not strategy:
             raise ValueError("No search strategies available")

-
+        self._configure_strategy(strategy)
         try:
             results = strategy.search(
@@ -89,25 +62,16 @@ def search_code(  # pylint: disable=too-many-arguments
                 context_lines=context_lines,
                 file_pattern=file_pattern,
                 fuzzy=fuzzy,
-                regex=regex
+                regex=regex,
+                max_line_length=max_line_length
             )
-            return ResponseFormatter.search_results_response(results)
-        except Exception as e:
-            raise ValueError(f"Search failed using '{strategy.name}': {e}") from e
-
+            filtered = self._filter_results(results)
+            return ResponseFormatter.search_results_response(filtered)
+        except Exception as exc:
+            raise ValueError(f"Search failed using '{strategy.name}': {exc}") from exc

     def refresh_search_tools(self) -> str:
-        """
-        Refresh the available search tools.
-
-        Handles the logic for refresh_search_tools MCP tool.
-
-        Returns:
-            Success message with available tools information
-
-        Raises:
-            ValueError: If refresh operation fails
-        """
+        """Refresh the available search tools."""
         if not self.settings:
             raise ValueError("Settings not available")
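With the noisy comments gone, the regex behaviour is easy to miss: an explicit `True`/`False` wins, and `None` defers to the `is_safe_regex_pattern` heuristic. The dispatch, isolated below for clarity (the heuristic itself lives in `search/base.py` and is not reproduced here):

```python
from typing import Optional
from code_index_mcp.search.base import is_safe_regex_pattern

def resolve_regex_flag(pattern: str, regex: Optional[bool]) -> bool:
    """Explicit True/False wins; None falls back to auto-detection."""
    if regex is not None:
        return regex
    return is_safe_regex_pattern(pattern)
```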
@@ -118,14 +82,8 @@ def refresh_search_tools(self) -> str:
         preferred = config['preferred_tool']
         return f"Search tools refreshed. Available: {available}. Preferred: {preferred}."

     def get_search_capabilities(self) -> Dict[str, Any]:
-        """
-        Get information about search capabilities and available tools.
-
-        Returns:
-            Dictionary with search tool information and capabilities
-        """
+        """Get information about search capabilities and available tools."""
         if not self.settings:
             return {"error": "Settings not available"}

@@ -142,3 +100,73 @@ def get_search_capabilities(self) -> Dict[str, Any]:
         }

         return capabilities
+
+    def _configure_strategy(self, strategy) -> None:
+        """Apply shared exclusion configuration to the strategy if supported."""
+        configure = getattr(strategy, 'configure_excludes', None)
+        if not configure:
+            return
+
+        try:
+            configure(self.file_filter)
+        except Exception:  # pragma: no cover - defensive fallback
+            pass
+
+    def _create_file_filter(self) -> FileFilter:
+        """Build a shared file filter drawing from project settings."""
+        additional_dirs: List[str] = []
+        additional_file_patterns: List[str] = []
+
+        settings = self.settings
+        if settings:
+            try:
+                config = settings.get_file_watcher_config()
+            except Exception:  # pragma: no cover - fallback if config fails
+                config = {}
+
+            for key in ('exclude_patterns', 'additional_exclude_patterns'):
+                patterns = config.get(key) or []
+                for pattern in patterns:
+                    if not isinstance(pattern, str):
+                        continue
+                    normalized = pattern.strip()
+                    if not normalized:
+                        continue
+                    additional_dirs.append(normalized)
+                    additional_file_patterns.append(normalized)
+
+        file_filter = FileFilter(additional_dirs or None)
+
+        if additional_file_patterns:
+            file_filter.exclude_files.update(additional_file_patterns)
+
+        return file_filter
+
+    def _filter_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
+        """Filter out matches that reside under excluded paths."""
+        if not isinstance(results, dict) or not results:
+            return results
+
+        if 'error' in results or not self.file_filter or not self.base_path:
+            return results
+
+        base_path = Path(self.base_path)
+        filtered: Dict[str, Any] = {}
+
+        for rel_path, matches in results.items():
+            if not isinstance(rel_path, str):
+                continue
+
+            normalized = Path(rel_path.replace('\\', '/'))
+            try:
+                absolute = (base_path / normalized).resolve()
+            except Exception:  # pragma: no cover - invalid path safety
+                continue
+
+            try:
+                if self.file_filter.should_process_path(absolute, base_path):
+                    filtered[rel_path] = matches
+            except Exception:  # pragma: no cover - defensive fallback
+                continue
+
+        return filtered
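`_filter_results` is a second line of defence for strategies that cannot exclude paths natively: every relative result path is resolved under the project base and re-checked against the shared filter. The core of that idea, trimmed of the defensive guards (assuming a filter object exposing `should_process_path(absolute, base)`):

```python
from pathlib import Path
from typing import Any, Dict

def filter_results(results: Dict[str, Any], base: Path, file_filter) -> Dict[str, Any]:
    """Drop result entries whose paths fall under excluded directories."""
    kept: Dict[str, Any] = {}
    for rel_path, matches in results.items():
        # Normalize Windows separators before resolving against the base path.
        absolute = (base / rel_path.replace("\\", "/")).resolve()
        if file_filter.should_process_path(absolute, base):
            kept[rel_path] = matches
    return kept
```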
diff --git a/tests/search/test_search_filters.py b/tests/search/test_search_filters.py
new file mode 100644
index 0000000..787461d
--- /dev/null
+++ b/tests/search/test_search_filters.py
@@ -0,0 +1,52 @@
+"""Tests covering shared search filtering behaviour."""
+import os
+from types import SimpleNamespace
+from unittest.mock import patch
+from pathlib import Path as _TestPath
+import sys
+
+ROOT = _TestPath(__file__).resolve().parents[2]
+SRC_PATH = ROOT / 'src'
+if str(SRC_PATH) not in sys.path:
+    sys.path.insert(0, str(SRC_PATH))
+
+from code_index_mcp.search.basic import BasicSearchStrategy
+from code_index_mcp.search.ripgrep import RipgrepStrategy
+from code_index_mcp.utils.file_filter import FileFilter
+
+
+def test_basic_strategy_skips_excluded_directories(tmp_path):
+    base = tmp_path
+    src_dir = base / "src"
+    src_dir.mkdir()
+    (src_dir / 'app.js').write_text("const db = 'mongo';\n")
+
+    node_modules_dir = base / "node_modules" / "pkg"
+    node_modules_dir.mkdir(parents=True)
+    (node_modules_dir / 'index.js').write_text("// mongo dependency\n")
+
+    strategy = BasicSearchStrategy()
+    strategy.configure_excludes(FileFilter())
+
+    results = strategy.search("mongo", str(base), case_sensitive=False)
+
+    included_path = os.path.join("src", "app.js")
+    excluded_path = os.path.join("node_modules", "pkg", "index.js")
+
+    assert included_path in results
+    assert excluded_path not in results
+
+
+@patch("code_index_mcp.search.ripgrep.subprocess.run")
+def test_ripgrep_strategy_adds_exclude_globs(mock_run, tmp_path):
+    mock_run.return_value = SimpleNamespace(returncode=0, stdout="", stderr="")
+
+    strategy = RipgrepStrategy()
+    strategy.configure_excludes(FileFilter())
+
+    strategy.search("mongo", str(tmp_path))
+
+    cmd = mock_run.call_args[0][0]
+    glob_args = [cmd[i + 1] for i, arg in enumerate(cmd) if arg == '--glob' and i + 1 < len(cmd)]
+
+    assert any(value.startswith('!**/node_modules/') for value in glob_args)
diff --git a/uv.lock b/uv.lock
index 6642d2e..08294cf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -49,7 +49,7 @@ wheels = [

 [[package]]
 name = "code-index-mcp"
-version = "2.1.2"
+version = "2.4.1"
 source = { editable = "." }
 dependencies = [
     { name = "mcp" },
@@ -527,3 +527,4 @@
 { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070 },
 { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067 },
 ]
+
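The new suite can be driven through pytest's Python entry point as well as `uv run pytest`; a minimal invocation targeting just these tests:

```python
import pytest

# Equivalent to: uv run pytest tests/search/test_search_filters.py -q
raise SystemExit(pytest.main(["tests/search/test_search_filters.py", "-q"]))
```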