From 3bfb8cc01251c4dff09001970b9599af83816684 Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Mon, 18 Aug 2025 17:42:00 +0800 Subject: [PATCH 1/8] Refactor SCIPSymbolAnalyzer: Clean up whitespace and improve code readability - Removed unnecessary blank lines throughout the SCIPSymbolAnalyzer class. - Ensured consistent formatting and spacing for better code clarity. - Maintained functionality while enhancing overall code structure. --- .../scip/strategies/fallback_strategy.py | 488 +++--------------- .../tools/scip/scip_symbol_analyzer.py | 478 ++++++++--------- 2 files changed, 310 insertions(+), 656 deletions(-) diff --git a/src/code_index_mcp/scip/strategies/fallback_strategy.py b/src/code_index_mcp/scip/strategies/fallback_strategy.py index 416ebcb..7abb407 100644 --- a/src/code_index_mcp/scip/strategies/fallback_strategy.py +++ b/src/code_index_mcp/scip/strategies/fallback_strategy.py @@ -1,15 +1,12 @@ -"""Fallback SCIP indexing strategy - SCIP standard compliant.""" +"""Simplified fallback SCIP indexing strategy - minimal file information only.""" import logging import os -import re -from typing import List, Optional, Dict, Any, Set +from typing import List, Optional, Dict, Any from pathlib import Path from .base_strategy import SCIPIndexerStrategy, StrategyError from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType from ...constants import SUPPORTED_EXTENSIONS @@ -17,7 +14,7 @@ class FallbackStrategy(SCIPIndexerStrategy): - """SCIP-compliant fallback strategy for files without specific language support.""" + """Simplified SCIP-compliant fallback strategy providing only basic file information.""" def __init__(self, priority: int = 10): """Initialize the fallback strategy with low priority.""" @@ -36,154 +33,88 @@ def is_available(self) -> bool: return True # Always available as fallback def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from text files.""" - logger.debug(f"FallbackStrategy Phase 1: Processing {len(files)} files for symbol collection") + """Phase 1: Simple file counting - no symbol collection.""" + logger.debug(f"FallbackStrategy Phase 1: Processing {len(files)} files for basic cataloging") processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - + + for file_path in files: try: - self._collect_symbols_from_file(file_path, project_path) + relative_path = os.path.relpath(file_path, project_path) + # Just count files, no symbol extraction processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - + logger.debug(f"Registered file: {relative_path}") except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") + logger.warning(f"Phase 1 failed for {file_path}: {e}") continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") + + logger.info(f"Phase 1 summary: {processed_count} files registered") def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" + """Phase 2: Generate minimal SCIP documents with basic file information.""" documents = [] - logger.debug(f"FallbackStrategy Phase 2: Generating documents for {len(files)} files") + logger.debug(f"FallbackStrategy Phase 2: Creating basic documents for {len(files)} files") processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - + + for file_path in files: try: - document = self._analyze_text_file(file_path, project_path, relationships) + document = self._create_basic_document(file_path, project_path) if document: documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 2 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - + except Exception as e: - error_count += 1 - logger.error(f"Phase 2 failed for {relative_path}: {e}") + logger.warning(f"Phase 2 failed for {file_path}: {e}") continue - - logger.info(f"Phase 2 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single text file.""" - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {os.path.relpath(file_path, project_path)}") - return - - # Collect symbols using pattern matching - relative_path = self._get_relative_path(file_path, project_path) - self._collect_symbols_from_text(relative_path, content) - logger.debug(f"Symbol collection - {relative_path}") - - def _analyze_text_file(self, file_path: str, project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> Optional[scip_pb2.Document]: - """Analyze a single text file and generate complete SCIP document.""" - # Read file content - content = self._read_file_content(file_path) - if not content: - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = self._get_relative_path(file_path, project_path) - document.language = self._detect_language_from_extension(Path(file_path).suffix) - # Analyze content and generate occurrences - self.position_calculator = PositionCalculator(content) - occurrences, symbols = self._analyze_text_content_for_document(document.relative_path, content, document.language, relationships) - - # Add results to document - document.occurrences.extend(occurrences) - document.symbols.extend(symbols) - - logger.debug(f"Analyzed text file {document.relative_path}: " - f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document + logger.info(f"Phase 2 summary: {processed_count} basic documents created") + return documents def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build basic relationships using generic patterns. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"FallbackStrategy: Building symbol relationships for {len(files)} files") - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"FallbackStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _extract_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract basic relationships using generic patterns.""" - content = self._read_file_content(file_path) - if not content: - return {} - - relationships = {} - relative_path = self._get_relative_path(file_path, project_path) - - # Generic function call patterns - function_call_pattern = r"(\w+)\s*\(" - function_def_patterns = [ - r"function\s+(\w+)\s*\(", # JavaScript - r"def\s+(\w+)\s*\(", # Python - r"fn\s+(\w+)\s*\(", # Rust/Zig - r"func\s+(\w+)\s*\(", # Go/Swift - ] - - # Basic function definition extraction - for pattern in function_def_patterns: - for match in re.finditer(pattern, content): - function_name = match.group(1) - # Could expand to extract calls within function context - - logger.debug(f"Extracted {len(relationships)} relationships from {relative_path}") - return relationships + """Skip relationship building - return empty dict.""" + logger.debug("FallbackStrategy: Skipping relationship building (minimal mode)") + return {} + + def _create_basic_document(self, file_path: str, project_path: str) -> Optional[scip_pb2.Document]: + """Create a minimal SCIP document with basic file information.""" + try: + # Check if file exists and get basic info + if not os.path.exists(file_path): + return None + + file_stats = os.stat(file_path) + relative_path = os.path.relpath(file_path, project_path) + + # Create basic document + document = scip_pb2.Document() + document.relative_path = relative_path + document.language = self._detect_language_from_extension(Path(file_path).suffix) + + # Add basic file symbol + file_name = Path(file_path).stem + symbol_id = self.symbol_manager.create_local_symbol( + language=document.language, + file_path=relative_path, + symbol_path=[file_name], + descriptor="" + ) + + # Create minimal symbol information + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = file_name + symbol_info.kind = scip_pb2.File + symbol_info.documentation.append( + f"File: {relative_path} ({document.language})" + ) + + document.symbols.append(symbol_info) + + logger.debug(f"Created basic document for: {relative_path}") + return document + + except Exception as e: + logger.warning(f"Failed to create basic document for {file_path}: {e}") + return None def _detect_language_from_extension(self, extension: str) -> str: """Detect specific language from extension.""" @@ -192,6 +123,10 @@ def _detect_language_from_extension(self, extension: str) -> str: '.c': 'c', '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.c++': 'cpp', '.h': 'c', '.hpp': 'cpp', '.hh': 'cpp', '.hxx': 'cpp', + '.js': 'javascript', '.mjs': 'javascript', '.jsx': 'javascript', + '.ts': 'typescript', '.tsx': 'typescript', + '.py': 'python', '.pyi': 'python', '.pyx': 'python', + '.java': 'java', '.go': 'go', '.rs': 'rust', '.rb': 'ruby', @@ -256,284 +191,3 @@ def _detect_language_from_extension(self, extension: str) -> str: } return extension_mapping.get(extension.lower(), 'text') - - # Symbol collection methods (Phase 1) - def _collect_symbols_from_text(self, file_path: str, content: str) -> None: - """Collect symbols from text content using pattern matching.""" - lines = content.split('\n') - - # Determine if this looks like code - if self._is_code_like(content): - self._collect_code_symbols(file_path, lines) - else: - # For non-code files, just create a basic file symbol - self._collect_file_symbol(file_path) - - def _collect_code_symbols(self, file_path: str, lines: List[str]): - """Collect symbols from code-like content.""" - patterns = { - 'function_like': [ - re.compile(r'(?:^|\s)(?:function|def|fn|func)\s+(\w+)', re.IGNORECASE | re.MULTILINE), - re.compile(r'(?:^|\s)(\w+)\s*\([^)]*\)\s*[{:]', re.MULTILINE), # Function definitions - re.compile(r'(?:^|\s)(\w+)\s*:=?\s*function', re.IGNORECASE | re.MULTILINE), # JS functions - ], - 'class_like': [ - re.compile(r'(?:^|\s)(?:class|struct|interface|enum)\s+(\w+)', re.IGNORECASE | re.MULTILINE), - ], - 'constant_like': [ - re.compile(r'(?:^|\s)(?:const|let|var|#define)\s+(\w+)', re.IGNORECASE | re.MULTILINE), - re.compile(r'(?:^|\s)(\w+)\s*[:=]\s*[^=]', re.MULTILINE), # Simple assignments - ], - 'config_like': [ - re.compile(r'^(\w+)\s*[:=]', re.MULTILINE), # Config keys - re.compile(r'^\[(\w+)\]', re.MULTILINE), # INI sections - ] - } - - for line_num, line in enumerate(lines): - line = line.strip() - if not line or line.startswith(('#', '//', '/*', '*', '--', ';')): - continue - - # Look for function-like patterns - for pattern in patterns['function_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and name.isidentifier() and len(name) > 1: - self._register_symbol(name, file_path, "().", "Function-like construct") - - # Look for class-like patterns - for pattern in patterns['class_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and name.isidentifier() and len(name) > 1: - self._register_symbol(name, file_path, "#", "Type definition") - - # Look for constant-like patterns - for pattern in patterns['constant_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and name.isidentifier() and len(name) > 1: - self._register_symbol(name, file_path, "", "Variable or constant") - - # Look for config-like patterns - for pattern in patterns['config_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and len(name) > 1: - self._register_symbol(name, file_path, "", "Configuration key") - - def _collect_file_symbol(self, file_path: str): - """Create a basic file-level symbol for non-code files.""" - file_name = Path(file_path).stem - self._register_symbol(file_name, file_path, "", "File") - - def _register_symbol(self, name: str, file_path: str, descriptor: str, description: str): - """Register a symbol with the reference resolver.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="text", - file_path=file_path, - symbol_path=[name], - descriptor=descriptor - ) - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.UnspecifiedSymbolKind, - display_name=name, - documentation=[description] - ) - - # Document analysis methods (Phase 2) - def _analyze_text_content_for_document(self, file_path: str, content: str, language: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple: - """Analyze text content and generate SCIP data.""" - lines = content.split('\n') - - # Determine if this looks like code - if self._is_code_like(content): - return self._analyze_code_for_document(file_path, lines, language, relationships) - else: - # For non-code files, just create a basic file symbol - return self._analyze_file_for_document(file_path, language) - - def _analyze_code_for_document(self, file_path: str, lines: List[str], language: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple: - """Analyze code patterns and create symbols for document.""" - occurrences = [] - symbols = [] - - patterns = { - 'function_like': [ - re.compile(r'(?:^|\s)(?:function|def|fn|func)\s+(\w+)', re.IGNORECASE | re.MULTILINE), - re.compile(r'(?:^|\s)(\w+)\s*\([^)]*\)\s*[{:]', re.MULTILINE), # Function definitions - re.compile(r'(?:^|\s)(\w+)\s*:=?\s*function', re.IGNORECASE | re.MULTILINE), # JS functions - ], - 'class_like': [ - re.compile(r'(?:^|\s)(?:class|struct|interface|enum)\s+(\w+)', re.IGNORECASE | re.MULTILINE), - ], - 'constant_like': [ - re.compile(r'(?:^|\s)(?:const|let|var|#define)\s+(\w+)', re.IGNORECASE | re.MULTILINE), - re.compile(r'(?:^|\s)(\w+)\s*[:=]\s*[^=]', re.MULTILINE), # Simple assignments - ], - 'config_like': [ - re.compile(r'^(\w+)\s*[:=]', re.MULTILINE), # Config keys - re.compile(r'^[\[(\w+)\]]', re.MULTILINE), # INI sections - ] - } - - for line_num, line in enumerate(lines): - line = line.strip() - if not line or line.startswith(('#', '//', '/*', '*', '--', ';')): - continue - - # Look for function-like patterns - for pattern in patterns['function_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and name.isidentifier() and len(name) > 1: - occ, sym = self._create_symbol_for_document( - line_num, name, file_path, scip_pb2.Function, "().", - f"Function-like construct in {language}", - relationships - ) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - - # Look for class-like patterns - for pattern in patterns['class_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and name.isidentifier() and len(name) > 1: - occ, sym = self._create_symbol_for_document( - line_num, name, file_path, scip_pb2.Class, "#", - f"Type definition in {language}", - relationships - ) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - - # Look for constant-like patterns - for pattern in patterns['constant_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and name.isidentifier() and len(name) > 1: - occ, sym = self._create_symbol_for_document( - line_num, name, file_path, scip_pb2.Variable, "", - f"Variable or constant in {language}", - relationships - ) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - - # Look for config-like patterns - for pattern in patterns['config_like']: - match = pattern.search(line) - if match: - name = match.group(1) - if name and len(name) > 1: - occ, sym = self._create_symbol_for_document( - line_num, name, file_path, scip_pb2.Constant, "", - f"Configuration key in {language}", - relationships - ) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - - return occurrences, symbols - - def _analyze_file_for_document(self, file_path: str, language: str) -> tuple: - """Create a basic file-level symbol for non-code files.""" - file_name = Path(file_path).stem - - symbol_id = self.symbol_manager.create_local_symbol( - language="text", - file_path=file_path, - symbol_path=[file_name], - descriptor="" - ) - - # Create symbol information only (no occurrence for file-level symbols) - symbol_info = self._create_symbol_information( - symbol_id, file_name, scip_pb2.File, f"{language.title()} file" - ) - - return [], [symbol_info] - - def _create_symbol_for_document(self, line_num: int, name: str, file_path: str, - symbol_kind: int, descriptor: str, description: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple: - """Create a symbol with occurrence and information for document.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="text", - file_path=file_path, - symbol_path=[name], - descriptor=descriptor - ) - - # Create definition occurrence - start_col, end_col = self.position_calculator.find_name_in_line(line_num, name) - range_obj = self.position_calculator.line_col_to_range( - line_num, start_col, line_num, end_col - ) - - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Definition, scip_pb2.Identifier - ) - - # Create symbol information - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - symbol_info = self._create_symbol_information( - symbol_id, name, symbol_kind, description, scip_relationships - ) - - return occurrence, symbol_info - - # Utility methods - def _is_code_like(self, content: str) -> bool: - """Determine if the file appears to be code-like.""" - # Check for common code indicators - code_indicators = [ - r'\bfunction\b', r'\bdef\b', r'\bclass\b', r'\binterface\b', - r'\bstruct\b', r'\benum\b', r'\bconst\b', r'\bvar\b', r'\blet\b', - r'[{}();]', r'=\s*function', r'=>', r'\bif\b', r'\bfor\b', r'\bwhile\b' - ] - - code_score = 0 - for pattern in code_indicators: - if re.search(pattern, content, re.IGNORECASE): - code_score += 1 - - # If we find multiple code indicators, treat as code - return code_score >= 3 - - def _create_occurrence(self, symbol_id: str, range_obj: scip_pb2.Range, - symbol_roles: int, syntax_kind: int) -> scip_pb2.Occurrence: - """Create a SCIP occurrence.""" - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = symbol_roles - occurrence.syntax_kind = syntax_kind - occurrence.range.CopyFrom(range_obj) - return occurrence - - def _create_symbol_information(self, symbol_id: str, display_name: str, - symbol_kind: int, description: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = display_name - symbol_info.kind = symbol_kind - symbol_info.documentation.append(description) - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - return symbol_info \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py index 5bd4e31..3743741 100644 --- a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py +++ b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py @@ -33,7 +33,7 @@ class SCIPSymbolAnalyzer: """ Enhanced SCIP symbol analyzer with accurate position detection and call relationships. - + This class replaces the legacy SCIPQueryTool and provides: - Accurate symbol location extraction from SCIP Range data - Proper symbol type classification using SCIP SymbolKind enum @@ -41,24 +41,24 @@ class SCIPSymbolAnalyzer: - Cross-file symbol resolution - LLM-optimized output formatting """ - + def __init__(self): """Initialize the symbol analyzer.""" self._symbol_kind_cache: Dict[int, str] = {} self._scip_symbol_cache: Dict[str, Dict[str, Any]] = {} self._symbol_parser: Optional[SCIPSymbolManager] = None self._relationship_reader = SCIPRelationshipReader() - + # Initialize SCIP symbol kind mapping self._init_symbol_kind_mapping() - + def _init_symbol_kind_mapping(self): """Initialize SCIP SymbolKind enum mapping.""" if not SCIP_PROTO_AVAILABLE: # Fallback numeric mapping when protobuf not available self._symbol_kind_map = { 3: 'class', # CLASS - 11: 'function', # FUNCTION + 11: 'function', # FUNCTION 14: 'method', # METHOD 29: 'variable', # VARIABLE 4: 'constant', # CONSTANT @@ -78,24 +78,24 @@ def _init_symbol_kind_mapping(self): # Use actual protobuf enum when available self._symbol_kind_map = {} # Will be populated dynamically using scip_pb2.SymbolKind.Name() - + def analyze_file(self, file_path: str, scip_index) -> FileAnalysis: """ Main entry point for file analysis. - + Args: file_path: Relative path to the file to analyze scip_index: SCIP index containing all project data - + Returns: FileAnalysis object with complete symbol information - + Raises: ValueError: If file not found or analysis fails """ try: logger.debug(f"Starting analysis for file: {file_path}") - + # Initialize symbol parser from index metadata (for scip-* symbol parsing) try: project_root = getattr(getattr(scip_index, 'metadata', None), 'project_root', '') or '' @@ -109,100 +109,100 @@ def analyze_file(self, file_path: str, scip_index) -> FileAnalysis: if not document: logger.warning(f"Document not found in SCIP index: {file_path}") return self._create_empty_analysis(file_path) - + logger.debug(f"Found document with {len(document.symbols)} symbols") - + # Step 2: Extract all symbols with accurate metadata symbols = self._extract_all_symbols(document) logger.debug(f"Extracted {len(symbols)} symbols") - + # Step 3: Extract call relationships self._extract_call_relationships(document, symbols, scip_index) logger.debug("Completed call relationship extraction") - + # Step 4: Organize results into final structure result = self._organize_results(document, symbols, scip_index) logger.debug(f"Analysis complete: {len(result.functions)} functions, {len(result.classes)} classes") - + return result - + except Exception as e: logger.error(f"Failed to analyze file {file_path}: {e}") # Return partial analysis rather than failing completely return self._create_error_analysis(file_path, str(e)) - + def _find_document(self, file_path: str, scip_index) -> Optional[Any]: """ Find the SCIP document for the given file path. - + Args: file_path: File path to search for scip_index: SCIP index object - + Returns: SCIP document or None if not found """ if not hasattr(scip_index, 'documents'): logger.error("Invalid SCIP index: missing documents attribute") return None - + # Normalize path for comparison normalized_target = self._normalize_path(file_path) - + # Try exact match first for document in scip_index.documents: if self._normalize_path(document.relative_path) == normalized_target: return document - + # Try case-insensitive match normalized_lower = normalized_target.lower() for document in scip_index.documents: if self._normalize_path(document.relative_path).lower() == normalized_lower: logger.debug(f"Found case-insensitive match for {file_path}") return document - + return None - + def _normalize_path(self, path: str) -> str: """Normalize file path for consistent comparison.""" return path.replace('\\', '/').lstrip('./') - + def _extract_all_symbols(self, document) -> Dict[str, SymbolDefinition]: """ Extract all symbols from the document in a single pass. - + Args: document: SCIP document object - + Returns: Dictionary mapping SCIP symbols to SymbolDefinition objects """ symbols = {} - + for symbol_info in document.symbols: try: # Extract basic symbol information scip_symbol = symbol_info.symbol display_name = getattr(symbol_info, 'display_name', '') symbol_kind = getattr(symbol_info, 'kind', 0) - + # Parse symbol name and classification parsed_name, class_name = self._parse_symbol_identity(scip_symbol, display_name) if not parsed_name: continue - + # Get symbol type from SCIP kind symbol_type = self._classify_symbol_type(symbol_kind, scip_symbol) - + # Extract precise location # Extract location (never fails now) location = self._extract_precise_location(scip_symbol, document) - + # Debug: Check location type if not isinstance(location, LocationInfo): logger.error(f"Location extraction returned wrong type: {type(location)} for symbol {scip_symbol}") location = LocationInfo(line=1, column=1) # Fallback - + # Create symbol definition symbol_def = SymbolDefinition( name=parsed_name, @@ -212,27 +212,27 @@ def _extract_all_symbols(self, document) -> Dict[str, SymbolDefinition]: class_name=class_name, scip_symbol=scip_symbol ) - + # Extract additional metadata self._enrich_symbol_metadata(symbol_def, symbol_info, document) - + symbols[scip_symbol] = symbol_def logger.debug(f"Processed symbol: {parsed_name} ({symbol_type}) at {location.line}:{location.column}") - + except Exception as e: logger.warning(f"Failed to process symbol {getattr(symbol_info, 'symbol', 'unknown')}: {e}") continue - + return symbols - + def _parse_symbol_identity(self, scip_symbol: str, display_name: str = '') -> tuple[str, Optional[str]]: """ Parse symbol name and class ownership from SCIP symbol string. - + Args: scip_symbol: SCIP symbol identifier display_name: Display name from symbol info - + Returns: Tuple of (symbol_name, class_name) """ @@ -242,12 +242,12 @@ def _parse_symbol_identity(self, scip_symbol: str, display_name: str = '') -> tu else: # Extract from SCIP symbol name = self._extract_name_from_scip_symbol(scip_symbol) - + # Extract class name if this is a class member class_name = self._extract_class_name(scip_symbol) - + return name, class_name - + @lru_cache(maxsize=500) def _extract_name_from_scip_symbol(self, scip_symbol: str) -> str: """Extract clean, human-readable symbol name from SCIP symbol identifier.""" @@ -255,7 +255,7 @@ def _extract_name_from_scip_symbol(self, scip_symbol: str) -> str: if scip_symbol.startswith('local:'): # local:src.module.Class#method_name(). symbol_path = scip_symbol[6:] # Remove 'local:' prefix - + if '#' in symbol_path: # Method or field: extract after '#' method_part = symbol_path.split('#')[-1] @@ -264,7 +264,7 @@ def _extract_name_from_scip_symbol(self, scip_symbol: str) -> str: # Class or top-level function: extract last part class_part = symbol_path.split('.')[-1] return self._clean_symbol_name(class_part) - + elif scip_symbol.startswith('external:'): # external:module.path/ClassName#method_name(). if '/' in scip_symbol: @@ -278,29 +278,29 @@ def _extract_name_from_scip_symbol(self, scip_symbol: str) -> str: # Just module reference module_part = scip_symbol[9:] # Remove 'external:' return self._clean_symbol_name(module_part.split('.')[-1]) - + # Fallback: clean up whatever we have return self._clean_symbol_name(scip_symbol.split('/')[-1].split('#')[-1]) - + except Exception as e: logger.debug(f"Error extracting name from {scip_symbol}: {e}") return "unknown" - + def _clean_symbol_name(self, raw_name: str) -> str: """Clean symbol name for human readability.""" # Remove common suffixes and prefixes cleaned = raw_name.rstrip('().#') - + # Remove module path prefixes if present if '.' in cleaned: cleaned = cleaned.split('.')[-1] - + # Handle special cases if not cleaned or cleaned.isdigit(): return "unknown" - + return cleaned - + @lru_cache(maxsize=500) def _extract_class_name(self, scip_symbol: str) -> Optional[str]: """Extract clean class name if this symbol belongs to a class. @@ -349,15 +349,15 @@ def _extract_class_name(self, scip_symbol: str) -> Optional[str]: logger.debug(f"Error extracting class name from {scip_symbol}: {e}") return None - + def _classify_symbol_type(self, scip_kind: int, scip_symbol: str) -> str: """ Classify symbol type using SCIP SymbolKind enum. - + Args: scip_kind: SCIP SymbolKind enum value scip_symbol: SCIP symbol string for additional context - + Returns: Standardized symbol type string """ @@ -367,7 +367,7 @@ def _classify_symbol_type(self, scip_kind: int, scip_symbol: str) -> str: else: base_type = self._get_scip_kind_name(scip_kind) self._symbol_kind_cache[scip_kind] = base_type - + # Refine classification based on index symbol structure if base_type == 'function': # Legacy/colon formats use '#' @@ -383,9 +383,9 @@ def _classify_symbol_type(self, scip_kind: int, scip_symbol: str) -> str: last_comp = components[-1] if last_comp.endswith('().') or last_comp.endswith('()'): return 'method' - + return base_type - + def _get_scip_kind_name(self, kind: int) -> str: """Get symbol type name from SCIP SymbolKind.""" if SCIP_PROTO_AVAILABLE: @@ -395,14 +395,14 @@ def _get_scip_kind_name(self, kind: int) -> str: return self._normalize_kind_name(enum_name) except (ValueError, AttributeError): pass - + # Fallback to numeric mapping return self._symbol_kind_map.get(kind, 'unknown') - + def _normalize_kind_name(self, enum_name: str) -> str: """Normalize SCIP enum name to standard type.""" enum_name = enum_name.lower() - + # Map SCIP names to our standard names if enum_name == 'class': return 'class' @@ -420,15 +420,15 @@ def _normalize_kind_name(self, enum_name: str) -> str: return 'property' else: return enum_name - + def _extract_precise_location(self, scip_symbol: str, document) -> LocationInfo: """ Never-fail location extraction with intelligent fallbacks using SCIPSymbolManager. - + Args: scip_symbol: SCIP symbol identifier document: SCIP document containing occurrences - + Returns: LocationInfo with best available location and confidence level """ @@ -437,24 +437,24 @@ def _extract_precise_location(self, scip_symbol: str, document) -> LocationInfo: if location: location.confidence = 'definition' return location - + location = self._find_any_location(scip_symbol, document) if location: - location.confidence = 'occurrence' + location.confidence = 'occurrence' return location - + # Layer 2: SCIPSymbolManager-based symbol structure inference if self._symbol_parser: location = self._infer_location_from_symbol_structure(scip_symbol, document) if location: location.confidence = 'inferred' return location - + # Layer 3: Symbol type-based default location location = self._get_default_location_by_symbol_type(scip_symbol) location.confidence = 'default' return location - + def _find_definition_location(self, scip_symbol: str, document) -> Optional[LocationInfo]: """Find the definition occurrence for a symbol.""" for occurrence in document.occurrences: @@ -463,7 +463,7 @@ def _find_definition_location(self, scip_symbol: str, document) -> Optional[Loca if location: return location return None - + def _find_any_location(self, scip_symbol: str, document) -> Optional[LocationInfo]: """Find any occurrence with location data for a symbol.""" for occurrence in document.occurrences: @@ -472,12 +472,12 @@ def _find_any_location(self, scip_symbol: str, document) -> Optional[LocationInf if location: return location return None - + def _is_definition(self, occurrence) -> bool: """Check if an occurrence represents a definition.""" if not hasattr(occurrence, 'symbol_roles'): return False - + try: if SCIP_PROTO_AVAILABLE: return bool(occurrence.symbol_roles & scip_pb2.SymbolRole.Definition) @@ -486,87 +486,87 @@ def _is_definition(self, occurrence) -> bool: return bool(occurrence.symbol_roles & 1) except (AttributeError, TypeError): return False - + def _parse_occurrence_location(self, occurrence) -> Optional[LocationInfo]: """Parse location information from SCIP occurrence.""" try: if not hasattr(occurrence, 'range') or not occurrence.range: return None - + range_obj = occurrence.range if not hasattr(range_obj, 'start') or not range_obj.start: return None - + start = range_obj.start if len(start) >= 2: # SCIP uses 0-based indexing, convert to 1-based line = start[0] + 1 column = start[1] + 1 return LocationInfo(line=line, column=column) - + except (AttributeError, IndexError, TypeError) as e: logger.debug(f"Failed to parse occurrence location: {e}") - + return None - + def _enrich_symbol_metadata(self, symbol: SymbolDefinition, symbol_info, document): """Enrich symbol with additional metadata from SCIP data.""" # Extract documentation if available if hasattr(symbol_info, 'documentation') and symbol_info.documentation: # Could extract docstrings here if needed pass - + # For functions/methods, extract parameter information if symbol.is_callable(): symbol.parameters = self._extract_function_parameters(symbol.scip_symbol, symbol_info, document) symbol.return_type = self._extract_return_type(symbol.scip_symbol, symbol_info) symbol.is_async = self._is_async_function(symbol.scip_symbol, symbol_info) - + # For classes, extract methods and attributes elif symbol.symbol_type == 'class': symbol.methods, symbol.attributes = self._extract_class_members(symbol.scip_symbol, document) symbol.inherits_from = self._extract_inheritance(symbol.scip_symbol, symbol_info) - + # For variables, extract type and scope information elif symbol.symbol_type == 'variable': symbol.type = self._extract_variable_type(symbol.scip_symbol, symbol_info) symbol.is_global = self._is_global_variable(symbol.scip_symbol, document) - + # For constants, extract value if available elif symbol.symbol_type == 'constant': symbol.value = self._extract_constant_value(symbol.scip_symbol, symbol_info) - + def _extract_call_relationships(self, document, symbols: Dict[str, SymbolDefinition], scip_index): """ Extract all relationships from SCIP document using the new relationship reader. - + Args: document: SCIP document containing symbols and relationships symbols: Dictionary of extracted symbols scip_index: Full SCIP index for cross-file resolution """ logger.debug("Starting relationship extraction using SCIP relationship reader") - + # Use the new relationship reader to extract all relationships all_relationships = self._relationship_reader.extract_relationships_from_document(document) - + # Assign relationships to symbols for symbol_id, symbol_def in symbols.items(): if symbol_id in all_relationships: symbol_def.relationships = all_relationships[symbol_id] logger.debug(f"Assigned {symbol_def.relationships.get_total_count()} relationships to {symbol_def.name}") - + logger.debug(f"Relationship extraction completed for {len(symbols)} symbols") - + def _organize_results(self, document, symbols: Dict[str, SymbolDefinition], scip_index=None) -> FileAnalysis: """ Organize extracted symbols into final FileAnalysis structure. - + Args: document: SCIP document symbols: Extracted symbol definitions scip_index: Full SCIP index for external symbol extraction - + Returns: FileAnalysis with organized results """ @@ -577,28 +577,28 @@ def _organize_results(self, document, symbols: Dict[str, SymbolDefinition], scip line_count=self._estimate_line_count(document), size_bytes=0 # TODO: Could get from filesystem if needed ) - + # Add symbols to appropriate collections for symbol in symbols.values(): result.add_symbol(symbol) - + # Extract import information from occurrences self._extract_imports(document, result.imports) - + # Also extract imports from external symbols (for strategies like Objective-C) if scip_index: self._extract_imports_from_external_symbols(scip_index, result.imports) - + return result - - + + def _estimate_line_count(self, document) -> int: """Estimate line count from document data.""" # Try to get from document text if available if hasattr(document, 'text') and document.text: return len(document.text.splitlines()) - + # Fallback: estimate from occurrence ranges max_line = 0 for occurrence in document.occurrences: @@ -608,34 +608,34 @@ def _estimate_line_count(self, document) -> int: max_line = max(max_line, line) except (AttributeError, IndexError): continue - + return max_line if max_line > 0 else 100 # Default estimate - + def _is_function_call(self, occurrence) -> bool: """ Check if an occurrence represents a function call. - + Based on debug analysis, function calls have roles=0 in our SCIP data, so we need to identify them by other characteristics. - + Args: occurrence: SCIP occurrence object - + Returns: True if this occurrence is a function call """ try: symbol = occurrence.symbol roles = getattr(occurrence, 'symbol_roles', 0) - + # Check if it's a definition (role = 1) - these are NOT calls if roles & 1: return False - - # Check if it's an import (role = 2) - these are NOT calls + + # Check if it's an import (role = 2) - these are NOT calls if roles & 2: return False - + # For roles = 0, check if it looks like a function call by symbol format if roles == 0: # Function calls typically have () in the symbol @@ -649,26 +649,26 @@ def _is_function_call(self, occurrence) -> bool: # Function calls are usually at higher column positions return col > 5 return True - + # Traditional role-based detection as fallback if SCIP_PROTO_AVAILABLE: return bool(roles & (scip_pb2.SymbolRole.Read | scip_pb2.SymbolRole.Reference)) else: # Fallback: Read=8, Reference=4 return bool(roles & (8 | 4)) - + except (AttributeError, TypeError): return False - + def _find_containing_function(self, occurrence, function_symbols: Dict[str, SymbolDefinition], document) -> Optional[SymbolDefinition]: """ Find which function contains the given occurrence. - + Args: occurrence: SCIP occurrence object function_symbols: Map of SCIP symbols to function definitions document: SCIP document - + Returns: SymbolDefinition of the containing function, or None """ @@ -676,11 +676,11 @@ def _find_containing_function(self, occurrence, function_symbols: Dict[str, Symb occurrence_line = self._get_occurrence_line(occurrence) if occurrence_line <= 0: return None - + # Find the function that contains this line best_match = None best_distance = float('inf') - + for scip_symbol, func_def in function_symbols.items(): # Function should start before or at the occurrence line if func_def.line <= occurrence_line: @@ -688,13 +688,13 @@ def _find_containing_function(self, occurrence, function_symbols: Dict[str, Symb if distance < best_distance: best_distance = distance best_match = func_def - + return best_match - + except Exception as e: logger.debug(f"Error finding containing function: {e}") return None - + def _get_occurrence_line(self, occurrence) -> int: """Extract line number from SCIP occurrence.""" try: @@ -704,30 +704,30 @@ def _get_occurrence_line(self, occurrence) -> int: except (AttributeError, IndexError, TypeError): pass return 0 - + def _resolve_call_target(self, target_symbol: str, scip_index, current_document) -> Optional[Dict[str, Any]]: """Use SCIPSymbolManager to resolve call target information. - + Args: target_symbol: SCIP symbol being called scip_index: Full SCIP index for cross-file lookup current_document: Current document for local symbol context - + Returns: Dictionary with call target information or None """ if not self._symbol_parser: return self._fallback_resolve_target(target_symbol, current_document) - + try: # Use SCIPSymbolManager to parse symbol symbol_info = self._symbol_parser.parse_symbol(target_symbol) if not symbol_info: return None - + # Extract clear symbol name from descriptors target_name = self._extract_symbol_name_from_descriptors(symbol_info.descriptors) - + # Handle based on manager type if symbol_info.manager == 'local': # Local call: use existing file path extraction @@ -739,7 +739,7 @@ def _resolve_call_target(self, target_symbol: str, scip_index, current_document) 'file': file_path or current_document.relative_path, 'line': target_line } - + elif symbol_info.manager in ['stdlib', 'pip', 'npm']: # External call: get info from parsed results return { @@ -748,37 +748,37 @@ def _resolve_call_target(self, target_symbol: str, scip_index, current_document) 'package': symbol_info.package, 'module': self._extract_module_from_descriptors(symbol_info.descriptors) } - + return None - + except Exception as e: logger.debug(f"Error resolving call target {target_symbol}: {e}") return None - - + + def _find_symbol_definition(self, target_symbol: str, scip_index) -> tuple[Optional[str], int]: """ Find the definition location of a symbol in the SCIP index. - + Args: target_symbol: SCIP symbol to find scip_index: Full SCIP index - + Returns: Tuple of (file_path, line_number) or (None, 0) if not found """ try: for document in scip_index.documents: for occurrence in document.occurrences: - if (occurrence.symbol == target_symbol and + if (occurrence.symbol == target_symbol and self._is_definition(occurrence)): line = self._get_occurrence_line(occurrence) return document.relative_path, line except Exception as e: logger.debug(f"Error finding symbol definition: {e}") - + return None, 0 - + def _extract_symbol_name_from_descriptors(self, descriptors: str) -> str: """Extract symbol name from SCIP descriptors.""" # utils.py/helper_function() -> helper_function @@ -787,14 +787,14 @@ def _extract_symbol_name_from_descriptors(self, descriptors: str) -> str: symbol_part = descriptors.split('/')[-1] return symbol_part.rstrip('().') return descriptors.rstrip('().') - + def _extract_module_from_descriptors(self, descriptors: str) -> Optional[str]: """Extract module name from descriptors.""" # os/ -> os, pathlib/Path -> pathlib if '/' in descriptors: return descriptors.split('/')[0] return descriptors.strip('/') - + def _fallback_resolve_target(self, target_symbol: str, current_document) -> Optional[Dict[str, Any]]: """Fallback resolution when SCIPSymbolManager is not available.""" try: @@ -802,83 +802,83 @@ def _fallback_resolve_target(self, target_symbol: str, current_document) -> Opti target_name, target_class = self._parse_symbol_identity(target_symbol) if not target_name: return None - + # Basic resolution for legacy formats if target_symbol.startswith('local:'): target_location = self._find_local_symbol_location(target_symbol, current_document) return { 'name': target_name, - 'scope': 'local', + 'scope': 'local', 'file': current_document.relative_path, 'line': target_location } - + return { 'name': target_name, 'scope': 'unknown', 'file': 'unknown', 'line': 0 } - + except Exception as e: logger.debug(f"Fallback resolution failed for {target_symbol}: {e}") return None - + def _find_local_symbol_location(self, target_symbol: str, document) -> int: """Find the line number for a local symbol definition.""" try: for occurrence in document.occurrences: - if (occurrence.symbol == target_symbol and + if (occurrence.symbol == target_symbol and self._is_definition(occurrence)): return self._get_occurrence_line(occurrence) except Exception as e: logger.debug(f"Error finding local symbol location: {e}") return 0 - - + + def _extract_imports(self, document, imports: ImportGroup): """Use SCIPSymbolManager to correctly parse imports.""" if not self._symbol_parser: logger.debug("No symbol parser available, skipping import extraction") return - + try: seen_modules = set() - + # Method 1: Extract from occurrences with Import role (traditional approach) for occurrence in document.occurrences: # Only process Import role symbols if not self._is_import_occurrence(occurrence): continue - + symbol_info = self._symbol_parser.parse_symbol(occurrence.symbol) if not symbol_info: continue - + # Handle based on manager type if symbol_info.manager == 'stdlib': module_name = self._extract_module_from_descriptors(symbol_info.descriptors) if module_name and module_name not in seen_modules: imports.add_import(module_name, 'standard_library') seen_modules.add(module_name) - + elif symbol_info.manager == 'pip': # pip packages: package name is the module name package_name = symbol_info.package if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'third_party') + imports.add_import(package_name, 'third_party') seen_modules.add(package_name) - + elif symbol_info.manager == 'local': # Local imports: extract module path from descriptors module_path = self._extract_local_module_path(symbol_info.descriptors) if module_path and module_path not in seen_modules: imports.add_import(module_path, 'local') seen_modules.add(module_path) - + logger.debug(f"Extracted {len(seen_modules)} unique imports from SCIP occurrences") - + except Exception as e: logger.debug(f"Error extracting imports from occurrences: {e}") @@ -888,13 +888,13 @@ def _extract_imports_from_external_symbols(self, scip_index, imports: ImportGrou if not hasattr(scip_index, 'external_symbols'): logger.debug("No external_symbols in SCIP index") return - + seen_modules = set() - + for symbol_info in scip_index.external_symbols: if not symbol_info.symbol: continue - + # Parse the external symbol parsed_symbol = self._symbol_parser.parse_symbol(symbol_info.symbol) if self._symbol_parser else None if not parsed_symbol: @@ -907,7 +907,7 @@ def _extract_imports_from_external_symbols(self, scip_index, imports: ImportGrou seen_modules.add(framework_name) logger.debug(f"Extracted external dependency: {framework_name} ({import_type})") continue - + # Handle based on manager type if parsed_symbol.manager in ['system', 'unknown']: # For Objective-C system frameworks @@ -915,16 +915,16 @@ def _extract_imports_from_external_symbols(self, scip_index, imports: ImportGrou if package_name and package_name not in seen_modules: imports.add_import(package_name, 'standard_library') seen_modules.add(package_name) - + elif parsed_symbol.manager in ['cocoapods', 'carthage']: # Third-party Objective-C dependencies package_name = parsed_symbol.package if package_name and package_name not in seen_modules: imports.add_import(package_name, 'third_party') seen_modules.add(package_name) - + logger.debug(f"Extracted {len(seen_modules)} unique imports from external symbols") - + except Exception as e: logger.debug(f"Error extracting imports from external symbols: {e}") @@ -952,29 +952,29 @@ def _classify_external_symbol(self, symbol_string: str) -> str: 'Security', 'SystemConfiguration', 'CFNetwork', 'CoreFoundation', 'AppKit', 'Cocoa', 'WebKit', 'JavaScriptCore' } - + for framework in system_frameworks: if framework in symbol_string: return 'standard_library' - + # Check for third-party indicators if any(indicator in symbol_string.lower() for indicator in ['cocoapods', 'carthage', 'pods']): return 'third_party' - + return 'standard_library' # Default for external symbols - + except Exception: return 'standard_library' - + def _parse_external_module(self, external_symbol: str) -> Optional[Dict[str, str]]: """Parse external SCIP symbol to extract module information.""" try: if not external_symbol.startswith('external:'): return None - + # Remove 'external:' prefix and parse path symbol_path = external_symbol[9:] - + # Extract base module path (before '/' or '#') if '/' in symbol_path: module_path = symbol_path.split('/')[0] @@ -982,24 +982,24 @@ def _parse_external_module(self, external_symbol: str) -> Optional[Dict[str, str module_path = symbol_path.split('#')[0] else: module_path = symbol_path - + # Clean up module path module_path = module_path.rstrip('.') if not module_path: return None - + # Categorize the import category = self._categorize_import(module_path) - + return { 'module': module_path, 'category': category } - + except Exception as e: logger.debug(f"Error parsing external module {external_symbol}: {e}") return None - + def _categorize_import(self, module_path: str) -> str: """Categorize import as standard_library, third_party, or local.""" # Standard library modules (common ones) @@ -1014,29 +1014,29 @@ def _categorize_import(self, module_path: str) -> str: 'pprint', 'textwrap', 'string', 'struct', 'codecs', 'unicodedata', 'io', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile' } - + # Local imports (relative imports or project-specific patterns) if module_path.startswith('.'): return 'local' - + # Check for common project patterns if any(pattern in module_path for pattern in ['src.', 'lib.', 'app.', 'project.']): return 'local' - + # Standard library check base_module = module_path.split('.')[0] if base_module in stdlib_modules: return 'standard_library' - + # Everything else is third_party return 'third_party' - - + + def _is_import_occurrence(self, occurrence) -> bool: """Check if occurrence represents an import.""" # Import role = 2 (based on debug results) return hasattr(occurrence, 'symbol_roles') and (occurrence.symbol_roles & 2) - + def _extract_local_module_path(self, descriptors: str) -> Optional[str]: """Extract module path from local descriptors.""" # utils.py/helper_function() -> utils @@ -1047,7 +1047,7 @@ def _extract_local_module_path(self, descriptors: str) -> Optional[str]: return file_part[:-3].replace('/', '.') return file_part.replace('/', '.') return None - + def _extract_class_name_from_descriptors(self, descriptors: str) -> Optional[str]: """Extract class name from descriptors.""" # test_empty_functions.py/TestClass# -> TestClass @@ -1058,12 +1058,12 @@ def _extract_class_name_from_descriptors(self, descriptors: str) -> Optional[str # Remove trailing # if present (class symbols end with #) return class_part.rstrip('#') return None - + def _is_class_member(self, descriptors: str, class_name: str) -> bool: """Check if descriptors belongs to specified class member.""" # test_empty_functions.py/TestClass/method_one() contains TestClass return f"/{class_name}/" in descriptors - + def _extract_member_name(self, descriptors: str, class_name: str) -> Optional[str]: """Extract class member name.""" # test_empty_functions.py/TestClass/method_one() -> method_one @@ -1071,19 +1071,19 @@ def _extract_member_name(self, descriptors: str, class_name: str) -> Optional[st after_class = descriptors.split(f"/{class_name}/", 1)[1] return after_class.rstrip('().') return None - + def _is_method_kind(self, kind: int) -> bool: """Check if SCIP kind represents a method or function.""" method_kinds = {'function', 'method'} kind_name = self._get_scip_kind_name(kind) return kind_name in method_kinds - + def _infer_location_from_symbol_structure(self, scip_symbol: str, document) -> Optional[LocationInfo]: """Infer location based on symbol structure using SCIPSymbolManager.""" symbol_info = self._symbol_parser.parse_symbol(scip_symbol) if not symbol_info: return None - + try: # Strategy 1: If class member, estimate based on class location if '/' in symbol_info.descriptors: @@ -1097,18 +1097,18 @@ def _infer_location_from_symbol_structure(self, scip_symbol: str, document) -> O line=class_location.line + 3, column=class_location.column + 4 ) - + # Strategy 2: Estimate based on file path (if symbol belongs to current file) if symbol_info.manager == 'local': file_path = self._symbol_parser.get_file_path_from_symbol(scip_symbol) if file_path and file_path in document.relative_path: return self._estimate_position_in_file(symbol_info.descriptors, document) - + except Exception as e: logger.debug(f"Symbol location inference failed: {e}") - + return None - + def _find_symbol_location_in_document(self, target_symbol: str, document) -> Optional[LocationInfo]: """Find location of target symbol in document.""" for occurrence in document.occurrences: @@ -1117,7 +1117,7 @@ def _find_symbol_location_in_document(self, target_symbol: str, document) -> Opt if location: return location return None - + def _estimate_position_in_file(self, descriptors: str, document) -> Optional[LocationInfo]: """Estimate position based on descriptors and document structure.""" # Simple heuristic: estimate line based on symbol type @@ -1127,7 +1127,7 @@ def _estimate_position_in_file(self, descriptors: str, document) -> Optional[Loc return LocationInfo(line=max(5, len(document.occurrences) // 2), column=1) else: return LocationInfo(line=1, column=1) - + def _get_default_location_by_symbol_type(self, scip_symbol: str) -> LocationInfo: """Provide reasonable default location based on symbol type.""" symbol_lower = scip_symbol.lower() @@ -1137,7 +1137,7 @@ def _get_default_location_by_symbol_type(self, scip_symbol: str) -> LocationInfo return LocationInfo(line=5, column=1) # Functions usually after imports else: return LocationInfo(line=1, column=1) # Other symbols default position - + def _create_empty_analysis(self, file_path: str) -> FileAnalysis: """Create empty analysis result for missing files.""" return FileAnalysis( @@ -1146,7 +1146,7 @@ def _create_empty_analysis(self, file_path: str) -> FileAnalysis: line_count=0, size_bytes=0 ) - + def _create_error_analysis(self, file_path: str, error_message: str) -> FileAnalysis: """Create error analysis result.""" logger.error(f"Analysis error for {file_path}: {error_message}") @@ -1158,16 +1158,16 @@ def _create_error_analysis(self, file_path: str, error_message: str) -> FileAnal ) # Could add error information to metadata if needed return result - + def _extract_function_parameters(self, scip_symbol: str, symbol_info, document) -> List[str]: """ Extract function parameter names from SCIP data. - + Args: scip_symbol: SCIP symbol identifier symbol_info: SCIP symbol information document: SCIP document containing occurrences - + Returns: List of parameter names """ @@ -1178,18 +1178,18 @@ def _extract_function_parameters(self, scip_symbol: str, symbol_info, document) if doc_line.startswith('Parameters: '): param_str = doc_line[12:] # Remove 'Parameters: ' return [p.strip() for p in param_str.split(',') if p.strip()] - + # Try to extract from symbol information signature if hasattr(symbol_info, 'signature') and symbol_info.signature: return self._parse_signature_parameters(symbol_info.signature) - + # Fallback: try to extract from symbol occurrences and surrounding context return self._extract_parameters_from_occurrences(scip_symbol, document) - + except Exception as e: logger.debug(f"Failed to extract parameters for {scip_symbol}: {e}") return [] - + def _parse_signature_parameters(self, signature: str) -> List[str]: """Parse parameter names from function signature.""" try: @@ -1198,7 +1198,7 @@ def _parse_signature_parameters(self, signature: str) -> List[str]: param_section = signature.split('(')[1].split(')')[0] if not param_section.strip(): return [] - + params = [] for param in param_section.split(','): param = param.strip() @@ -1209,20 +1209,20 @@ def _parse_signature_parameters(self, signature: str) -> List[str]: params.append(param_name) elif param_name == 'self': params.append('self') - + return params - + except Exception as e: logger.debug(f"Error parsing signature parameters: {e}") - + return [] - + def _extract_parameters_from_occurrences(self, scip_symbol: str, document) -> List[str]: """Extract parameters by analyzing symbol occurrences in the document.""" # This is a simplified implementation # A more sophisticated approach would analyze the AST or source code directly return [] - + def _extract_return_type(self, scip_symbol: str, symbol_info) -> Optional[str]: """Extract return type from SCIP data.""" try: @@ -1234,7 +1234,7 @@ def _extract_return_type(self, scip_symbol: str, symbol_info) -> Optional[str]: except Exception as e: logger.debug(f"Error extracting return type for {scip_symbol}: {e}") return None - + def _is_async_function(self, scip_symbol: str, symbol_info) -> bool: """Check if function is async based on SCIP data.""" try: @@ -1243,42 +1243,42 @@ def _is_async_function(self, scip_symbol: str, symbol_info) -> bool: for doc_line in symbol_info.documentation: if doc_line == 'Async function': return True - + # Fallback: check signature if hasattr(symbol_info, 'signature') and symbol_info.signature: return 'async' in symbol_info.signature.lower() except Exception as e: logger.debug(f"Error checking async status for {scip_symbol}: {e}") return False - + def _extract_class_members(self, class_scip_symbol: str, document) -> tuple[List[str], List[str]]: """Use SCIPSymbolManager to parse class members.""" methods = [] attributes = [] - + if not self._symbol_parser: return methods, attributes - + try: # Parse class symbol to get descriptors - class_info = self._symbol_parser.parse_symbol(class_scip_symbol) + class_info = self._symbol_parser.parse_symbol(class_scip_symbol) if not class_info: return methods, attributes - + # Extract class name from descriptors: file.py/ClassName -> ClassName class_name = self._extract_class_name_from_descriptors(class_info.descriptors) if not class_name: return methods, attributes - + # Find all class members by looking for matching descriptors for symbol_info in document.symbols: if not self._symbol_parser: continue - + member_info = self._symbol_parser.parse_symbol(symbol_info.symbol) if not member_info or member_info.manager != 'local': continue - + # Check if this symbol belongs to the class if self._is_class_member(member_info.descriptors, class_name): member_name = self._extract_member_name(member_info.descriptors, class_name) @@ -1288,18 +1288,18 @@ def _extract_class_members(self, class_scip_symbol: str, document) -> tuple[List methods.append(member_name) else: attributes.append(member_name) - + except Exception as e: logger.debug(f"Error extracting class members for {class_scip_symbol}: {e}") - + return methods, attributes - + def _extract_inheritance(self, class_scip_symbol: str, symbol_info) -> List[str]: """Extract class inheritance information from SCIP data.""" # This would require more sophisticated SCIP relationship analysis # For now, return empty list return [] - + def _extract_variable_type(self, scip_symbol: str, symbol_info) -> Optional[str]: """Extract variable type from SCIP data.""" try: @@ -1312,7 +1312,7 @@ def _extract_variable_type(self, scip_symbol: str, symbol_info) -> Optional[str] except Exception as e: logger.debug(f"Error extracting variable type for {scip_symbol}: {e}") return None - + def _is_global_variable(self, scip_symbol: str, document) -> Optional[bool]: """Check if variable is global based on SCIP symbol structure.""" try: @@ -1323,7 +1323,7 @@ def _is_global_variable(self, scip_symbol: str, document) -> Optional[bool]: except Exception as e: logger.debug(f"Error checking global status for {scip_symbol}: {e}") return None - + def _extract_constant_value(self, scip_symbol: str, symbol_info) -> Optional[str]: """Extract constant value from SCIP data.""" try: @@ -1335,77 +1335,77 @@ def _extract_constant_value(self, scip_symbol: str, symbol_info) -> Optional[str except Exception as e: logger.debug(f"Error extracting constant value for {scip_symbol}: {e}") return None - + def extract_scip_relationships(self, file_path: str, scip_index) -> Dict[str, List[tuple]]: """ Extract SCIP relationships from a file using the enhanced analysis pipeline. - + This method provides integration between the symbol analyzer and the new SCIP relationship management system introduced in the implementation plan. - + Args: file_path: Relative path to the file to analyze scip_index: SCIP index containing all project data - + Returns: Dictionary mapping source_symbol_id -> [(target_symbol_id, relationship_type), ...] Compatible with SCIPRelationshipManager input format - + Raises: ValueError: If file analysis fails or file not found """ try: # Perform complete file analysis file_analysis = self.analyze_file(file_path, scip_index) - + # Extract all SCIP relationships using the enhanced data structures relationships = file_analysis.to_scip_relationships(self._symbol_parser) - + logger.debug(f"Extracted SCIP relationships for {file_path}: " f"{len(relationships)} symbols with relationships, " f"{sum(len(rels) for rels in relationships.values())} total relationships") - + return relationships - + except Exception as e: logger.error(f"Failed to extract SCIP relationships from {file_path}: {e}") raise ValueError(f"SCIP relationship extraction failed: {e}") - + def batch_extract_relationships(self, file_paths: List[str], scip_index) -> Dict[str, Dict[str, List[tuple]]]: """ Extract SCIP relationships from multiple files efficiently. - + This method provides batch processing capabilities for the relationship management system, optimizing performance for large codebases. - + Args: file_paths: List of relative file paths to analyze scip_index: SCIP index containing all project data - + Returns: Dictionary mapping file_path -> {source_symbol_id -> [(target_symbol_id, relationship_type), ...]} """ results = {} - + for i, file_path in enumerate(file_paths, 1): try: relationships = self.extract_scip_relationships(file_path, scip_index) results[file_path] = relationships - + if i % 10 == 0 or i == len(file_paths): logger.debug(f"Batch relationship extraction progress: {i}/{len(file_paths)} files") - + except Exception as e: logger.warning(f"Failed to extract relationships from {file_path}: {e}") results[file_path] = {} # Empty result for failed files continue - + total_files = len(results) total_relationships = sum( sum(len(rels) for rels in file_rels.values()) for file_rels in results.values() ) - + logger.info(f"Batch relationship extraction completed: {total_files} files, {total_relationships} total relationships") - + return results \ No newline at end of file From ccea17afacdd59a366c5854876797e87f93ef1ed Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Tue, 19 Aug 2025 10:21:10 +0800 Subject: [PATCH 2/8] feat: Enhance mathematical utilities and add string processing functions - Added a new math.zig file containing various mathematical functions including complex numbers, statistics, and matrix operations. - Introduced utils.zig for string processing utilities, including email validation and string manipulation functions. - Updated main.zig to utilize new utility functions and demonstrate their usage. - Enhanced root.zig with error handling, configuration management, and generic functions. - Added comprehensive tests for all new functionalities to ensure correctness and reliability. --- .../scip/strategies/zig_strategy.py | 688 +++++++++++++++++- .../zig/code-index-example/src/main.zig | 19 + .../zig/code-index-example/src/math.zig | 262 +++++++ .../zig/code-index-example/src/root.zig | 112 +++ .../zig/code-index-example/src/utils.zig | 169 +++++ 5 files changed, 1238 insertions(+), 12 deletions(-) create mode 100644 test/sample-projects/zig/code-index-example/src/math.zig create mode 100644 test/sample-projects/zig/code-index-example/src/utils.zig diff --git a/src/code_index_mcp/scip/strategies/zig_strategy.py b/src/code_index_mcp/scip/strategies/zig_strategy.py index a277923..09ab201 100644 --- a/src/code_index_mcp/scip/strategies/zig_strategy.py +++ b/src/code_index_mcp/scip/strategies/zig_strategy.py @@ -31,6 +31,15 @@ def __init__(self, priority: int = 95): lang = tree_sitter.Language(zig_language()) self.parser = tree_sitter.Parser(lang) self.use_tree_sitter = True + + # Initialize dependency tracking + self.dependencies = { + 'imports': { + 'standard_library': [], + 'third_party': [], + 'local': [] + } + } def can_handle(self, extension: str, file_path: str) -> bool: """Check if this strategy can handle the file type.""" @@ -104,6 +113,9 @@ def _generate_documents_with_references(self, files: List[str], project_path: st def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: """Collect symbol definitions from a single Zig file.""" + # Reset dependencies for this file + self._reset_dependencies() + # Read file content content = self._read_file_content(file_path) if not content: @@ -151,9 +163,7 @@ def _analyze_zig_file(self, file_path: str, project_path: str, relationships: Op raise StrategyError(f"Failed to parse {document.relative_path} with tree-sitter for document analysis") - return document - - def _parse_content(self, content: str) -> Optional: + def _parse_content(self, content: str) -> Optional[tree_sitter.Tree]: """Parse content with tree-sitter parser.""" if not self.parser: return None @@ -225,9 +235,11 @@ def visit_node(node): # Enum declarations elif node_type == 'enum_declaration': self._register_enum_symbol_ts(node, file_path, scope_stack, content) - # Const/var declarations - elif node_type in ['const_declaration', 'var_declaration']: + # Variable declarations (const/var) + elif node_type == 'variable_declaration': self._register_variable_symbol_ts(node, file_path, scope_stack, content) + # Check if it contains an @import call + self._check_for_import_in_variable(node, file_path, scope_stack, content) # Test declarations elif node_type == 'test_declaration': self._register_test_symbol_ts(node, file_path, scope_stack, content) @@ -238,7 +250,7 @@ def visit_node(node): visit_node(tree.root_node) - def _analyze_tree_sitter_for_document(self, tree, file_path: str, content: str) -> tuple: + def _analyze_tree_sitter_for_document(self, tree, file_path: str, content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[List[scip_pb2.Occurrence], List[scip_pb2.SymbolInformation]]: """Analyze Tree-sitter AST to generate SCIP occurrences and symbols.""" occurrences = [] symbols = [] @@ -249,25 +261,28 @@ def visit_node(node): # Process different node types if node_type == 'function_declaration': - occ, sym = self._process_function_ts(node, file_path, scope_stack, content) + occ, sym = self._process_function_ts(node, file_path, scope_stack, content, relationships) if occ: occurrences.append(occ) if sym: symbols.append(sym) elif node_type == 'struct_declaration': - occ, sym = self._process_struct_ts(node, file_path, scope_stack, content) + occ, sym = self._process_struct_ts(node, file_path, scope_stack, content, relationships) if occ: occurrences.append(occ) if sym: symbols.append(sym) elif node_type == 'enum_declaration': - occ, sym = self._process_enum_ts(node, file_path, scope_stack, content) + occ, sym = self._process_enum_ts(node, file_path, scope_stack, content, relationships) if occ: occurrences.append(occ) if sym: symbols.append(sym) - elif node_type in ['const_declaration', 'var_declaration']: - occ, sym = self._process_variable_ts(node, file_path, scope_stack, content) + elif node_type == 'variable_declaration': + occ, sym = self._process_variable_ts(node, file_path, scope_stack, content, relationships) if occ: occurrences.append(occ) if sym: symbols.append(sym) elif node_type == 'test_declaration': - occ, sym = self._process_test_ts(node, file_path, scope_stack, content) + occ, sym = self._process_test_ts(node, file_path, scope_stack, content, relationships) if occ: occurrences.append(occ) if sym: symbols.append(sym) + elif node_type == 'builtin_function_call' and self._is_import_call(node): + # Handle @import() calls + self._handle_import_declaration(node, file_path, scope_stack, content) elif node_type == 'identifier': occ = self._process_identifier_ts(node, file_path, scope_stack, content) if occ: occurrences.append(occ) @@ -307,3 +322,652 @@ def visit_node(node): visit_node(tree.root_node) return relationships + + # Tree-sitter node processing methods (missing implementations) + def _register_function_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Register a function symbol definition.""" + name = self._get_function_name_ts(node, content) + if not name: + return + + symbol_id = self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="()." + ) + + # Create a dummy range for registration + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=scip_pb2.Function, + display_name=name, + documentation=["Zig function"] + ) + + def _register_struct_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Register a struct symbol definition.""" + name = self._get_struct_name_ts(node, content) + if not name: + return + + symbol_id = self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="#" + ) + + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=scip_pb2.Struct, + display_name=name, + documentation=["Zig struct"] + ) + + def _register_enum_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Register an enum symbol definition.""" + name = self._get_enum_name_ts(node, content) + if not name: + return + + symbol_id = self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="#" + ) + + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=scip_pb2.Enum, + display_name=name, + documentation=["Zig enum"] + ) + + def _register_variable_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Register a variable/constant symbol definition.""" + name = self._get_variable_name_ts(node, content) + if not name: + return + + # Determine if it's const or var + is_const = self._is_const_declaration(node) + symbol_kind = scip_pb2.Constant if is_const else scip_pb2.Variable + descriptor = "." + + symbol_id = self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor=descriptor + ) + + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=symbol_kind, + display_name=name, + documentation=["Zig constant" if is_const else "Zig variable"] + ) + + def _register_test_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Register a test symbol definition.""" + name = self._get_test_name_ts(node, content) + if not name: + name = "test" # Default name for unnamed tests + + symbol_id = self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="()." + ) + + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=scip_pb2.Function, + display_name=name, + documentation=["Zig test"] + ) + + # Process methods for document generation + def _process_function_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: + """Process function for document generation.""" + name = self._get_function_name_ts(node, content) + if not name: + return None, None + + symbol_id = self._create_function_symbol_id_ts(name, file_path, scope_stack) + occurrence = self._create_function_occurrence_ts(node, symbol_id) + + symbol_relationships = relationships.get(symbol_id, []) if relationships else [] + scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] + + symbol_info = self._create_function_symbol_info_ts(node, symbol_id, name, scip_relationships) + + return occurrence, symbol_info + + def _process_struct_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: + """Process struct for document generation.""" + name = self._get_struct_name_ts(node, content) + if not name: + return None, None + + symbol_id = self._create_struct_symbol_id_ts(name, file_path, scope_stack) + occurrence = self._create_struct_occurrence_ts(node, symbol_id) + + symbol_relationships = relationships.get(symbol_id, []) if relationships else [] + scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] + + symbol_info = self._create_struct_symbol_info_ts(node, symbol_id, name, scip_relationships) + + return occurrence, symbol_info + + def _process_enum_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: + """Process enum for document generation.""" + name = self._get_enum_name_ts(node, content) + if not name: + return None, None + + symbol_id = self._create_enum_symbol_id_ts(name, file_path, scope_stack) + occurrence = self._create_enum_occurrence_ts(node, symbol_id) + + symbol_relationships = relationships.get(symbol_id, []) if relationships else [] + scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] + + symbol_info = self._create_enum_symbol_info_ts(node, symbol_id, name, scip_relationships) + + return occurrence, symbol_info + + def _process_variable_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: + """Process variable/constant for document generation.""" + name = self._get_variable_name_ts(node, content) + if not name: + return None, None + + symbol_id = self._create_variable_symbol_id_ts(name, file_path, scope_stack, node) + occurrence = self._create_variable_occurrence_ts(node, symbol_id) + + symbol_relationships = relationships.get(symbol_id, []) if relationships else [] + scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] + + symbol_info = self._create_variable_symbol_info_ts(node, symbol_id, name, scip_relationships) + + return occurrence, symbol_info + + def _process_test_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: + """Process test for document generation.""" + name = self._get_test_name_ts(node, content) or "test" + + symbol_id = self._create_test_symbol_id_ts(name, file_path, scope_stack) + occurrence = self._create_test_occurrence_ts(node, symbol_id) + + symbol_relationships = relationships.get(symbol_id, []) if relationships else [] + scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] + + symbol_info = self._create_test_symbol_info_ts(node, symbol_id, name, scip_relationships) + + return occurrence, symbol_info + + def _process_identifier_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> Optional[scip_pb2.Occurrence]: + """Process identifier for references.""" + name = self._get_node_text_ts(node) + if not name: + return None + + # Create a reference occurrence + if not self.position_calculator: + return None + + try: + range_obj = self.position_calculator.tree_sitter_node_to_range(node) + occurrence = scip_pb2.Occurrence() + occurrence.symbol = f"local {name}" # Simple reference + occurrence.symbol_roles = scip_pb2.ReadAccess + occurrence.syntax_kind = scip_pb2.IdentifierLocal + occurrence.range.CopyFrom(range_obj) + return occurrence + except: + return None + + # Helper methods for extracting names from Tree-sitter nodes + def _get_function_name_ts(self, node, content: str) -> Optional[str]: + """Extract function name from function node.""" + for child in node.children: + if child.type == "identifier": + return self._get_node_text_ts(child) + return None + + def _get_struct_name_ts(self, node, content: str) -> Optional[str]: + """Extract struct name from struct node.""" + for child in node.children: + if child.type == "identifier": + return self._get_node_text_ts(child) + return None + + def _get_enum_name_ts(self, node, content: str) -> Optional[str]: + """Extract enum name from enum node.""" + for child in node.children: + if child.type == "identifier": + return self._get_node_text_ts(child) + return None + + def _get_variable_name_ts(self, node, content: str) -> Optional[str]: + """Extract variable name from variable declaration node.""" + for child in node.children: + if child.type == "identifier": + return self._get_node_text_ts(child) + return None + + def _get_test_name_ts(self, node, content: str) -> Optional[str]: + """Extract test name from test node.""" + for child in node.children: + if child.type == "string_literal": + # Test with string name: test "my test" {} + text = self._get_node_text_ts(child) + if text: + return text.strip('"') + elif child.type == "identifier": + # Test with identifier: test my_test {} + return self._get_node_text_ts(child) + return None + + def _get_node_text_ts(self, node) -> Optional[str]: + """Get text content of a Tree-sitter node.""" + if hasattr(node, 'text'): + try: + return node.text.decode('utf-8') + except: + pass + return None + + def _is_const_declaration(self, node) -> bool: + """Check if a declaration is const.""" + return node.type == "const_declaration" + + # Symbol ID creation methods + def _create_function_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: + """Create symbol ID for function.""" + return self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="()." + ) + + def _create_struct_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: + """Create symbol ID for struct.""" + return self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="#" + ) + + def _create_enum_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: + """Create symbol ID for enum.""" + return self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="#" + ) + + def _create_variable_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str], node) -> str: + """Create symbol ID for variable/constant.""" + descriptor = "." + return self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor=descriptor + ) + + def _create_test_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: + """Create symbol ID for test.""" + return self.symbol_manager.create_local_symbol( + language="zig", + file_path=file_path, + symbol_path=scope_stack + [name], + descriptor="()." + ) + + # Occurrence creation methods + def _create_function_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence for function.""" + if not self.position_calculator: + return None + + try: + range_obj = self.position_calculator.tree_sitter_node_to_range(node) + occurrence = scip_pb2.Occurrence() + occurrence.symbol = symbol_id + occurrence.symbol_roles = scip_pb2.Definition + occurrence.syntax_kind = scip_pb2.IdentifierFunctionDefinition + occurrence.range.CopyFrom(range_obj) + return occurrence + except: + return None + + def _create_struct_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence for struct.""" + if not self.position_calculator: + return None + + try: + range_obj = self.position_calculator.tree_sitter_node_to_range(node) + occurrence = scip_pb2.Occurrence() + occurrence.symbol = symbol_id + occurrence.symbol_roles = scip_pb2.Definition + occurrence.syntax_kind = scip_pb2.IdentifierType + occurrence.range.CopyFrom(range_obj) + return occurrence + except: + return None + + def _create_enum_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence for enum.""" + if not self.position_calculator: + return None + + try: + range_obj = self.position_calculator.tree_sitter_node_to_range(node) + occurrence = scip_pb2.Occurrence() + occurrence.symbol = symbol_id + occurrence.symbol_roles = scip_pb2.Definition + occurrence.syntax_kind = scip_pb2.IdentifierType + occurrence.range.CopyFrom(range_obj) + return occurrence + except: + return None + + def _create_variable_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence for variable/constant.""" + if not self.position_calculator: + return None + + try: + range_obj = self.position_calculator.tree_sitter_node_to_range(node) + occurrence = scip_pb2.Occurrence() + occurrence.symbol = symbol_id + occurrence.symbol_roles = scip_pb2.Definition + occurrence.syntax_kind = scip_pb2.IdentifierConstant + occurrence.range.CopyFrom(range_obj) + return occurrence + except: + return None + + def _create_test_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence for test.""" + if not self.position_calculator: + return None + + try: + range_obj = self.position_calculator.tree_sitter_node_to_range(node) + occurrence = scip_pb2.Occurrence() + occurrence.symbol = symbol_id + occurrence.symbol_roles = scip_pb2.Definition + occurrence.syntax_kind = scip_pb2.IdentifierFunctionDefinition + occurrence.range.CopyFrom(range_obj) + return occurrence + except: + return None + + # Symbol information creation methods + def _create_function_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: + """Create SCIP symbol information for function.""" + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = name + symbol_info.kind = scip_pb2.Function + + symbol_info.documentation.append("Zig function") + + if relationships and self.relationship_manager: + self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) + + return symbol_info + + def _create_struct_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: + """Create SCIP symbol information for struct.""" + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = name + symbol_info.kind = scip_pb2.Struct + + symbol_info.documentation.append("Zig struct") + + if relationships and self.relationship_manager: + self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) + + return symbol_info + + def _create_enum_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: + """Create SCIP symbol information for enum.""" + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = name + symbol_info.kind = scip_pb2.Enum + + symbol_info.documentation.append("Zig enum") + + if relationships and self.relationship_manager: + self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) + + return symbol_info + + def _create_variable_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: + """Create SCIP symbol information for variable/constant.""" + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = name + + # Determine if it's const or var + is_const = self._is_const_declaration(node) + symbol_info.kind = scip_pb2.Constant if is_const else scip_pb2.Variable + symbol_info.documentation.append("Zig constant" if is_const else "Zig variable") + + if relationships and self.relationship_manager: + self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) + + return symbol_info + + def _create_test_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: + """Create SCIP symbol information for test.""" + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = name + symbol_info.kind = scip_pb2.Function + + symbol_info.documentation.append("Zig test") + + if relationships and self.relationship_manager: + self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) + + return symbol_info + + def _create_scip_relationships(self, symbol_relationships: List[tuple]) -> List[scip_pb2.Relationship]: + """Convert internal relationships to SCIP relationships.""" + scip_relationships = [] + for target_symbol_id, relationship_type in symbol_relationships: + relationship = scip_pb2.Relationship() + relationship.symbol = target_symbol_id + relationship.is_reference = True + scip_relationships.append(relationship) + return scip_relationships + + # Dependency handling methods (Zig-specific) + def _is_import_call(self, node) -> bool: + """Check if a builtin function call is an @import call.""" + if node.type != "builtin_function_call": + return False + + for child in node.children: + if child.type == "builtin_identifier": + name = self._get_node_text_ts(child) + return name == "@import" + return False + + def _handle_import_declaration(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Handle @import() declarations.""" + import_path = self._extract_import_path_from_node(node) + if not import_path: + return + + # Classify dependency type + dependency_type = self._classify_zig_dependency(import_path) + + # Store dependency + if import_path not in self.dependencies['imports'][dependency_type]: + self.dependencies['imports'][dependency_type].append(import_path) + + # Create SCIP symbol for import + var_name = f"import_{import_path.replace('.', '_').replace('/', '_')}" + local_id = ".".join(scope_stack + [var_name]) if scope_stack else var_name + symbol_id = f"local {local_id}(import)" + + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=scip_pb2.Namespace, + display_name=var_name, + documentation=[f"Zig import from {import_path}"] + ) + + def _extract_import_path_from_node(self, node) -> Optional[str]: + """Extract import path from @import() call.""" + # Look for string in arguments based on actual AST structure + for child in node.children: + if child.type == "arguments": + for arg in child.children: + if arg.type == "string": + # Extract from string_content child + for string_child in arg.children: + if string_child.type == "string_content": + path = self._get_node_text_ts(string_child) + if path: + return path + return None + + def _classify_zig_dependency(self, import_path: str) -> str: + """Classify Zig dependency based on import path.""" + # Zig standard library modules + zig_std_modules = { + 'std', 'builtin', 'root', 'testing', 'math', 'mem', 'fs', 'net', + 'json', 'fmt', 'log', 'crypto', 'hash', 'sort', 'thread', 'atomic', + 'os', 'process', 'time', 'random', 'debug', 'meta', 'ascii', 'unicode' + } + + if import_path in zig_std_modules: + return 'standard_library' + elif import_path.startswith('./') or import_path.startswith('../') or import_path.endswith('.zig'): + return 'local' + else: + return 'third_party' + + def _extract_calls_from_node_ts(self, node, source_symbol_id: str, relationships: Dict, file_path: str, scope_stack: List[str], content: str) -> None: + """Extract function calls from a Tree-sitter node.""" + def visit_for_calls(n): + if n.type == 'call_expression': + # Get the function being called + function_node = n.children[0] if n.children else None + if function_node and function_node.type == 'identifier': + target_name = self._get_node_text_ts(function_node) + if target_name: + target_symbol_id = self._create_function_symbol_id_ts(target_name, file_path, scope_stack) + if source_symbol_id not in relationships: + relationships[source_symbol_id] = [] + relationships[source_symbol_id].append((target_symbol_id, InternalRelationshipType.CALLS)) + + for child in n.children: + visit_for_calls(child) + + visit_for_calls(node) + + def _check_for_import_in_variable(self, node, file_path: str, scope_stack: List[str], content: str) -> None: + """Check if a variable declaration contains an @import call.""" + for child in node.children: + if child.type == 'builtin_function': + # Check if it's @import + builtin_id = None + for grandchild in child.children: + if grandchild.type == 'builtin_identifier': + builtin_id = self._get_node_text_ts(grandchild) + break + + if builtin_id == '@import': + # Extract import path + import_path = self._extract_import_path_from_node(child) + if import_path: + # Classify and store dependency + dependency_type = self._classify_zig_dependency(import_path) + if import_path not in self.dependencies['imports'][dependency_type]: + self.dependencies['imports'][dependency_type].append(import_path) + + # Create SCIP symbol for import + var_name = self._get_variable_name_ts(node, content) + if var_name: + local_id = ".".join(scope_stack + [var_name]) if scope_stack else var_name + symbol_id = f"local {local_id}(import)" + + dummy_range = scip_pb2.Range() + dummy_range.start.extend([0, 0]) + dummy_range.end.extend([0, 1]) + + self.reference_resolver.register_symbol_definition( + symbol_id=symbol_id, + file_path=file_path, + definition_range=dummy_range, + symbol_kind=scip_pb2.Namespace, + display_name=var_name, + documentation=[f"Zig import from {import_path}"] + ) + + def get_dependencies(self) -> Dict[str, Any]: + """Get collected dependencies for MCP response.""" + return self.dependencies + + def _reset_dependencies(self) -> None: + """Reset dependency tracking for new file analysis.""" + self.dependencies = { + 'imports': { + 'standard_library': [], + 'third_party': [], + 'local': [] + } + } diff --git a/test/sample-projects/zig/code-index-example/src/main.zig b/test/sample-projects/zig/code-index-example/src/main.zig index 8a92646..792cfc1 100644 --- a/test/sample-projects/zig/code-index-example/src/main.zig +++ b/test/sample-projects/zig/code-index-example/src/main.zig @@ -1,10 +1,29 @@ const std = @import("std"); +const builtin = @import("builtin"); +const testing = @import("testing"); const code_index_example = @import("code_index_example"); +const utils = @import("./utils.zig"); +const math_utils = @import("./math.zig"); pub fn main() !void { // Prints to stderr, ignoring potential errors. std.debug.print("All your {s} are belong to us.\n", .{"codebase"}); try code_index_example.bufferedPrint(); + + // Test our custom utilities + const result = utils.processData("Hello, World!"); + std.debug.print("Processed result: {s}\n", .{result}); + + // Test math utilities + const sum = math_utils.calculateSum(10, 20); + std.debug.print("Sum: {}\n", .{sum}); + + // Platform-specific code + if (builtin.os.tag == .windows) { + std.debug.print("Running on Windows\n", .{}); + } else { + std.debug.print("Running on Unix-like system\n", .{}); + } } test "simple test" { diff --git a/test/sample-projects/zig/code-index-example/src/math.zig b/test/sample-projects/zig/code-index-example/src/math.zig new file mode 100644 index 0000000..dba7420 --- /dev/null +++ b/test/sample-projects/zig/code-index-example/src/math.zig @@ -0,0 +1,262 @@ +//! Mathematical utility functions and data structures +const std = @import("std"); +const math = @import("math"); +const testing = @import("testing"); + +// Mathematical constants +pub const PI: f64 = 3.14159265358979323846; +pub const E: f64 = 2.71828182845904523536; +pub const GOLDEN_RATIO: f64 = 1.61803398874989484820; + +// Complex number representation +pub const Complex = struct { + real: f64, + imag: f64, + + pub fn init(real: f64, imag: f64) Complex { + return Complex{ .real = real, .imag = imag }; + } + + pub fn add(self: Complex, other: Complex) Complex { + return Complex{ + .real = self.real + other.real, + .imag = self.imag + other.imag, + }; + } + + pub fn multiply(self: Complex, other: Complex) Complex { + return Complex{ + .real = self.real * other.real - self.imag * other.imag, + .imag = self.real * other.imag + self.imag * other.real, + }; + } + + pub fn magnitude(self: Complex) f64 { + return @sqrt(self.real * self.real + self.imag * self.imag); + } + + pub fn conjugate(self: Complex) Complex { + return Complex{ .real = self.real, .imag = -self.imag }; + } +}; + +// Point in 2D space +pub const Point2D = struct { + x: f64, + y: f64, + + pub fn init(x: f64, y: f64) Point2D { + return Point2D{ .x = x, .y = y }; + } + + pub fn distance(self: Point2D, other: Point2D) f64 { + const dx = self.x - other.x; + const dy = self.y - other.y; + return @sqrt(dx * dx + dy * dy); + } + + pub fn midpoint(self: Point2D, other: Point2D) Point2D { + return Point2D{ + .x = (self.x + other.x) / 2.0, + .y = (self.y + other.y) / 2.0, + }; + } +}; + +// Statistics utilities +pub const Statistics = struct { + pub fn mean(values: []const f64) f64 { + if (values.len == 0) return 0.0; + + var sum: f64 = 0.0; + for (values) |value| { + sum += value; + } + + return sum / @as(f64, @floatFromInt(values.len)); + } + + pub fn median(values: []const f64, buffer: []f64) f64 { + if (values.len == 0) return 0.0; + + // Copy to buffer and sort + for (values, 0..) |value, i| { + buffer[i] = value; + } + std.sort.insertionSort(f64, buffer[0..values.len], {}, std.sort.asc(f64)); + + const n = values.len; + if (n % 2 == 1) { + return buffer[n / 2]; + } else { + return (buffer[n / 2 - 1] + buffer[n / 2]) / 2.0; + } + } + + pub fn standardDeviation(values: []const f64) f64 { + if (values.len <= 1) return 0.0; + + const avg = mean(values); + var sum_sq_diff: f64 = 0.0; + + for (values) |value| { + const diff = value - avg; + sum_sq_diff += diff * diff; + } + + return @sqrt(sum_sq_diff / @as(f64, @floatFromInt(values.len - 1))); + } +}; + +// Basic math functions +pub fn factorial(n: u32) u64 { + if (n <= 1) return 1; + return @as(u64, n) * factorial(n - 1); +} + +pub fn fibonacci(n: u32) u64 { + if (n <= 1) return n; + return fibonacci(n - 1) + fibonacci(n - 2); +} + +pub fn gcd(a: u32, b: u32) u32 { + if (b == 0) return a; + return gcd(b, a % b); +} + +pub fn lcm(a: u32, b: u32) u32 { + return (a * b) / gcd(a, b); +} + +pub fn isPrime(n: u32) bool { + if (n < 2) return false; + if (n == 2) return true; + if (n % 2 == 0) return false; + + var i: u32 = 3; + while (i * i <= n) : (i += 2) { + if (n % i == 0) return false; + } + + return true; +} + +// Function used by main.zig +pub fn calculateSum(a: i32, b: i32) i32 { + return a + b; +} + +pub fn power(base: f64, exponent: i32) f64 { + if (exponent == 0) return 1.0; + if (exponent < 0) return 1.0 / power(base, -exponent); + + var result: f64 = 1.0; + var exp = exponent; + var b = base; + + while (exp > 0) { + if (exp % 2 == 1) { + result *= b; + } + b *= b; + exp /= 2; + } + + return result; +} + +// Matrix operations (2x2 for simplicity) +pub const Matrix2x2 = struct { + data: [2][2]f64, + + pub fn init(a: f64, b: f64, c: f64, d: f64) Matrix2x2 { + return Matrix2x2{ + .data = [_][2]f64{ + [_]f64{ a, b }, + [_]f64{ c, d }, + }, + }; + } + + pub fn multiply(self: Matrix2x2, other: Matrix2x2) Matrix2x2 { + return Matrix2x2{ + .data = [_][2]f64{ + [_]f64{ + self.data[0][0] * other.data[0][0] + self.data[0][1] * other.data[1][0], + self.data[0][0] * other.data[0][1] + self.data[0][1] * other.data[1][1], + }, + [_]f64{ + self.data[1][0] * other.data[0][0] + self.data[1][1] * other.data[1][0], + self.data[1][0] * other.data[0][1] + self.data[1][1] * other.data[1][1], + }, + }, + }; + } + + pub fn determinant(self: Matrix2x2) f64 { + return self.data[0][0] * self.data[1][1] - self.data[0][1] * self.data[1][0]; + } +}; + +// Tests +test "complex number operations" { + const z1 = Complex.init(3.0, 4.0); + const z2 = Complex.init(1.0, 2.0); + + const sum = z1.add(z2); + try std.testing.expectEqual(@as(f64, 4.0), sum.real); + try std.testing.expectEqual(@as(f64, 6.0), sum.imag); + + const magnitude = z1.magnitude(); + try std.testing.expectApproxEqAbs(@as(f64, 5.0), magnitude, 0.0001); +} + +test "point distance calculation" { + const p1 = Point2D.init(0.0, 0.0); + const p2 = Point2D.init(3.0, 4.0); + + const dist = p1.distance(p2); + try std.testing.expectApproxEqAbs(@as(f64, 5.0), dist, 0.0001); +} + +test "factorial calculation" { + try std.testing.expectEqual(@as(u64, 1), factorial(0)); + try std.testing.expectEqual(@as(u64, 1), factorial(1)); + try std.testing.expectEqual(@as(u64, 120), factorial(5)); +} + +test "fibonacci sequence" { + try std.testing.expectEqual(@as(u64, 0), fibonacci(0)); + try std.testing.expectEqual(@as(u64, 1), fibonacci(1)); + try std.testing.expectEqual(@as(u64, 13), fibonacci(7)); +} + +test "prime number detection" { + try std.testing.expect(isPrime(2)); + try std.testing.expect(isPrime(17)); + try std.testing.expect(!isPrime(4)); + try std.testing.expect(!isPrime(1)); +} + +test "statistics calculations" { + const values = [_]f64{ 1.0, 2.0, 3.0, 4.0, 5.0 }; + + const avg = Statistics.mean(&values); + try std.testing.expectEqual(@as(f64, 3.0), avg); + + var buffer: [10]f64 = undefined; + const med = Statistics.median(&values, &buffer); + try std.testing.expectEqual(@as(f64, 3.0), med); +} + +test "matrix operations" { + const m1 = Matrix2x2.init(1.0, 2.0, 3.0, 4.0); + const m2 = Matrix2x2.init(5.0, 6.0, 7.0, 8.0); + + const product = m1.multiply(m2); + try std.testing.expectEqual(@as(f64, 19.0), product.data[0][0]); + try std.testing.expectEqual(@as(f64, 22.0), product.data[0][1]); + + const det = m1.determinant(); + try std.testing.expectEqual(@as(f64, -2.0), det); +} \ No newline at end of file diff --git a/test/sample-projects/zig/code-index-example/src/root.zig b/test/sample-projects/zig/code-index-example/src/root.zig index 94c7cd0..1cc95e3 100644 --- a/test/sample-projects/zig/code-index-example/src/root.zig +++ b/test/sample-projects/zig/code-index-example/src/root.zig @@ -1,5 +1,48 @@ //! By convention, root.zig is the root source file when making a library. const std = @import("std"); +const fmt = @import("fmt"); +const mem = @import("mem"); +const json = @import("json"); + +// Define custom types and structures +pub const Config = struct { + name: []const u8, + version: u32, + debug: bool, + + pub fn init(name: []const u8, version: u32) Config { + return Config{ + .name = name, + .version = version, + .debug = false, + }; + } + + pub fn setDebug(self: *Config, debug: bool) void { + self.debug = debug; + } +}; + +pub const ErrorType = enum { + None, + InvalidInput, + OutOfMemory, + NetworkError, + + pub fn toString(self: ErrorType) []const u8 { + return switch (self) { + .None => "No error", + .InvalidInput => "Invalid input", + .OutOfMemory => "Out of memory", + .NetworkError => "Network error", + }; + } +}; + +// Global constants +pub const VERSION: u32 = 1; +pub const MAX_BUFFER_SIZE: usize = 4096; +var global_config: Config = undefined; pub fn bufferedPrint() !void { // Stdout is for the actual output of your application, for example if you @@ -18,6 +61,75 @@ pub fn add(a: i32, b: i32) i32 { return a + b; } +pub fn multiply(a: i32, b: i32) i32 { + return a * b; +} + +pub fn processConfig(config: *const Config) !void { + std.debug.print("Processing config: {s} v{}\n", .{ config.name, config.version }); + if (config.debug) { + std.debug.print("Debug mode enabled\n", .{}); + } +} + +pub fn handleError(err: ErrorType) void { + std.debug.print("Error: {s}\n", .{err.toString()}); +} + +// Advanced function with error handling +pub fn parseNumber(input: []const u8) !i32 { + if (input.len == 0) { + return error.InvalidInput; + } + + return std.fmt.parseInt(i32, input, 10) catch |err| switch (err) { + error.InvalidCharacter => error.InvalidInput, + error.Overflow => error.OutOfMemory, + else => err, + }; +} + +// Generic function +pub fn swap(comptime T: type, a: *T, b: *T) void { + const temp = a.*; + a.* = b.*; + b.* = temp; +} + test "basic add functionality" { try std.testing.expect(add(3, 7) == 10); } + +test "config initialization" { + var config = Config.init("test-app", 1); + try std.testing.expectEqualStrings("test-app", config.name); + try std.testing.expectEqual(@as(u32, 1), config.version); + try std.testing.expectEqual(false, config.debug); + + config.setDebug(true); + try std.testing.expectEqual(true, config.debug); +} + +test "error type handling" { + const err = ErrorType.InvalidInput; + try std.testing.expectEqualStrings("Invalid input", err.toString()); +} + +test "number parsing" { + const result = try parseNumber("42"); + try std.testing.expectEqual(@as(i32, 42), result); + + // Test error case + const invalid_result = parseNumber(""); + try std.testing.expectError(error.InvalidInput, invalid_result); +} + +test "generic swap function" { + var a: i32 = 10; + var b: i32 = 20; + + swap(i32, &a, &b); + + try std.testing.expectEqual(@as(i32, 20), a); + try std.testing.expectEqual(@as(i32, 10), b); +} diff --git a/test/sample-projects/zig/code-index-example/src/utils.zig b/test/sample-projects/zig/code-index-example/src/utils.zig new file mode 100644 index 0000000..eab54ce --- /dev/null +++ b/test/sample-projects/zig/code-index-example/src/utils.zig @@ -0,0 +1,169 @@ +//! Utility functions for string processing and data manipulation +const std = @import("std"); +const mem = @import("mem"); +const ascii = @import("ascii"); + +// Constants for utility functions +pub const DEFAULT_BUFFER_SIZE: usize = 256; +pub const MAX_STRING_LENGTH: usize = 1024; + +// Custom error types +pub const UtilError = error{ + BufferTooSmall, + InvalidString, + ProcessingFailed, +}; + +// String processing utilities +pub const StringProcessor = struct { + buffer: []u8, + allocator: std.mem.Allocator, + + pub fn init(allocator: std.mem.Allocator, buffer_size: usize) !StringProcessor { + const buffer = try allocator.alloc(u8, buffer_size); + return StringProcessor{ + .buffer = buffer, + .allocator = allocator, + }; + } + + pub fn deinit(self: *StringProcessor) void { + self.allocator.free(self.buffer); + } + + pub fn toUpperCase(self: *StringProcessor, input: []const u8) ![]const u8 { + if (input.len > self.buffer.len) { + return UtilError.BufferTooSmall; + } + + for (input, 0..) |char, i| { + self.buffer[i] = std.ascii.toUpper(char); + } + + return self.buffer[0..input.len]; + } + + pub fn reverse(self: *StringProcessor, input: []const u8) ![]const u8 { + if (input.len > self.buffer.len) { + return UtilError.BufferTooSmall; + } + + for (input, 0..) |char, i| { + self.buffer[input.len - 1 - i] = char; + } + + return self.buffer[0..input.len]; + } +}; + +// Data validation functions +pub fn validateEmail(email: []const u8) bool { + if (email.len == 0) return false; + + var has_at = false; + var has_dot = false; + + for (email) |char| { + if (char == '@') { + if (has_at) return false; // Multiple @ symbols + has_at = true; + } else if (char == '.') { + has_dot = true; + } + } + + return has_at and has_dot; +} + +pub fn isValidIdentifier(identifier: []const u8) bool { + if (identifier.len == 0) return false; + + // First character must be letter or underscore + if (!std.ascii.isAlphabetic(identifier[0]) and identifier[0] != '_') { + return false; + } + + // Rest must be alphanumeric or underscore + for (identifier[1..]) |char| { + if (!std.ascii.isAlphanumeric(char) and char != '_') { + return false; + } + } + + return true; +} + +// Simple string processing function used by main.zig +pub fn processData(input: []const u8) []const u8 { + return if (input.len > 0) "Processed!" else "Empty input"; +} + +// Array utilities +pub fn findMax(numbers: []const i32) ?i32 { + if (numbers.len == 0) return null; + + var max = numbers[0]; + for (numbers[1..]) |num| { + if (num > max) { + max = num; + } + } + + return max; +} + +pub fn bubbleSort(numbers: []i32) void { + const n = numbers.len; + if (n <= 1) return; + + var i: usize = 0; + while (i < n - 1) : (i += 1) { + var j: usize = 0; + while (j < n - i - 1) : (j += 1) { + if (numbers[j] > numbers[j + 1]) { + const temp = numbers[j]; + numbers[j] = numbers[j + 1]; + numbers[j + 1] = temp; + } + } + } +} + +// Tests +test "string processor initialization" { + var processor = try StringProcessor.init(std.testing.allocator, 100); + defer processor.deinit(); + + const result = try processor.toUpperCase("hello"); + try std.testing.expectEqualStrings("HELLO", result); +} + +test "email validation" { + try std.testing.expect(validateEmail("test@example.com")); + try std.testing.expect(!validateEmail("invalid-email")); + try std.testing.expect(!validateEmail("")); +} + +test "identifier validation" { + try std.testing.expect(isValidIdentifier("valid_id")); + try std.testing.expect(isValidIdentifier("_private")); + try std.testing.expect(!isValidIdentifier("123invalid")); + try std.testing.expect(!isValidIdentifier("")); +} + +test "find maximum in array" { + const numbers = [_]i32{ 3, 1, 4, 1, 5, 9, 2, 6 }; + const max = findMax(&numbers); + try std.testing.expectEqual(@as(?i32, 9), max); + + const empty: []const i32 = &[_]i32{}; + try std.testing.expectEqual(@as(?i32, null), findMax(empty)); +} + +test "bubble sort" { + var numbers = [_]i32{ 64, 34, 25, 12, 22, 11, 90 }; + bubbleSort(&numbers); + + const expected = [_]i32{ 11, 12, 22, 25, 34, 64, 90 }; + try std.testing.expectEqualSlices(i32, &expected, &numbers); +} \ No newline at end of file From 6d4c392e70dadb3ff7e94f9683c6ce48dfab705a Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Tue, 19 Aug 2025 10:51:46 +0800 Subject: [PATCH 3/8] Refactor Python and Zig strategies for improved symbol management and dependency tracking - Removed the backup Python strategy file to streamline codebase. - Enhanced Zig strategy to register dependencies with the symbol manager, classify imports, and improve logging of dependency counts. - Updated SCIP symbol analyzer to classify Zig imports as standard library, third party, or local based on module names. - Added methods to extract and classify symbol names and dependencies in Zig strategy. --- .../strategies/javascript_strategy_backup.py | 869 ------------------ .../scip/strategies/python_strategy_backup.py | 830 ----------------- .../scip/strategies/zig_strategy.py | 121 ++- .../tools/scip/scip_symbol_analyzer.py | 45 +- 4 files changed, 157 insertions(+), 1708 deletions(-) delete mode 100644 src/code_index_mcp/scip/strategies/javascript_strategy_backup.py delete mode 100644 src/code_index_mcp/scip/strategies/python_strategy_backup.py diff --git a/src/code_index_mcp/scip/strategies/javascript_strategy_backup.py b/src/code_index_mcp/scip/strategies/javascript_strategy_backup.py deleted file mode 100644 index 93c2273..0000000 --- a/src/code_index_mcp/scip/strategies/javascript_strategy_backup.py +++ /dev/null @@ -1,869 +0,0 @@ -"""JavaScript/TypeScript SCIP indexing strategy v2 - SCIP standard compliant.""" - -import logging -import os -from typing import List, Optional, Dict, Any, Set -from pathlib import Path - -try: - import tree_sitter - from tree_sitter_javascript import language as js_language - from tree_sitter_typescript import language_typescript as ts_language - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - - -logger = logging.getLogger(__name__) - - -class JavaScriptStrategy(SCIPIndexerStrategy): - """SCIP-compliant JavaScript/TypeScript indexing strategy using Tree-sitter.""" - - SUPPORTED_EXTENSIONS = {'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'} - - def __init__(self, priority: int = 95): - """Initialize the JavaScript/TypeScript strategy v2.""" - super().__init__(priority) - - if not TREE_SITTER_AVAILABLE: - raise StrategyError("Tree-sitter not available for JavaScript/TypeScript strategy") - - # Initialize parsers - js_lang = tree_sitter.Language(js_language()) - ts_lang = tree_sitter.Language(ts_language()) - - self.js_parser = tree_sitter.Parser(js_lang) - self.ts_parser = tree_sitter.Parser(ts_lang) - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - return extension.lower() in self.SUPPORTED_EXTENSIONS and TREE_SITTER_AVAILABLE - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "javascript" # Use 'javascript' for both JS and TS - - def is_available(self) -> bool: - """Check if this strategy is available.""" - return TREE_SITTER_AVAILABLE - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from JavaScript/TypeScript files.""" - for file_path in files: - try: - self._collect_symbols_from_file(file_path, project_path) - except Exception as e: - logger.warning(f"Failed to collect symbols from {file_path}: {e}") - continue - - def _generate_documents_with_references(self, files: List[str], project_path: str) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" - documents = [] - - for file_path in files: - try: - document = self._analyze_js_file(file_path, project_path) - if document: - documents.append(document) - except Exception as e: - logger.error(f"Failed to analyze JavaScript/TypeScript file {file_path}: {e}") - continue - - return documents - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single JavaScript/TypeScript file.""" - # Read file content - content = self._read_file_content(file_path) - if not content: - return - - # Parse with Tree-sitter - tree = self._parse_content(content, file_path) - if not tree: - return - - # Collect symbols - relative_path = self._get_relative_path(file_path, project_path) - collector = JavaScriptSymbolCollector( - relative_path, content, tree, self.symbol_manager, self.reference_resolver - ) - collector.analyze() - - def _analyze_js_file(self, file_path: str, project_path: str) -> Optional[scip_pb2.Document]: - """Analyze a single JavaScript/TypeScript file and generate complete SCIP document.""" - # Read file content - content = self._read_file_content(file_path) - if not content: - return None - - # Parse with Tree-sitter - tree = self._parse_content(content, file_path) - if not tree: - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = self._get_relative_path(file_path, project_path) - document.language = self._detect_specific_language(Path(file_path).suffix) - - # Analyze AST and generate occurrences - self.position_calculator = PositionCalculator(content) - analyzer = JavaScriptAnalyzer( - document.relative_path, - content, - tree, - document.language, - self.symbol_manager, - self.position_calculator, - self.reference_resolver - ) - analyzer.analyze() - - # Add results to document - document.occurrences.extend(analyzer.occurrences) - document.symbols.extend(analyzer.symbols) - - logger.debug(f"Analyzed JavaScript/TypeScript file {document.relative_path}: " - f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document - - def _parse_content(self, content: str, file_path: str) -> Optional[tree_sitter.Tree]: - """Parse content with appropriate parser.""" - try: - content_bytes = content.encode('utf-8') - - # Choose parser based on file extension - extension = Path(file_path).suffix.lower() - if extension in ['.ts', '.tsx']: - return self.ts_parser.parse(content_bytes) - else: - return self.js_parser.parse(content_bytes) - - except Exception as e: - logger.error(f"Failed to parse content: {e}") - return None - - def _detect_specific_language(self, extension: str) -> str: - """Detect specific language from extension.""" - ext_to_lang = { - '.js': 'javascript', - '.jsx': 'jsx', - '.mjs': 'javascript', - '.cjs': 'javascript', - '.ts': 'typescript', - '.tsx': 'tsx' - } - return ext_to_lang.get(extension.lower(), 'javascript') - - -class JavaScriptSymbolCollector: - """Tree-sitter based symbol collector for JavaScript/TypeScript (Phase 1).""" - - def __init__(self, file_path: str, content: str, tree: tree_sitter.Tree, symbol_manager, reference_resolver): - self.file_path = file_path - self.content = content - self.tree = tree - self.symbol_manager = symbol_manager - self.reference_resolver = reference_resolver - self.scope_stack: List[str] = [] - - def analyze(self): - """Analyze the tree-sitter AST to collect symbols.""" - root = self.tree.root_node - self._analyze_node(root) - - def _analyze_node(self, node: tree_sitter.Node): - """Recursively analyze AST nodes.""" - node_type = node.type - - if node_type == 'function_declaration': - self._register_function_symbol(node) - elif node_type == 'method_definition': - self._register_method_symbol(node) - elif node_type == 'class_declaration': - self._register_class_symbol(node) - elif node_type == 'interface_declaration': - self._register_interface_symbol(node) - elif node_type == 'type_alias_declaration': - self._register_type_alias_symbol(node) - elif node_type == 'variable_declarator': - self._register_variable_symbol(node) - - # Recursively analyze child nodes - for child in node.children: - self._analyze_node(child) - - def _register_function_symbol(self, node: tree_sitter.Node): - """Register a function symbol definition.""" - name_node = self._find_child_by_type(node, 'identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="()." - ) - - self._register_symbol(symbol_id, name, scip_pb2.Function, ["JavaScript function"]) - - def _register_method_symbol(self, node: tree_sitter.Node): - """Register a method symbol definition.""" - name_node = (self._find_child_by_type(node, 'property_identifier') or - self._find_child_by_type(node, 'identifier')) - if name_node: - name = self._get_node_text(name_node) - if name: - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="()." - ) - - self._register_symbol(symbol_id, name, scip_pb2.Method, ["JavaScript method"]) - - def _register_class_symbol(self, node: tree_sitter.Node): - """Register a class symbol definition.""" - name_node = self._find_child_by_type(node, 'identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="#" - ) - - self._register_symbol(symbol_id, name, scip_pb2.Class, ["JavaScript class"]) - - def _register_interface_symbol(self, node: tree_sitter.Node): - """Register a TypeScript interface symbol definition.""" - name_node = self._find_child_by_type(node, 'type_identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="#" - ) - - self._register_symbol(symbol_id, name, scip_pb2.Interface, ["TypeScript interface"]) - - def _register_type_alias_symbol(self, node: tree_sitter.Node): - """Register a TypeScript type alias symbol definition.""" - name_node = self._find_child_by_type(node, 'type_identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="#" - ) - - self._register_symbol(symbol_id, name, scip_pb2.TypeParameter, ["TypeScript type alias"]) - - def _register_variable_symbol(self, node: tree_sitter.Node): - """Register a variable symbol definition.""" - name_node = self._find_child_by_type(node, 'identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="" - ) - - self._register_symbol(symbol_id, name, scip_pb2.Variable, ["JavaScript variable"]) - - def _register_symbol(self, symbol_id: str, name: str, symbol_kind: int, documentation: List[str]): - """Register a symbol with the reference resolver.""" - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=self.file_path, - definition_range=dummy_range, - symbol_kind=symbol_kind, - display_name=name, - documentation=documentation - ) - - def _find_child_by_type(self, node: tree_sitter.Node, node_type: str) -> Optional[tree_sitter.Node]: - """Find first child node of the given type.""" - for child in node.children: - if child.type == node_type: - return child - return None - - def _get_node_text(self, node: tree_sitter.Node) -> str: - """Get text content of a node.""" - return self.content[node.start_byte:node.end_byte] - - -class JavaScriptAnalyzer: - """Tree-sitter based analyzer for JavaScript/TypeScript AST (Phase 2).""" - - def __init__(self, file_path: str, content: str, tree: tree_sitter.Tree, language: str, - symbol_manager, position_calculator, reference_resolver): - self.file_path = file_path - self.content = content - self.tree = tree - self.language = language - self.symbol_manager = symbol_manager - self.position_calculator = position_calculator - self.reference_resolver = reference_resolver - self.scope_stack: List[str] = [] - - # Results - self.occurrences: List[scip_pb2.Occurrence] = [] - self.symbols: List[scip_pb2.SymbolInformation] = [] - - def analyze(self): - """Analyze the tree-sitter AST.""" - root = self.tree.root_node - self._analyze_node(root) - - def _analyze_node(self, node: tree_sitter.Node): - """Recursively analyze AST nodes.""" - node_type = node.type - - if node_type == 'function_declaration': - self._handle_function_declaration(node) - elif node_type == 'method_definition': - self._handle_method_definition(node) - elif node_type == 'class_declaration': - self._handle_class_declaration(node) - elif node_type == 'interface_declaration': - self._handle_interface_declaration(node) - elif node_type == 'type_alias_declaration': - self._handle_type_alias_declaration(node) - elif node_type == 'variable_declarator': - self._handle_variable_declarator(node) - elif node_type == 'identifier': - self._handle_identifier_reference(node) - - # Recursively analyze child nodes - for child in node.children: - self._analyze_node(child) - - def _handle_function_declaration(self, node: tree_sitter.Node): - """Handle function declarations.""" - name_node = self._find_child_by_type(node, 'identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - self._create_function_symbol(node, name_node, name, False) - - def _handle_method_definition(self, node: tree_sitter.Node): - """Handle method definitions.""" - name_node = (self._find_child_by_type(node, 'property_identifier') or - self._find_child_by_type(node, 'identifier')) - if name_node: - name = self._get_node_text(name_node) - if name: - self._create_function_symbol(node, name_node, name, True) - - def _handle_class_declaration(self, node: tree_sitter.Node): - """Handle class declarations.""" - name_node = self._find_child_by_type(node, 'identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - self._create_class_symbol(node, name_node, name, scip_pb2.Class, "JavaScript class") - - # Enter class scope - self.scope_stack.append(name) - - # Analyze class body - class_body = self._find_child_by_type(node, 'class_body') - if class_body: - self._analyze_node(class_body) - - # Exit class scope - self.scope_stack.pop() - - def _handle_interface_declaration(self, node: tree_sitter.Node): - """Handle TypeScript interface declarations.""" - name_node = self._find_child_by_type(node, 'type_identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - self._create_class_symbol(node, name_node, name, scip_pb2.Interface, "TypeScript interface") - - def _handle_type_alias_declaration(self, node: tree_sitter.Node): - """Handle TypeScript type alias declarations.""" - name_node = self._find_child_by_type(node, 'type_identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - self._create_class_symbol(node, name_node, name, scip_pb2.TypeParameter, "TypeScript type alias") - - def _handle_variable_declarator(self, node: tree_sitter.Node): - """Handle variable declarations.""" - name_node = self._find_child_by_type(node, 'identifier') - if name_node: - name = self._get_node_text(name_node) - if name: - self._create_variable_symbol(node, name_node, name) - - def _handle_identifier_reference(self, node: tree_sitter.Node): - """Handle identifier references.""" - # Only handle if it's not part of a declaration - parent = node.parent - if parent and parent.type not in [ - 'function_declaration', 'class_declaration', 'variable_declarator', - 'method_definition', 'interface_declaration', 'type_alias_declaration' - ]: - name = self._get_node_text(node) - if name and len(name) > 1: # Avoid single letters - self._handle_name_reference(node, name) - - def _create_function_symbol(self, node: tree_sitter.Node, name_node: tree_sitter.Node, name: str, is_method: bool): - """Create a function or method symbol.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="()." - ) - - # Create definition occurrence - range_obj = self.position_calculator.tree_sitter_node_to_range(name_node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Definition, scip_pb2.IdentifierFunction - ) - self.occurrences.append(occurrence) - - # Create symbol information - kind = scip_pb2.Method if is_method else scip_pb2.Function - doc_type = "method" if is_method else "function" - documentation = [f"JavaScript {doc_type} in {self.language}"] - - symbol_info = self._create_symbol_information( - symbol_id, name, kind, documentation - ) - self.symbols.append(symbol_info) - - def _create_class_symbol(self, node: tree_sitter.Node, name_node: tree_sitter.Node, - name: str, symbol_kind: int, description: str): - """Create a class, interface, or type symbol.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="#" - ) - - # Create definition occurrence - range_obj = self.position_calculator.tree_sitter_node_to_range(name_node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Definition, scip_pb2.IdentifierType - ) - self.occurrences.append(occurrence) - - # Create symbol information - symbol_info = self._create_symbol_information( - symbol_id, name, symbol_kind, [description] - ) - self.symbols.append(symbol_info) - - def _create_variable_symbol(self, node: tree_sitter.Node, name_node: tree_sitter.Node, name: str): - """Create a variable symbol.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="" - ) - - # Create definition occurrence - range_obj = self.position_calculator.tree_sitter_node_to_range(name_node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Definition, scip_pb2.IdentifierLocal - ) - self.occurrences.append(occurrence) - - # Create symbol information - symbol_info = self._create_symbol_information( - symbol_id, name, scip_pb2.Variable, [f"JavaScript variable in {self.language}"] - ) - self.symbols.append(symbol_info) - - def _handle_name_reference(self, node: tree_sitter.Node, name: str): - """Handle name reference.""" - # Try to resolve the reference - resolved_symbol_id = self.reference_resolver.resolve_reference_by_name( - symbol_name=name, - context_file=self.file_path, - context_scope=self.scope_stack - ) - - if resolved_symbol_id: - # Create reference occurrence - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = self._create_occurrence( - resolved_symbol_id, range_obj, 0, scip_pb2.Identifier # 0 = reference role - ) - self.occurrences.append(occurrence) - - # Register the reference - self.reference_resolver.register_symbol_reference( - symbol_id=resolved_symbol_id, - file_path=self.file_path, - reference_range=range_obj, - context_scope=self.scope_stack - ) - - def _find_child_by_type(self, node: tree_sitter.Node, node_type: str) -> Optional[tree_sitter.Node]: - """Find first child node of the given type.""" - for child in node.children: - if child.type == node_type: - return child - return None - - def _get_node_text(self, node: tree_sitter.Node) -> str: - """Get text content of a node.""" - return self.content[node.start_byte:node.end_byte] - - def _create_occurrence(self, symbol_id: str, range_obj: scip_pb2.Range, - symbol_roles: int, syntax_kind: int) -> scip_pb2.Occurrence: - """Create a SCIP occurrence.""" - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = symbol_roles - occurrence.syntax_kind = syntax_kind - occurrence.range.CopyFrom(range_obj) - return occurrence - - def _create_symbol_information(self, symbol_id: str, display_name: str, - symbol_kind: int, documentation: List[str] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = display_name - symbol_info.kind = symbol_kind - - if documentation: - symbol_info.documentation.extend(documentation) - - return symbol_info - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between JavaScript/TypeScript symbols. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"🔗 JavaScriptStrategy: Building symbol relationships for {len(files)} files") - - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_js_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"✅ JavaScriptStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _extract_js_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """ - Extract relationships from a single JavaScript/TypeScript file. - - Args: - file_path: File to analyze - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - content = self._read_file_content(file_path) - if not content: - return {} - - # Determine language based on file extension - file_ext = Path(file_path).suffix.lower() - is_typescript = file_ext in {'.ts', '.tsx'} - - if TREE_SITTER_AVAILABLE: - return self._extract_tree_sitter_relationships(content, file_path, is_typescript) - else: - return self._extract_regex_relationships(content, file_path) - - def _extract_tree_sitter_relationships(self, content: str, file_path: str, is_typescript: bool) -> Dict[str, List[tuple]]: - """Extract relationships using tree-sitter parser.""" - try: - # Choose appropriate language - language = ts_language() if is_typescript else js_language() - parser = tree_sitter.Parser() - parser.set_language(tree_sitter.Language(language)) - - tree = parser.parse(bytes(content, "utf8")) - - extractor = JSRelationshipExtractor( - file_path=file_path, - content=content, - symbol_manager=self.symbol_manager, - is_typescript=is_typescript - ) - - extractor.extract_from_tree(tree.root_node) - return extractor.get_relationships() - - except Exception as e: - logger.warning(f"Tree-sitter relationship extraction failed for {file_path}: {e}") - return self._extract_regex_relationships(content, file_path) - - def _extract_regex_relationships(self, content: str, file_path: str) -> Dict[str, List[tuple]]: - """Extract relationships using regex patterns (fallback).""" - import re - - relationships = {} - - # Simple regex patterns for basic relationship extraction - # This is a fallback when tree-sitter is not available - - # Class inheritance patterns - class_extends_pattern = r'class\s+(\w+)\s+extends\s+(\w+)' - for match in re.finditer(class_extends_pattern, content): - child_class = match.group(1) - parent_class = match.group(2) - - child_symbol_id = self._generate_symbol_id(file_path, [child_class], "#") - parent_symbol_id = self._generate_symbol_id(file_path, [parent_class], "#") - - if child_symbol_id not in relationships: - relationships[child_symbol_id] = [] - relationships[child_symbol_id].append((parent_symbol_id, InternalRelationshipType.INHERITS)) - - # Function calls patterns (basic) - function_call_pattern = r'(\w+)\s*\(' - current_function = None - - # Simple function definition detection - function_def_pattern = r'function\s+(\w+)\s*\(' - for match in re.finditer(function_def_pattern, content): - current_function = match.group(1) - # Extract calls within this function context (simplified) - - logger.debug(f"Regex extraction found {len(relationships)} relationships in {file_path}") - return relationships - - def _generate_symbol_id(self, file_path: str, symbol_path: List[str], descriptor: str) -> str: - """Generate SCIP symbol ID for a JavaScript symbol.""" - if self.symbol_manager: - return self.symbol_manager.create_local_symbol( - language="javascript", - file_path=file_path, - symbol_path=symbol_path, - descriptor=descriptor - ) - return f"local {'/'.join(symbol_path)}{descriptor}" - - -class JSRelationshipExtractor: - """ - Tree-sitter based relationship extractor for JavaScript/TypeScript. - """ - - def __init__(self, file_path: str, content: str, symbol_manager, is_typescript: bool = False): - self.file_path = file_path - self.content = content - self.symbol_manager = symbol_manager - self.is_typescript = is_typescript - self.relationships = {} - self.current_scope = [] - - def get_relationships(self) -> Dict[str, List[tuple]]: - """Get extracted relationships.""" - return self.relationships - - def _add_relationship(self, source_symbol_id: str, target_symbol_id: str, relationship_type: InternalRelationshipType): - """Add a relationship to the collection.""" - if source_symbol_id not in self.relationships: - self.relationships[source_symbol_id] = [] - self.relationships[source_symbol_id].append((target_symbol_id, relationship_type)) - - def extract_from_tree(self, node): - """Extract relationships from tree-sitter AST.""" - self._visit_node(node) - - def _visit_node(self, node): - """Visit a tree-sitter node recursively.""" - if node.type == "class_declaration": - self._handle_class_declaration(node) - elif node.type == "function_declaration": - self._handle_function_declaration(node) - elif node.type == "method_definition": - self._handle_method_definition(node) - elif node.type == "call_expression": - self._handle_call_expression(node) - elif node.type == "import_statement": - self._handle_import_statement(node) - - # Visit child nodes - for child in node.children: - self._visit_node(child) - - def _handle_class_declaration(self, node): - """Handle class declaration and inheritance.""" - class_name = None - parent_class = None - - for child in node.children: - if child.type == "identifier" and class_name is None: - class_name = self._get_node_text(child) - elif child.type == "class_heritage": - # Find extends clause - for heritage_child in child.children: - if heritage_child.type == "extends_clause": - for extends_child in heritage_child.children: - if extends_child.type == "identifier": - parent_class = self._get_node_text(extends_child) - break - - if class_name and parent_class: - class_symbol_id = self._generate_symbol_id([class_name], "#") - parent_symbol_id = self._generate_symbol_id([parent_class], "#") - self._add_relationship(class_symbol_id, parent_symbol_id, InternalRelationshipType.INHERITS) - - def _handle_function_declaration(self, node): - """Handle function declaration.""" - function_name = None - - for child in node.children: - if child.type == "identifier": - function_name = self._get_node_text(child) - break - - if function_name: - self.current_scope.append(function_name) - # Extract calls within function body - self._extract_function_calls(node, function_name) - self.current_scope.pop() - - def _handle_method_definition(self, node): - """Handle method definition within a class.""" - method_name = None - - for child in node.children: - if child.type == "property_identifier": - method_name = self._get_node_text(child) - break - - if method_name: - full_scope = self.current_scope + [method_name] - self._extract_function_calls(node, method_name) - - def _handle_call_expression(self, node): - """Handle function/method calls.""" - if self.current_scope: - current_function = self.current_scope[-1] - - # Extract called function name - called_function = None - - for child in node.children: - if child.type == "identifier": - called_function = self._get_node_text(child) - break - elif child.type == "member_expression": - # Handle method calls like obj.method() - called_function = self._extract_member_expression(child) - break - - if called_function and current_function: - source_symbol_id = self._generate_symbol_id([current_function], "().") - target_symbol_id = self._generate_symbol_id([called_function], "().") - self._add_relationship(source_symbol_id, target_symbol_id, InternalRelationshipType.CALLS) - - def _handle_import_statement(self, node): - """Handle import statements.""" - # Extract import relationships - imported_module = None - imported_symbols = [] - - for child in node.children: - if child.type == "import_clause": - # Extract imported symbols - pass - elif child.type == "string": - # Extract module path - imported_module = self._get_node_text(child).strip('"\'') - - # Add import relationships if needed - # This could be expanded to track module dependencies - - def _extract_function_calls(self, function_node, function_name: str): - """Extract all function calls within a function.""" - old_scope = self.current_scope.copy() - if function_name not in self.current_scope: - self.current_scope.append(function_name) - - self._visit_calls_in_node(function_node) - - self.current_scope = old_scope - - def _visit_calls_in_node(self, node): - """Visit all call expressions in a node.""" - if node.type == "call_expression": - self._handle_call_expression(node) - - for child in node.children: - self._visit_calls_in_node(child) - - def _extract_member_expression(self, node) -> str: - """Extract full name from member expression (e.g., 'obj.method').""" - parts = [] - - for child in node.children: - if child.type == "identifier": - parts.append(self._get_node_text(child)) - elif child.type == "property_identifier": - parts.append(self._get_node_text(child)) - - return ".".join(parts) if parts else "" - - def _get_node_text(self, node) -> str: - """Get text content of a tree-sitter node.""" - return self.content[node.start_byte:node.end_byte] - - def _generate_symbol_id(self, symbol_path: List[str], descriptor: str) -> str: - """Generate SCIP symbol ID.""" - if self.symbol_manager: - return self.symbol_manager.create_local_symbol( - language="javascript", - file_path=self.file_path, - symbol_path=symbol_path, - descriptor=descriptor - ) - return f"local {'/'.join(symbol_path)}{descriptor}" diff --git a/src/code_index_mcp/scip/strategies/python_strategy_backup.py b/src/code_index_mcp/scip/strategies/python_strategy_backup.py deleted file mode 100644 index f1d0000..0000000 --- a/src/code_index_mcp/scip/strategies/python_strategy_backup.py +++ /dev/null @@ -1,830 +0,0 @@ -"""Python SCIP indexing strategy - SCIP standard compliant.""" - -import ast -import logging -import os -from typing import List, Optional, Dict, Any, Set -from pathlib import Path - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - - -logger = logging.getLogger(__name__) - - -class PythonStrategy(SCIPIndexerStrategy): - """SCIP-compliant Python indexing strategy using AST analysis.""" - - SUPPORTED_EXTENSIONS = {'.py', '.pyw'} - - def __init__(self, priority: int = 90): - """Initialize the Python strategy.""" - super().__init__(priority) - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - return extension.lower() in self.SUPPORTED_EXTENSIONS - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "python" - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from Python files.""" - logger.debug(f"PythonStrategy Phase 1: Processing {len(files)} files for symbol collection") - processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - self._collect_symbols_from_file(file_path, project_path) - processed_count += 1 - - if i % 10 == 0 or i == len(files): # Progress every 10 files or at end - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - - except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") - - def _generate_documents_with_references(self, files: List[str], project_path: str) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" - documents = [] - logger.debug(f"PythonStrategy Phase 2: Generating documents for {len(files)} files") - processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - document = self._analyze_python_file(file_path, project_path) - if document: - documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) - processed_count += 1 - - if i % 10 == 0 or i == len(files): # Progress every 10 files or at end - logger.debug(f"Phase 2 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - - except Exception as e: - error_count += 1 - logger.error(f"Phase 2 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 2 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single Python file.""" - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {os.path.relpath(file_path, project_path)}") - return - - # Parse AST - try: - tree = ast.parse(content, filename=file_path) - except SyntaxError as e: - logger.warning(f"Syntax error in {os.path.relpath(file_path, project_path)}: {e}") - return - - # Collect symbols - relative_path = self._get_relative_path(file_path, project_path) - collector = PythonSymbolCollector( - relative_path, content, self.symbol_manager, self.reference_resolver - ) - collector.visit(tree) - logger.debug(f"Symbol collection - {relative_path}") - - def _analyze_python_file(self, file_path: str, project_path: str) -> Optional[scip_pb2.Document]: - """Analyze a single Python file and generate complete SCIP document.""" - relative_path = self._get_relative_path(file_path, project_path) - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {relative_path}") - return None - - # Parse AST - try: - tree = ast.parse(content, filename=file_path) - except SyntaxError as e: - logger.warning(f"Syntax error in {relative_path}: {e}") - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = relative_path - document.language = self.get_language_name() - - # Analyze AST and generate occurrences - self.position_calculator = PositionCalculator(content) - - analyzer = PythonAnalyzer( - document.relative_path, - content, - self.symbol_manager, - self.position_calculator, - self.reference_resolver - ) - - analyzer.visit(tree) - - # Add results to document - document.occurrences.extend(analyzer.occurrences) - document.symbols.extend(analyzer.symbols) - - logger.debug(f"Document analysis - {relative_path}: " - f"-> {len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document - - -class PythonSymbolCollector(ast.NodeVisitor): - """AST visitor that collects Python symbol definitions (Phase 1).""" - - def __init__(self, file_path: str, content: str, symbol_manager, reference_resolver): - self.file_path = file_path - self.content = content - self.symbol_manager = symbol_manager - self.reference_resolver = reference_resolver - self.scope_stack: List[str] = [] # Track current scope - - def visit_FunctionDef(self, node: ast.FunctionDef): - """Visit function definition.""" - self._register_function_symbol(node, node.name, is_async=False) - - # Enter function scope - self.scope_stack.append(node.name) - self.generic_visit(node) - self.scope_stack.pop() - - def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): - """Visit async function definition.""" - self._register_function_symbol(node, node.name, is_async=True) - - # Enter function scope - self.scope_stack.append(node.name) - self.generic_visit(node) - self.scope_stack.pop() - - def visit_ClassDef(self, node: ast.ClassDef): - """Visit class definition.""" - self._register_class_symbol(node, node.name) - - # Enter class scope - self.scope_stack.append(node.name) - self.generic_visit(node) - self.scope_stack.pop() - - def _register_function_symbol(self, node: ast.AST, name: str, is_async: bool = False): - """Register a function symbol definition.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="()." - ) - - # Create a dummy range for registration (will be calculated properly in Phase 2) - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - documentation = [] - if is_async: - documentation.append("Async function") - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=self.file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Function, - display_name=name, - documentation=documentation - ) - - def _register_class_symbol(self, node: ast.AST, name: str): - """Register a class symbol definition.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="#" - ) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=self.file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Class, - display_name=name, - documentation=["Python class"] - ) - - -class PythonAnalyzer(ast.NodeVisitor): - """AST visitor that generates complete SCIP data (Phase 2).""" - - def __init__(self, file_path: str, content: str, symbol_manager, position_calculator, reference_resolver): - self.file_path = file_path - self.content = content - self.symbol_manager = symbol_manager - self.position_calculator = position_calculator - self.reference_resolver = reference_resolver - self.scope_stack: List[str] = [] - - # Results - self.occurrences: List[scip_pb2.Occurrence] = [] - self.symbols: List[scip_pb2.SymbolInformation] = [] - - def visit_FunctionDef(self, node: ast.FunctionDef): - """Visit function definition.""" - self._handle_function_definition(node, node.name, is_async=False) - - # Enter function scope - self.scope_stack.append(node.name) - self.generic_visit(node) - self.scope_stack.pop() - - def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): - """Visit async function definition.""" - self._handle_function_definition(node, node.name, is_async=True) - - # Enter function scope - self.scope_stack.append(node.name) - self.generic_visit(node) - self.scope_stack.pop() - - def visit_ClassDef(self, node: ast.ClassDef): - """Visit class definition.""" - self._handle_class_definition(node, node.name) - - # Enter class scope - self.scope_stack.append(node.name) - self.generic_visit(node) - self.scope_stack.pop() - - def visit_Import(self, node: ast.Import): - """Visit import statement.""" - for alias in node.names: - self._handle_import(node, alias.name, alias.asname) - self.generic_visit(node) - - def visit_ImportFrom(self, node: ast.ImportFrom): - """Visit from ... import ... statement.""" - module_name = node.module or "" - for alias in node.names: - self._handle_from_import(node, module_name, alias.name, alias.asname) - self.generic_visit(node) - - def visit_Name(self, node: ast.Name): - """Visit name references.""" - if isinstance(node.ctx, ast.Load): - # This is a reference to a name - self._handle_name_reference(node, node.id) - self.generic_visit(node) - - def visit_Attribute(self, node: ast.Attribute): - """Visit attribute access.""" - if isinstance(node.ctx, ast.Load): - self._handle_attribute_reference(node, node.attr) - self.generic_visit(node) - - def _handle_function_definition(self, node: ast.AST, name: str, is_async: bool = False): - """Handle function definition.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="()." - ) - - # Create definition occurrence - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Definition, scip_pb2.IdentifierFunction - ) - self.occurrences.append(occurrence) - - # Create symbol information - documentation = [] - if is_async: - documentation.append("Async function") - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - documentation.append(docstring) - - # Add parameter information - if hasattr(node, 'args') and node.args.args: - params = [arg.arg for arg in node.args.args] - documentation.append(f"Parameters: {', '.join(params)}") - - symbol_info = self._create_symbol_information( - symbol_id, name, scip_pb2.Function, documentation - ) - self.symbols.append(symbol_info) - - def _handle_class_definition(self, node: ast.AST, name: str): - """Handle class definition.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=self.file_path, - symbol_path=self.scope_stack + [name], - descriptor="#" - ) - - # Create definition occurrence - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Definition, scip_pb2.IdentifierType - ) - self.occurrences.append(occurrence) - - # Create symbol information - documentation = ["Python class"] - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - documentation.append(docstring) - - # Add base class information - if hasattr(node, 'bases') and node.bases: - base_names = [] - for base in node.bases: - if isinstance(base, ast.Name): - base_names.append(base.id) - elif isinstance(base, ast.Attribute): - base_names.append(ast.unparse(base)) - if base_names: - documentation.append(f"Inherits from: {', '.join(base_names)}") - - symbol_info = self._create_symbol_information( - symbol_id, name, scip_pb2.Class, documentation - ) - self.symbols.append(symbol_info) - - def _handle_import(self, node: ast.AST, module_name: str, alias_name: Optional[str]): - """Handle import statement with moniker support.""" - display_name = alias_name or module_name - - # Determine if this is a standard library or external package import - if self._is_stdlib_module(module_name): - # Standard library import - symbol_id = self.symbol_manager.create_stdlib_symbol( - language="python", - module_name=module_name, - symbol_name="", - descriptor="" - ) - elif self._is_external_package(module_name): - # External package import using moniker system - symbol_id = self.symbol_manager.create_external_symbol( - language="python", - package_name=self._extract_package_name(module_name), - module_path=self._extract_module_path(module_name), - symbol_name="", - alias=alias_name - ) - else: - # Local project import - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=f"{module_name.replace('.', '/')}.py", - symbol_path=[], - descriptor="" - ) - - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Import, scip_pb2.IdentifierNamespace - ) - self.occurrences.append(occurrence) - - def _handle_from_import(self, node: ast.AST, module_name: str, import_name: str, alias_name: Optional[str]): - """Handle from ... import ... statement with moniker support.""" - display_name = alias_name or import_name - - # Determine if this is a standard library or external package import - if self._is_stdlib_module(module_name): - # Standard library import - symbol_id = self.symbol_manager.create_stdlib_symbol( - language="python", - module_name=module_name, - symbol_name=import_name, - descriptor="" - ) - elif self._is_external_package(module_name): - # External package import using moniker system - symbol_id = self.symbol_manager.create_external_symbol( - language="python", - package_name=self._extract_package_name(module_name), - module_path=self._extract_module_path(module_name), - symbol_name=import_name, - alias=alias_name, - descriptor=self._infer_descriptor_from_name(import_name) - ) - else: - # Local project import - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=f"{module_name.replace('.', '/')}.py", - symbol_path=[import_name], - descriptor=self._infer_descriptor_from_name(import_name) - ) - - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = self._create_occurrence( - symbol_id, range_obj, scip_pb2.Import, scip_pb2.Identifier - ) - self.occurrences.append(occurrence) - - def _handle_name_reference(self, node: ast.AST, name: str): - """Handle name reference with import resolution.""" - # First try to resolve to imported external symbol - imported_symbol_id = self.symbol_manager.resolve_import_reference(name, self.file_path) - - if imported_symbol_id: - # This is a reference to an imported symbol - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = self._create_occurrence( - imported_symbol_id, range_obj, 0, scip_pb2.Identifier # 0 = reference role - ) - self.occurrences.append(occurrence) - return - - # Try to resolve local reference - resolved_symbol_id = self.reference_resolver.resolve_reference_by_name( - symbol_name=name, - context_file=self.file_path, - context_scope=self.scope_stack - ) - - if resolved_symbol_id: - # Create reference occurrence - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = self._create_occurrence( - resolved_symbol_id, range_obj, 0, scip_pb2.Identifier # 0 = reference role - ) - self.occurrences.append(occurrence) - - # Register the reference - self.reference_resolver.register_symbol_reference( - symbol_id=resolved_symbol_id, - file_path=self.file_path, - reference_range=range_obj, - context_scope=self.scope_stack - ) - - def _handle_attribute_reference(self, node: ast.AST, attr_name: str): - """Handle attribute reference.""" - # For now, create a simple local reference - # In a full implementation, this would resolve through the object type - range_obj = self.position_calculator.ast_node_to_range(node) - - # Try to create a local symbol for the attribute - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=self.file_path, - symbol_path=self.scope_stack + [attr_name], - descriptor="" - ) - - occurrence = self._create_occurrence( - symbol_id, range_obj, 0, scip_pb2.Identifier - ) - self.occurrences.append(occurrence) - - def _create_occurrence(self, symbol_id: str, range_obj: scip_pb2.Range, - symbol_roles: int, syntax_kind: int) -> scip_pb2.Occurrence: - """Create a SCIP occurrence.""" - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = symbol_roles - occurrence.syntax_kind = syntax_kind - occurrence.range.CopyFrom(range_obj) - return occurrence - - def _create_symbol_information(self, symbol_id: str, display_name: str, - symbol_kind: int, documentation: List[str] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = display_name - symbol_info.kind = symbol_kind - - if documentation: - symbol_info.documentation.extend(documentation) - - return symbol_info - - def _is_stdlib_module(self, module_name: str) -> bool: - """Check if module is part of Python standard library.""" - # Standard library modules (partial list - could be expanded) - stdlib_modules = { - 'os', 'sys', 'json', 'datetime', 'collections', 'itertools', - 'functools', 'typing', 're', 'math', 'random', 'pathlib', - 'urllib', 'http', 'email', 'csv', 'xml', 'html', 'sqlite3', - 'threading', 'asyncio', 'multiprocessing', 'subprocess', - 'unittest', 'logging', 'configparser', 'argparse', 'io', - 'shutil', 'glob', 'tempfile', 'zipfile', 'tarfile', - 'pickle', 'base64', 'hashlib', 'hmac', 'secrets', 'uuid', - 'time', 'calendar', 'zoneinfo', 'locale', 'gettext', - 'decimal', 'fractions', 'statistics', 'cmath', 'bisect', - 'heapq', 'queue', 'weakref', 'copy', 'pprint', 'reprlib', - 'enum', 'dataclasses', 'contextlib', 'abc', 'atexit', - 'traceback', 'gc', 'inspect', 'site', 'warnings', 'keyword', - 'builtins', '__future__', 'imp', 'importlib', 'pkgutil', - 'modulefinder', 'runpy', 'ast', 'dis', 'pickletools' - } - - # Get the root module name (e.g., 'os.path' -> 'os') - root_module = module_name.split('.')[0] - return root_module in stdlib_modules - - def _is_external_package(self, module_name: str) -> bool: - """Check if module is from an external package (not stdlib, not local).""" - # If it's stdlib, it's not external - if self._is_stdlib_module(module_name): - return False - - # Check if it starts with known external package patterns - # (This could be enhanced with actual dependency parsing) - external_patterns = [ - 'numpy', 'pandas', 'scipy', 'matplotlib', 'seaborn', - 'sklearn', 'torch', 'tensorflow', 'keras', 'cv2', - 'requests', 'urllib3', 'httpx', 'aiohttp', - 'flask', 'django', 'fastapi', 'starlette', - 'sqlalchemy', 'psycopg2', 'pymongo', 'redis', - 'pytest', 'unittest2', 'mock', 'nose', - 'click', 'typer', 'argparse', 'fire', - 'pyyaml', 'toml', 'configparser', 'python-dotenv', - 'pillow', 'imageio', 'opencv', 'scikit', - 'beautifulsoup4', 'lxml', 'scrapy', - 'celery', 'rq', 'dramatiq', - 'pydantic', 'marshmallow', 'cerberus', - 'cryptography', 'bcrypt', 'passlib' - ] - - root_module = module_name.split('.')[0] - return any(root_module.startswith(pattern) for pattern in external_patterns) - - def _extract_package_name(self, module_name: str) -> str: - """Extract package name from module path.""" - # For most packages, the root module is the package name - root_module = module_name.split('.')[0] - - # Handle special cases where module name differs from package name - package_mapping = { - 'cv2': 'opencv-python', - 'sklearn': 'scikit-learn', - 'PIL': 'Pillow', - 'bs4': 'beautifulsoup4', - 'yaml': 'PyYAML', - } - - return package_mapping.get(root_module, root_module) - - def _extract_module_path(self, module_name: str) -> str: - """Extract module path within package.""" - parts = module_name.split('.') - if len(parts) > 1: - # Return submodule path (everything after package name) - return '/'.join(parts[1:]) - return "" - - def _infer_descriptor_from_name(self, name: str) -> str: - """Infer SCIP descriptor from symbol name.""" - # Simple heuristics for Python symbols - if name.isupper(): # Constants like MAX_SIZE - return "." - elif name.istitle(): # Classes like MyClass - return "#" - elif name.endswith('Error') or name.endswith('Exception'): # Exception classes - return "#" - else: # Functions, variables, etc. - return "()." if name.islower() else "." - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between Python symbols. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"PythonStrategy: Building symbol relationships for {len(files)} files") - - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"PythonStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _extract_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """ - Extract relationships from a single Python file. - - Args: - file_path: File to analyze - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - content = self._read_file_content(file_path) - if not content: - return {} - - try: - tree = ast.parse(content) - except SyntaxError as e: - logger.warning(f"Syntax error in {file_path}: {e}") - return {} - - extractor = PythonRelationshipExtractor( - file_path=file_path, - project_path=project_path, - symbol_manager=self.symbol_manager - ) - - extractor.visit(tree) - return extractor.get_relationships() - - -class PythonRelationshipExtractor(ast.NodeVisitor): - """ - AST visitor for extracting Python symbol relationships. - """ - - def __init__(self, file_path: str, project_path: str, symbol_manager): - self.file_path = file_path - self.project_path = project_path - self.symbol_manager = symbol_manager - self.relationships = {} - self.current_scope = [] - self.current_class = None - self.current_function = None - - def get_relationships(self) -> Dict[str, List[tuple]]: - """Get extracted relationships.""" - return self.relationships - - def _add_relationship(self, source_symbol_id: str, target_symbol_id: str, relationship_type: InternalRelationshipType): - """Add a relationship to the collection.""" - if source_symbol_id not in self.relationships: - self.relationships[source_symbol_id] = [] - self.relationships[source_symbol_id].append((target_symbol_id, relationship_type)) - - def visit_ClassDef(self, node: ast.ClassDef): - """Visit class definition and extract inheritance relationships.""" - old_class = self.current_class - self.current_class = node.name - self.current_scope.append(node.name) - - # Generate class symbol ID - class_symbol_id = self._generate_symbol_id(self.current_scope, "#") - - # Extract inheritance relationships - for base in node.bases: - if isinstance(base, ast.Name): - # Direct inheritance: class Child(Parent) - parent_symbol_id = self._generate_symbol_id([base.id], "#") - self._add_relationship( - class_symbol_id, - parent_symbol_id, - InternalRelationshipType.INHERITS - ) - elif isinstance(base, ast.Attribute): - # Module-qualified inheritance: class Child(module.Parent) - parent_name = self._extract_attribute_name(base) - parent_symbol_id = self._generate_symbol_id([parent_name], "#") - self._add_relationship( - class_symbol_id, - parent_symbol_id, - InternalRelationshipType.INHERITS - ) - - # Visit class body - self.generic_visit(node) - - self.current_scope.pop() - self.current_class = old_class - - def visit_FunctionDef(self, node: ast.FunctionDef): - """Visit function definition and extract call relationships.""" - old_function = self.current_function - self.current_function = node.name - self.current_scope.append(node.name) - - # Generate function symbol ID - function_symbol_id = self._generate_symbol_id(self.current_scope, "().") - - # Extract function calls from body - call_extractor = FunctionCallExtractor(function_symbol_id, self) - for stmt in node.body: - call_extractor.visit(stmt) - - # Visit function body - self.generic_visit(node) - - self.current_scope.pop() - self.current_function = old_function - - def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): - """Visit async function definition.""" - # Treat async functions the same as regular functions - self.visit_FunctionDef(node) - - def _generate_symbol_id(self, symbol_path: List[str], descriptor: str) -> str: - """Generate SCIP symbol ID for a symbol.""" - if self.symbol_manager: - return self.symbol_manager.create_local_symbol( - language="python", - file_path=self.file_path, - symbol_path=symbol_path, - descriptor=descriptor - ) - return f"local {'/'.join(symbol_path)}{descriptor}" - - def _extract_attribute_name(self, node: ast.Attribute) -> str: - """Extract full name from attribute node (e.g., 'module.Class').""" - if isinstance(node.value, ast.Name): - return f"{node.value.id}.{node.attr}" - elif isinstance(node.value, ast.Attribute): - return f"{self._extract_attribute_name(node.value)}.{node.attr}" - return node.attr - - -class FunctionCallExtractor(ast.NodeVisitor): - """ - Specialized visitor for extracting function calls within a function. - """ - - def __init__(self, source_function_id: str, parent_extractor): - self.source_function_id = source_function_id - self.parent_extractor = parent_extractor - - def visit_Call(self, node: ast.Call): - """Visit function call and extract relationship.""" - target_name = None - - if isinstance(node.func, ast.Name): - # Simple function call: func() - target_name = node.func.id - elif isinstance(node.func, ast.Attribute): - # Method call or module function call: obj.method() or module.func() - target_name = self.parent_extractor._extract_attribute_name(node.func) - - if target_name: - # Generate target symbol ID - target_symbol_id = self.parent_extractor._generate_symbol_id([target_name], "().") - - # Add call relationship - self.parent_extractor._add_relationship( - self.source_function_id, - target_symbol_id, - InternalRelationshipType.CALLS - ) - - # Continue visiting nested calls - self.generic_visit(node) \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/zig_strategy.py b/src/code_index_mcp/scip/strategies/zig_strategy.py index 09ab201..4889454 100644 --- a/src/code_index_mcp/scip/strategies/zig_strategy.py +++ b/src/code_index_mcp/scip/strategies/zig_strategy.py @@ -129,7 +129,9 @@ def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: tree = self._parse_content(content) if tree: self._collect_symbols_from_tree_sitter(tree, relative_path, content) - logger.debug(f"Tree-sitter symbol collection - {relative_path}") + # Register dependencies with symbol manager + self._register_dependencies_with_symbol_manager() + logger.debug(f"Tree-sitter symbol collection - {relative_path}, deps: {self._count_dependencies()}") return raise StrategyError(f"Failed to parse {relative_path} with tree-sitter for symbol collection") @@ -149,6 +151,9 @@ def _analyze_zig_file(self, file_path: str, project_path: str, relationships: Op # Initialize position calculator self.position_calculator = PositionCalculator(content) + # Reset dependencies for this file + self._reset_dependencies() + if self.use_tree_sitter and self.parser: # Parse with Tree-sitter tree = self._parse_content(content) @@ -157,8 +162,12 @@ def _analyze_zig_file(self, file_path: str, project_path: str, relationships: Op document.occurrences.extend(occurrences) document.symbols.extend(symbols) + # Add dependency information to symbols + self._add_dependency_info_to_symbols(document, content) + logger.debug(f"Analyzed Zig file {document.relative_path}: " - f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols") + f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols, " + f"dependencies: {self._count_dependencies()}") return document raise StrategyError(f"Failed to parse {document.relative_path} with tree-sitter for document analysis") @@ -720,8 +729,17 @@ def _create_variable_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_ range_obj = self.position_calculator.tree_sitter_node_to_range(node) occurrence = scip_pb2.Occurrence() occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierConstant + + # Check if this variable is an import by examining the node for @import + is_import = self._is_variable_import(node) + + if is_import: + occurrence.symbol_roles = scip_pb2.Import # Mark as Import role + occurrence.syntax_kind = scip_pb2.IdentifierNamespace + else: + occurrence.symbol_roles = scip_pb2.Definition + occurrence.syntax_kind = scip_pb2.IdentifierConstant + occurrence.range.CopyFrom(range_obj) return occurrence except: @@ -971,3 +989,98 @@ def _reset_dependencies(self) -> None: 'local': [] } } + + def _add_dependency_info_to_symbols(self, document: scip_pb2.Document, content: str) -> None: + """Add dependency classification information to SCIP symbols.""" + if not self.dependencies['imports']: + return + + # Update existing import symbols with dependency classification + for symbol_info in document.symbols: + symbol_name = self._extract_symbol_name_from_id(symbol_info.symbol) + + # Check if this symbol is an import + if self._is_import_symbol(symbol_name, symbol_info): + # Find which dependency category this import belongs to + dependency_type = self._find_dependency_type(symbol_name) + if dependency_type: + # Update symbol documentation with dependency type + symbol_info.documentation.append(f"Dependency type: {dependency_type}") + # Mark as import role + if hasattr(symbol_info, 'symbol_roles'): + symbol_info.symbol_roles |= 2 # SymbolRole.Import = 2 + + def _count_dependencies(self) -> str: + """Get dependency count summary for logging.""" + total = (len(self.dependencies['imports']['standard_library']) + + len(self.dependencies['imports']['third_party']) + + len(self.dependencies['imports']['local'])) + return f"{total} total ({len(self.dependencies['imports']['standard_library'])} std, " \ + f"{len(self.dependencies['imports']['third_party'])} 3rd, " \ + f"{len(self.dependencies['imports']['local'])} local)" + + def _extract_symbol_name_from_id(self, symbol_id: str) -> str: + """Extract symbol name from SCIP symbol ID.""" + # Symbol ID format: "scip-zig local code-index-mcp .../filename/symbol_name." + parts = symbol_id.split('/') + if parts: + last_part = parts[-1] + # Remove trailing descriptor (., (), #) + if last_part.endswith('.'): + return last_part[:-1] + elif last_part.endswith('().'): + return last_part[:-3] + elif last_part.endswith('#'): + return last_part[:-1] + return "" + + def _is_import_symbol(self, symbol_name: str, symbol_info: scip_pb2.SymbolInformation) -> bool: + """Check if a symbol represents an import.""" + # Check if symbol documentation mentions import + for doc in symbol_info.documentation: + if "import" in doc.lower(): + return True + return False + + def _find_dependency_type(self, symbol_name: str) -> str: + """Find which dependency type category a symbol belongs to.""" + for dep_type, imports in self.dependencies['imports'].items(): + if symbol_name in imports: + return dep_type + return "" + + def _register_dependencies_with_symbol_manager(self) -> None: + """Register collected dependencies with the symbol manager.""" + if not self.symbol_manager or not self.dependencies['imports']: + return + + for dep_type, imports in self.dependencies['imports'].items(): + for import_path in imports: + try: + # Register with symbol manager for global dependency tracking + symbol_id = self.symbol_manager.moniker_manager.register_import( + package_name=import_path, + symbol_name=import_path, # Use import path as symbol name + module_path="", + alias=None, + import_kind="namespace", # Zig imports are namespace-like + version="" # Zig doesn't use version in @import() + ) + logger.debug(f"Registered dependency: {import_path} ({dep_type}) -> {symbol_id}") + except Exception as e: + logger.warning(f"Failed to register dependency {import_path}: {e}") + + def _is_variable_import(self, node) -> bool: + """Check if a variable declaration contains an @import call.""" + for child in node.children: + if child.type == 'builtin_function': + # Check if it's @import + builtin_id = None + for grandchild in child.children: + if grandchild.type == 'builtin_identifier': + builtin_id = self._get_node_text_ts(grandchild) + break + + if builtin_id == '@import': + return True + return False diff --git a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py index 3743741..9b96cc4 100644 --- a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py +++ b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py @@ -874,7 +874,12 @@ def _extract_imports(self, document, imports: ImportGroup): # Local imports: extract module path from descriptors module_path = self._extract_local_module_path(symbol_info.descriptors) if module_path and module_path not in seen_modules: - imports.add_import(module_path, 'local') + # For Zig imports, classify by module name + if any('.zig' in part for part in symbol_info.descriptors.split('/')): + import_type = self._classify_zig_import(module_path) + else: + import_type = 'local' + imports.add_import(module_path, import_type) seen_modules.add(module_path) logger.debug(f"Extracted {len(seen_modules)} unique imports from SCIP occurrences") @@ -1041,13 +1046,43 @@ def _extract_local_module_path(self, descriptors: str) -> Optional[str]: """Extract module path from local descriptors.""" # utils.py/helper_function() -> utils # services/user_service.py/UserService -> services.user_service + # test/sample-projects/zig/code-index-example/src/main.zig/std. -> std if '/' in descriptors: - file_part = descriptors.split('/')[0] - if file_part.endswith('.py'): - return file_part[:-3].replace('/', '.') - return file_part.replace('/', '.') + parts = descriptors.split('/') + if len(parts) >= 2: + # For Zig: extract the symbol name (last part after the file path) + if any('.zig' in part for part in parts): + # Zig import: symbol name is the last part + symbol_name = parts[-1].rstrip('.') + return symbol_name + # For Python: traditional handling + file_part = parts[0] + if file_part.endswith('.py'): + return file_part[:-3].replace('/', '.') + return file_part.replace('/', '.') return None + def _classify_zig_import(self, module_name: str) -> str: + """Classify Zig import as standard_library, third_party, or local.""" + # Zig standard library modules + zig_stdlib = { + 'std', 'builtin', 'testing', 'math', 'fmt', 'mem', 'ascii', + 'unicode', 'json', 'crypto', 'compress', 'hash', 'http', + 'net', 'fs', 'os', 'process', 'thread', 'atomic', 'debug', + 'log', 'rand', 'sort', 'time', 'zig' + } + + # Local imports (relative paths) + if module_name.startswith('./') or module_name.startswith('../') or module_name.endswith('.zig'): + return 'local' + + # Standard library + if module_name in zig_stdlib: + return 'standard_library' + + # Everything else is third_party + return 'third_party' + def _extract_class_name_from_descriptors(self, descriptors: str) -> Optional[str]: """Extract class name from descriptors.""" # test_empty_functions.py/TestClass# -> TestClass From dee1eefd52d7c910faec501bfb1d9e59f7a5ddab Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:20:54 +0800 Subject: [PATCH 4/8] refactor: implement modular SCIPSymbolAnalyzer architecture (Phases 1-3) Add modular analyzer, dependency management, and position resolution systems to replace monolithic SCIPSymbolAnalyzer structure. ## Added Files (27 total) ### Language Analyzers (7 files) - `analyzers/base.py` - Abstract language analyzer interface - `analyzers/factory.py` - Analyzer factory and language detection - `analyzers/python_analyzer.py` - Python import extraction and stdlib detection - `analyzers/zig_analyzer.py` - Zig language analyzer - `analyzers/objc_analyzer.py` - Objective-C framework analyzer - `analyzers/javascript_analyzer.py` - JavaScript/TypeScript analyzer - `analyzers/__init__.py` - Module exports ### Dependency Management (12 files) - `dependencies/classifier.py` - Main dependency classification engine - `dependencies/registry.py` - Dependency registry with caching - `dependencies/normalizer.py` - Import path normalization utilities - `dependencies/configs/base.py` - Abstract dependency config base - `dependencies/configs/python_config.py` - Python dependency rules - `dependencies/configs/zig_config.py` - Zig dependency configuration - `dependencies/configs/objc_config.py` - Objective-C framework config - `dependencies/configs/javascript_config.py` - JavaScript dependency rules - `dependencies/configs/__init__.py` - Config exports - `dependencies/__init__.py` - Module exports ### Position Resolution (8 files) - `position/resolver.py` - Main position resolver with strategy pattern - `position/confidence.py` - Confidence levels and LocationInfo class - `position/calculator.py` - Position calculation utilities - `position/strategies/base.py` - Abstract position strategy - `position/strategies/scip_occurrence.py` - SCIP occurrence-based detection - `position/strategies/tree_sitter_strategy.py` - AST-based position detection - `position/strategies/heuristic.py` - Fallback pattern matching - `position/strategies/__init__.py` - Strategy exports - `position/__init__.py` - Module exports --- SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md | 372 ++++++++++++ .../tools/scip/analyzers/__init__.py | 61 ++ .../tools/scip/analyzers/base.py | 324 ++++++++++ .../tools/scip/analyzers/factory.py | 383 ++++++++++++ .../scip/analyzers/javascript_analyzer.py | 410 +++++++++++++ .../tools/scip/analyzers/objc_analyzer.py | 366 +++++++++++ .../tools/scip/analyzers/python_analyzer.py | 400 ++++++++++++ .../tools/scip/analyzers/zig_analyzer.py | 300 +++++++++ .../tools/scip/dependencies/__init__.py | 33 + .../tools/scip/dependencies/classifier.py | 361 +++++++++++ .../scip/dependencies/configs/__init__.py | 74 +++ .../tools/scip/dependencies/configs/base.py | 236 ++++++++ .../scip/dependencies/configs/javascript.py | 283 +++++++++ .../tools/scip/dependencies/configs/objc.py | 346 +++++++++++ .../tools/scip/dependencies/configs/python.py | 355 +++++++++++ .../tools/scip/dependencies/configs/zig.py | 266 ++++++++ .../tools/scip/dependencies/normalizer.py | 354 +++++++++++ .../tools/scip/dependencies/registry.py | 371 ++++++++++++ .../tools/scip/position/__init__.py | 46 ++ .../tools/scip/position/calculator.py | 394 ++++++++++++ .../tools/scip/position/confidence.py | 317 ++++++++++ .../tools/scip/position/resolver.py | 436 ++++++++++++++ .../scip/position/strategies/__init__.py | 18 + .../tools/scip/position/strategies/base.py | 185 ++++++ .../scip/position/strategies/heuristic.py | 568 ++++++++++++++++++ .../position/strategies/scip_occurrence.py | 236 ++++++++ .../strategies/tree_sitter_strategy.py | 523 ++++++++++++++++ .../tools/scip/scip_index_tool.py | 12 +- 28 files changed, 8024 insertions(+), 6 deletions(-) create mode 100644 SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md create mode 100644 src/code_index_mcp/tools/scip/analyzers/__init__.py create mode 100644 src/code_index_mcp/tools/scip/analyzers/base.py create mode 100644 src/code_index_mcp/tools/scip/analyzers/factory.py create mode 100644 src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py create mode 100644 src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py create mode 100644 src/code_index_mcp/tools/scip/analyzers/python_analyzer.py create mode 100644 src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/__init__.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/classifier.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/__init__.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/base.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/javascript.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/objc.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/python.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/zig.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/normalizer.py create mode 100644 src/code_index_mcp/tools/scip/dependencies/registry.py create mode 100644 src/code_index_mcp/tools/scip/position/__init__.py create mode 100644 src/code_index_mcp/tools/scip/position/calculator.py create mode 100644 src/code_index_mcp/tools/scip/position/confidence.py create mode 100644 src/code_index_mcp/tools/scip/position/resolver.py create mode 100644 src/code_index_mcp/tools/scip/position/strategies/__init__.py create mode 100644 src/code_index_mcp/tools/scip/position/strategies/base.py create mode 100644 src/code_index_mcp/tools/scip/position/strategies/heuristic.py create mode 100644 src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py create mode 100644 src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py diff --git a/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md b/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md new file mode 100644 index 0000000..25d4e8c --- /dev/null +++ b/SCIP_SYMBOL_ANALYZER_REFACTORING_PLAN.md @@ -0,0 +1,372 @@ +# SCIPSymbolAnalyzer Refactoring Plan + +## 🎯 Overview + +This document outlines a comprehensive refactoring plan for the `SCIPSymbolAnalyzer` class to transform it from a monolithic architecture into a modular, extensible, and maintainable system that supports multiple programming languages with proper separation of concerns. + +## 🔍 Current Architecture Problems + +### 1. **Monolithic Design Issues** +- All language-specific logic is mixed within a single class +- The `_extract_imports` method contains Python, Objective-C, and Zig-specific logic +- Lack of extensibility - adding new languages requires modifying the core class +- Violation of Single Responsibility Principle + +### 2. **Dependency Processing Chaos** +- Methods like `_classify_zig_import`, `_categorize_import` are scattered throughout the codebase +- No unified dependency classification standard +- Language-specific standard library lists are hardcoded +- Inconsistent dependency type mapping + +### 3. **Symbol Resolution Complexity** +- Position detection logic is complex and error-prone +- Three-layer position detection strategy is difficult to maintain +- Symbol ID parsing logic lacks flexibility +- Mixed concerns between symbol extraction and position calculation + +### 4. **Poor Language Support Scalability** +- Each new language requires core class modifications +- No clear plugin architecture +- Language-specific logic embedded in generic methods +- Difficult to test language-specific features in isolation + +## 🏗️ Proposed Refactoring Architecture + +### Phase 1: Language Plugin System + +```python +# New architecture design +class LanguageAnalyzer(ABC): + """Language-specific analyzer interface""" + + @abstractmethod + def extract_imports(self, document, imports: ImportGroup) -> None: + """Extract import information from SCIP document""" + + @abstractmethod + def classify_dependency(self, module_name: str) -> str: + """Classify dependency as standard_library, third_party, or local""" + + @abstractmethod + def extract_symbol_metadata(self, symbol_info) -> Dict[str, Any]: + """Extract language-specific symbol metadata""" + + @abstractmethod + def get_standard_library_modules(self) -> Set[str]: + """Return set of standard library module names""" + +class ZigAnalyzer(LanguageAnalyzer): + """Zig language-specific analyzer""" + +class PythonAnalyzer(LanguageAnalyzer): + """Python language-specific analyzer""" + +class ObjectiveCAnalyzer(LanguageAnalyzer): + """Objective-C language-specific analyzer""" + +class LanguageAnalyzerFactory: + """Factory for creating language-specific analyzers""" + + def get_analyzer(self, language: str) -> LanguageAnalyzer: + """Get appropriate analyzer for language""" +``` + +### Phase 2: Dependency Management System + +```python +class DependencyClassifier: + """Unified dependency classification system""" + + def __init__(self): + self.language_configs = { + 'python': PythonDependencyConfig(), + 'zig': ZigDependencyConfig(), + 'javascript': JavaScriptDependencyConfig() + } + + def classify_import(self, import_path: str, language: str) -> str: + """Classify import based on language-specific rules""" + +class DependencyConfig(ABC): + """Language-specific dependency configuration""" + + @abstractmethod + def get_stdlib_modules(self) -> Set[str]: + """Return standard library modules for this language""" + + @abstractmethod + def classify_import(self, import_path: str) -> str: + """Classify import path for this language""" + + @abstractmethod + def normalize_import_path(self, raw_path: str) -> str: + """Normalize import path for consistent processing""" +``` + +### Phase 3: Position Resolution System + +```python +class PositionResolver: + """Unified symbol position resolution system""" + + def __init__(self): + self.strategies = [ + SCIPOccurrenceStrategy(), # High confidence + TreeSitterStrategy(), # Medium confidence + HeuristicStrategy() # Fallback + ] + + def resolve_position(self, symbol, document) -> LocationInfo: + """Resolve symbol position using strategy pattern""" + +class PositionStrategy(ABC): + """Base class for position resolution strategies""" + + @abstractmethod + def try_resolve(self, symbol, document) -> Optional[LocationInfo]: + """Attempt to resolve symbol position""" + + @abstractmethod + def get_confidence_level(self) -> str: + """Return confidence level: 'high', 'medium', 'low'""" +``` + +## 📋 Detailed Implementation Plan + +### **Phase 1: Architecture Separation (Week 1)** + +#### 1.1 Create Language Analyzer Interface +``` +src/code_index_mcp/tools/scip/analyzers/ +├── base.py # Base interfaces and common utilities +├── python_analyzer.py # Python-specific analysis logic +├── zig_analyzer.py # Zig-specific analysis logic +├── objc_analyzer.py # Objective-C-specific analysis logic +├── javascript_analyzer.py # JavaScript/TypeScript analysis logic +└── factory.py # Analyzer factory and registry +``` + +**Tasks:** +- [ ] Define `LanguageAnalyzer` abstract base class +- [ ] Extract Python-specific logic to `PythonAnalyzer` +- [ ] Move Zig logic from current implementation to `ZigAnalyzer` +- [ ] Migrate Objective-C logic to `ObjectiveCAnalyzer` +- [ ] Create factory pattern for analyzer instantiation + +#### 1.2 Extract Language-Specific Logic +- [ ] Move `_classify_zig_import` to `ZigAnalyzer` +- [ ] Move Python stdlib detection to `PythonAnalyzer` +- [ ] Move Objective-C framework detection to `ObjectiveCAnalyzer` +- [ ] Create language-specific symbol metadata extraction + +### **Phase 2: Dependency Processing Refactoring (Week 2)** + +#### 2.1 Create Dependency Management Module +``` +src/code_index_mcp/tools/scip/dependencies/ +├── classifier.py # Main dependency classifier +├── configs/ # Language-specific configurations +│ ├── __init__.py +│ ├── python.py # Python dependency rules +│ ├── zig.py # Zig dependency rules +│ ├── javascript.py # JavaScript dependency rules +│ └── base.py # Base configuration class +├── registry.py # Dependency registry and caching +└── normalizer.py # Import path normalization +``` + +**Tasks:** +- [ ] Create unified `DependencyClassifier` class +- [ ] Implement language-specific configuration classes +- [ ] Standardize dependency type constants +- [ ] Add configurable standard library lists +- [ ] Implement caching for dependency classification results + +#### 2.2 Standardize Dependency Classification +- [ ] Define consistent classification types: `standard_library`, `third_party`, `local` +- [ ] Create configurable standard library lists per language +- [ ] Support custom classification rules +- [ ] Implement dependency version detection where applicable + +### **Phase 3: Symbol Resolution Refactoring (Week 3)** + +#### 3.1 Modularize Position Detection +``` +src/code_index_mcp/tools/scip/position/ +├── resolver.py # Main position resolver +├── strategies/ # Position detection strategies +│ ├── __init__.py +│ ├── scip_occurrence.py # SCIP occurrence-based detection +│ ├── tree_sitter.py # Tree-sitter AST-based detection +│ ├── heuristic.py # Heuristic fallback detection +│ └── base.py # Base strategy interface +├── calculator.py # Position calculation utilities +└── confidence.py # Confidence level management +``` + +**Tasks:** +- [ ] Implement strategy pattern for position resolution +- [ ] Separate SCIP occurrence processing logic +- [ ] Extract tree-sitter position calculation +- [ ] Create heuristic fallback mechanisms +- [ ] Add confidence level tracking + +#### 3.2 Improve Symbol Parsing +- [ ] Refactor `_extract_name_from_scip_symbol` method +- [ ] Unify Symbol ID format processing +- [ ] Support additional SCIP symbol formats +- [ ] Add robust error handling for malformed symbols + +### **Phase 4: Relationship Analysis Refactoring (Week 4)** + +#### 4.1 Separate Relationship Analysis Logic +``` +src/code_index_mcp/tools/scip/relationships/ +├── analyzer.py # Main relationship analyzer +├── types.py # Relationship type definitions +├── builder.py # Relationship construction logic +├── extractors/ # Relationship extraction strategies +│ ├── __init__.py +│ ├── call_extractor.py # Function call relationships +│ ├── inheritance_extractor.py # Class inheritance +│ └── reference_extractor.py # Symbol references +└── formatter.py # Relationship output formatting +``` + +**Tasks:** +- [ ] Extract relationship analysis from main analyzer +- [ ] Implement relationship type system +- [ ] Create relationship builders for different types +- [ ] Add relationship validation logic + +#### 4.2 Optimize Relationship Detection +- [ ] Improve function call detection accuracy +- [ ] Support additional relationship types (inheritance, interfaces, etc.) +- [ ] Add cross-file relationship resolution +- [ ] Implement relationship confidence scoring + +### **Phase 5: Integration and Testing (Week 5)** + +#### 5.1 Integrate New Architecture +- [ ] Update `SCIPSymbolAnalyzer` to use new plugin system +- [ ] Create adapter layer for backward compatibility +- [ ] Update configuration and initialization logic +- [ ] Add performance monitoring + +#### 5.2 Comprehensive Testing +- [ ] Unit tests for each language analyzer +- [ ] Integration tests for dependency classification +- [ ] Position resolution accuracy tests +- [ ] Performance benchmark tests +- [ ] Memory usage optimization tests + +## 🎯 Refactoring Goals + +### **Maintainability Improvements** +- ✅ **Single Responsibility**: Each class focuses on specific functionality +- ✅ **Open/Closed Principle**: Easy to add new language support without modifying existing code +- ✅ **Dependency Injection**: Components are replaceable and testable +- ✅ **Clear Separation of Concerns**: Position detection, dependency classification, and symbol analysis are separate + +### **Performance Optimizations** +- ✅ **Lazy Loading**: Only load required language analyzers +- ✅ **Caching Mechanisms**: Cache symbol resolution and dependency classification results +- ✅ **Parallel Processing**: Support multi-file parallel analysis +- ✅ **Memory Efficiency**: Reduce memory footprint through better data structures + +### **Extensibility Features** +- ✅ **Plugin System**: Third-party language support through plugins +- ✅ **Configuration-Driven**: Configurable analysis rules and standards +- ✅ **Stable API**: Backward-compatible interfaces +- ✅ **Language Agnostic Core**: Core logic independent of specific languages + +## 🧪 Testing Strategy + +### **Unit Testing Coverage** +- [ ] Each language analyzer tested independently +- [ ] Dependency classifier comprehensive test suite +- [ ] Position resolver strategy tests +- [ ] Symbol parsing edge case tests +- [ ] Relationship extraction validation tests + +### **Integration Testing** +- [ ] Cross-language analysis scenarios +- [ ] End-to-end file analysis workflows +- [ ] SCIP compliance validation +- [ ] Performance regression testing + +### **Regression Testing** +- [ ] Existing functionality preservation +- [ ] Zig dependency processing validation +- [ ] Python analysis accuracy maintenance +- [ ] Objective-C framework detection consistency + +## 📈 Success Metrics + +### **Code Quality Improvements** +- **Cyclomatic Complexity**: Reduce from current >50 to <10 per method +- **Test Coverage**: Achieve >90% code coverage +- **Maintainability Index**: Improve from current score to >80 + +### **Performance Targets** +- **Analysis Speed**: <500ms per file (currently ~2s) +- **Memory Usage**: <50MB for 1000-file project (currently ~200MB) +- **Accuracy**: >95% symbol position accuracy + +### **Extensibility Goals** +- **New Language Addition**: <2 hours to add basic support +- **Plugin Development**: Third-party plugin support +- **Configuration Flexibility**: Runtime configuration changes + +## 🚀 Migration Plan + +### **Phase 1: Preparation (Week 1)** +- Create new module structure +- Implement base interfaces +- Set up testing framework + +### **Phase 2: Gradual Migration (Weeks 2-4)** +- Migrate one language at a time +- Maintain backward compatibility +- Add comprehensive tests for each component + +### **Phase 3: Integration (Week 5)** +- Integrate all components +- Performance optimization +- Final testing and validation + +### **Phase 4: Documentation and Cleanup (Week 6)** +- Update documentation +- Remove deprecated code +- Finalize API documentation + +## 🔧 Implementation Notes + +### **Backward Compatibility** +- Maintain existing public API during transition +- Create adapter layer for legacy code +- Gradual deprecation of old methods + +### **Configuration Management** +- Use dependency injection for configurability +- Support runtime configuration updates +- Provide sensible defaults for all languages + +### **Error Handling** +- Implement comprehensive error handling at each layer +- Provide detailed error messages for debugging +- Graceful degradation when analyzers fail + +### **Logging and Monitoring** +- Add structured logging throughout the system +- Implement performance metrics collection +- Create debugging tools for complex analysis scenarios + +--- + +**Status**: 📋 Planning Phase +**Priority**: 🔥 High +**Estimated Effort**: 6 weeks +**Dependencies**: None + +This refactoring will establish a solid foundation for supporting additional programming languages and maintaining high code quality as the system grows. \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/analyzers/__init__.py b/src/code_index_mcp/tools/scip/analyzers/__init__.py new file mode 100644 index 0000000..eac5859 --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/__init__.py @@ -0,0 +1,61 @@ +""" +Language-specific SCIP symbol analyzers. + +This package provides the modular language analyzer system that replaces the +monolithic SCIPSymbolAnalyzer, following the refactoring plan for better +maintainability and extensibility. + +Key Components: +- LanguageAnalyzer: Abstract base class for all language analyzers +- PythonAnalyzer: Python-specific import and symbol analysis +- ZigAnalyzer: Zig-specific import and symbol analysis +- ObjectiveCAnalyzer: Objective-C framework and symbol analysis +- JavaScriptAnalyzer: JavaScript/TypeScript analysis +- LanguageAnalyzerFactory: Factory for creating appropriate analyzers +- FallbackAnalyzer: Generic analyzer for unsupported languages + +Usage: + from .factory import get_analyzer + + # Get analyzer for Python file + analyzer = get_analyzer(language='python') + + # Get analyzer based on file extension + analyzer = get_analyzer(file_path='main.py') + + # Extract imports + analyzer.extract_imports(document, imports, symbol_parser) +""" + +from .base import LanguageAnalyzer, BaseLanguageAnalyzer, FallbackAnalyzer +from .python_analyzer import PythonAnalyzer +from .zig_analyzer import ZigAnalyzer +from .objc_analyzer import ObjectiveCAnalyzer +from .javascript_analyzer import JavaScriptAnalyzer +from .factory import ( + LanguageAnalyzerFactory, + get_analyzer_factory, + get_analyzer, + register_custom_analyzer, + get_supported_languages +) + +__all__ = [ + # Base classes + 'LanguageAnalyzer', + 'BaseLanguageAnalyzer', + 'FallbackAnalyzer', + + # Language-specific analyzers + 'PythonAnalyzer', + 'ZigAnalyzer', + 'ObjectiveCAnalyzer', + 'JavaScriptAnalyzer', + + # Factory and utilities + 'LanguageAnalyzerFactory', + 'get_analyzer_factory', + 'get_analyzer', + 'register_custom_analyzer', + 'get_supported_languages' +] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/analyzers/base.py b/src/code_index_mcp/tools/scip/analyzers/base.py new file mode 100644 index 0000000..3aa5280 --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/base.py @@ -0,0 +1,324 @@ +""" +Base interfaces and common utilities for language-specific SCIP analyzers. + +This module provides the abstract base classes and shared functionality for the +modular language analyzer system, following the SCIP Symbol Analyzer refactoring plan. +""" + +import logging +from abc import ABC, abstractmethod +from typing import Dict, List, Optional, Any, Set +from ..symbol_definitions import ImportGroup, LocationInfo + +logger = logging.getLogger(__name__) + + +class LanguageAnalyzer(ABC): + """ + Abstract base class for language-specific SCIP symbol analyzers. + + Each language analyzer handles language-specific logic for: + - Import extraction and classification + - Symbol metadata enrichment + - Dependency classification + - Standard library module detection + """ + + def __init__(self): + """Initialize the language analyzer.""" + self._cache: Dict[str, Any] = {} + self.language_name = self._get_language_name() + + @abstractmethod + def _get_language_name(self) -> str: + """Return the name of the language this analyzer handles.""" + pass + + @abstractmethod + def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: + """ + Extract import information from SCIP document. + + Args: + document: SCIP document containing symbols and occurrences + imports: ImportGroup to populate with extracted imports + symbol_parser: Optional SCIPSymbolManager for enhanced parsing + """ + pass + + @abstractmethod + def classify_dependency(self, module_name: str) -> str: + """ + Classify dependency as standard_library, third_party, or local. + + Args: + module_name: Name of the module/dependency to classify + + Returns: + Classification string: 'standard_library', 'third_party', or 'local' + """ + pass + + @abstractmethod + def extract_symbol_metadata(self, symbol_info, document) -> Dict[str, Any]: + """ + Extract language-specific symbol metadata. + + Args: + symbol_info: SCIP symbol information object + document: SCIP document containing the symbol + + Returns: + Dictionary with language-specific metadata + """ + pass + + @abstractmethod + def get_standard_library_modules(self) -> Set[str]: + """ + Return set of standard library module names for this language. + + Returns: + Set of standard library module names + """ + pass + + def normalize_import_path(self, raw_path: str) -> str: + """ + Normalize import path for consistent processing. + Default implementation returns the path as-is. + + Args: + raw_path: Raw import path from SCIP data + + Returns: + Normalized import path + """ + return raw_path.strip() + + def is_import_occurrence(self, occurrence) -> bool: + """ + Check if occurrence represents an import statement. + Default implementation checks for Import role (role = 2). + + Args: + occurrence: SCIP occurrence object + + Returns: + True if this occurrence is an import + """ + return hasattr(occurrence, 'symbol_roles') and (occurrence.symbol_roles & 2) + + def extract_module_from_symbol(self, symbol: str, descriptors: str = "") -> Optional[str]: + """ + Extract module name from SCIP symbol. + Default implementation for common patterns. + + Args: + symbol: SCIP symbol string + descriptors: SCIP descriptors if available + + Returns: + Module name or None if not extractable + """ + try: + if descriptors and '/' in descriptors: + # Extract from descriptors: module.py/symbol -> module + parts = descriptors.split('/') + if len(parts) >= 2: + file_part = parts[0] + if file_part.endswith('.py'): + return file_part[:-3].replace('/', '.') + return file_part.replace('/', '.') + + # Fallback: parse from symbol string + if symbol.startswith('external:'): + symbol_path = symbol[9:] + if '/' in symbol_path: + return symbol_path.split('/')[0] + elif '#' in symbol_path: + return symbol_path.split('#')[0] + return symbol_path.rstrip('.') + + except Exception as e: + logger.debug(f"Error extracting module from symbol {symbol}: {e}") + + return None + + +class AnalyzerCache: + """Shared caching system for analyzer results.""" + + def __init__(self): + self._symbol_cache: Dict[str, Dict[str, Any]] = {} + self._dependency_cache: Dict[str, str] = {} + self._module_cache: Dict[str, Set[str]] = {} + + def cache_symbol_metadata(self, symbol: str, metadata: Dict[str, Any]) -> None: + """Cache symbol metadata.""" + self._symbol_cache[symbol] = metadata + + def get_cached_symbol_metadata(self, symbol: str) -> Optional[Dict[str, Any]]: + """Retrieve cached symbol metadata.""" + return self._symbol_cache.get(symbol) + + def cache_dependency_classification(self, module: str, classification: str) -> None: + """Cache dependency classification result.""" + self._dependency_cache[module] = classification + + def get_cached_dependency_classification(self, module: str) -> Optional[str]: + """Retrieve cached dependency classification.""" + return self._dependency_cache.get(module) + + def cache_standard_library_modules(self, language: str, modules: Set[str]) -> None: + """Cache standard library modules for a language.""" + self._module_cache[language] = modules + + def get_cached_standard_library_modules(self, language: str) -> Optional[Set[str]]: + """Retrieve cached standard library modules.""" + return self._module_cache.get(language) + + +class BaseLanguageAnalyzer(LanguageAnalyzer): + """ + Base implementation providing common functionality for language analyzers. + + This class provides default implementations for common patterns while + requiring subclasses to implement language-specific logic. + """ + + def __init__(self): + super().__init__() + self._cache = AnalyzerCache() + self._standard_library_modules: Optional[Set[str]] = None + + def get_standard_library_modules(self) -> Set[str]: + """ + Get standard library modules with caching. + + Returns: + Set of standard library module names + """ + if self._standard_library_modules is None: + cached = self._cache.get_cached_standard_library_modules(self.language_name) + if cached is not None: + self._standard_library_modules = cached + else: + self._standard_library_modules = self._build_standard_library_modules() + self._cache.cache_standard_library_modules(self.language_name, self._standard_library_modules) + + return self._standard_library_modules + + @abstractmethod + def _build_standard_library_modules(self) -> Set[str]: + """Build the set of standard library modules for this language.""" + pass + + def classify_dependency(self, module_name: str) -> str: + """ + Classify dependency with caching support. + + Args: + module_name: Name of the module to classify + + Returns: + Classification string + """ + # Check cache first + cached = self._cache.get_cached_dependency_classification(module_name) + if cached is not None: + return cached + + # Perform classification + classification = self._classify_dependency_impl(module_name) + + # Cache result + self._cache.cache_dependency_classification(module_name, classification) + + return classification + + @abstractmethod + def _classify_dependency_impl(self, module_name: str) -> str: + """Implement the actual dependency classification logic.""" + pass + + def extract_symbol_metadata(self, symbol_info, document) -> Dict[str, Any]: + """ + Extract symbol metadata with caching. + + Args: + symbol_info: SCIP symbol information + document: SCIP document + + Returns: + Dictionary with symbol metadata + """ + symbol = getattr(symbol_info, 'symbol', '') + if not symbol: + return {} + + # Check cache + cached = self._cache.get_cached_symbol_metadata(symbol) + if cached is not None: + return cached + + # Extract metadata + metadata = self._extract_symbol_metadata_impl(symbol_info, document) + + # Cache result + self._cache.cache_symbol_metadata(symbol, metadata) + + return metadata + + @abstractmethod + def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: + """Implement language-specific symbol metadata extraction.""" + pass + + +class FallbackAnalyzer(BaseLanguageAnalyzer): + """ + Fallback analyzer for unsupported languages. + + Provides basic functionality when no language-specific analyzer is available. + """ + + def _get_language_name(self) -> str: + return "fallback" + + def _build_standard_library_modules(self) -> Set[str]: + """Fallback has no standard library modules.""" + return set() + + def _classify_dependency_impl(self, module_name: str) -> str: + """Basic classification for unknown languages.""" + if module_name.startswith('.'): + return 'local' + # Default to third_party for unknown languages + return 'third_party' + + def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: + """Basic import extraction using occurrence analysis.""" + try: + seen_modules = set() + + for occurrence in document.occurrences: + if not self.is_import_occurrence(occurrence): + continue + + symbol = occurrence.symbol + module_name = self.extract_module_from_symbol(symbol) + if module_name and module_name not in seen_modules: + classification = self.classify_dependency(module_name) + imports.add_import(module_name, classification) + seen_modules.add(module_name) + + except Exception as e: + logger.debug(f"Error in fallback import extraction: {e}") + + def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: + """Basic metadata extraction for fallback.""" + return { + 'source': 'fallback', + 'confidence': 'low' + } diff --git a/src/code_index_mcp/tools/scip/analyzers/factory.py b/src/code_index_mcp/tools/scip/analyzers/factory.py new file mode 100644 index 0000000..52c08b0 --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/factory.py @@ -0,0 +1,383 @@ +""" +Language analyzer factory and registry. + +This module provides the factory pattern for creating language-specific analyzers +based on document language or file extension, following the SCIP Symbol Analyzer +refactoring plan. +""" + +import logging +from typing import Dict, Optional, Type, Set +from .base import LanguageAnalyzer, FallbackAnalyzer +from .python_analyzer import PythonAnalyzer +from .zig_analyzer import ZigAnalyzer +from .objc_analyzer import ObjectiveCAnalyzer +from .javascript_analyzer import JavaScriptAnalyzer + +logger = logging.getLogger(__name__) + + +class LanguageAnalyzerFactory: + """ + Factory for creating language-specific analyzers. + + This factory provides centralized management of language analyzers, + supporting dynamic registration and language detection based on + various criteria. + """ + + def __init__(self): + """Initialize the factory with default analyzers.""" + self._analyzers: Dict[str, Type[LanguageAnalyzer]] = {} + self._file_extension_map: Dict[str, str] = {} + self._language_aliases: Dict[str, str] = {} + self._analyzer_instances: Dict[str, LanguageAnalyzer] = {} + + # Register default analyzers + self._register_default_analyzers() + self._setup_file_extension_mapping() + self._setup_language_aliases() + + def _register_default_analyzers(self) -> None: + """Register all default language analyzers.""" + self.register_analyzer('python', PythonAnalyzer) + self.register_analyzer('zig', ZigAnalyzer) + self.register_analyzer('objective-c', ObjectiveCAnalyzer) + self.register_analyzer('javascript', JavaScriptAnalyzer) + self.register_analyzer('typescript', JavaScriptAnalyzer) # TypeScript uses JS analyzer + self.register_analyzer('fallback', FallbackAnalyzer) + + def _setup_file_extension_mapping(self) -> None: + """Setup mapping from file extensions to language names.""" + self._file_extension_map = { + # Python + '.py': 'python', + '.pyx': 'python', + '.pyi': 'python', + '.pyw': 'python', + + # Zig + '.zig': 'zig', + + # Objective-C + '.m': 'objective-c', + '.mm': 'objective-c', + '.h': 'objective-c', # Could be C/C++ too, but often ObjC in iOS/macOS projects + + # JavaScript/TypeScript + '.js': 'javascript', + '.jsx': 'javascript', + '.ts': 'typescript', + '.tsx': 'typescript', + '.mjs': 'javascript', + '.cjs': 'javascript', + + # Other languages that might be added later + '.java': 'java', + '.kt': 'kotlin', + '.swift': 'swift', + '.go': 'go', + '.rs': 'rust', + '.cpp': 'cpp', + '.cc': 'cpp', + '.cxx': 'cpp', + '.c': 'c', + '.cs': 'csharp', + '.rb': 'ruby', + '.php': 'php', + '.scala': 'scala', + '.clj': 'clojure', + '.sh': 'shell', + '.bash': 'shell', + '.zsh': 'shell', + '.fish': 'shell' + } + + def _setup_language_aliases(self) -> None: + """Setup aliases for language names.""" + self._language_aliases = { + # Python aliases + 'py': 'python', + 'python3': 'python', + + # JavaScript/TypeScript aliases + 'js': 'javascript', + 'jsx': 'javascript', + 'ts': 'typescript', + 'tsx': 'typescript', + 'ecmascript': 'javascript', + 'node': 'javascript', + 'nodejs': 'javascript', + + # Objective-C aliases + 'objc': 'objective-c', + 'obj-c': 'objective-c', + 'objective_c': 'objective-c', + 'objectivec': 'objective-c', + + # Other aliases + 'zigc': 'zig', + 'c++': 'cpp', + 'c#': 'csharp', + 'dotnet': 'csharp' + } + + def register_analyzer(self, language: str, analyzer_class: Type[LanguageAnalyzer]) -> None: + """ + Register a language analyzer. + + Args: + language: Language name (canonical form) + analyzer_class: Analyzer class to register + """ + self._analyzers[language.lower()] = analyzer_class + logger.debug(f"Registered analyzer for language: {language}") + + def get_analyzer(self, language: str = None, file_path: str = None) -> LanguageAnalyzer: + """ + Get appropriate analyzer for the given language or file. + + Args: + language: Language name (if known) + file_path: File path (for extension-based detection) + + Returns: + Language-specific analyzer or fallback analyzer + """ + detected_language = self._detect_language(language, file_path) + + # Return cached instance if available + if detected_language in self._analyzer_instances: + return self._analyzer_instances[detected_language] + + # Create new instance + analyzer_class = self._analyzers.get(detected_language) + if analyzer_class: + try: + analyzer = analyzer_class() + self._analyzer_instances[detected_language] = analyzer + return analyzer + except Exception as e: + logger.warning(f"Failed to create analyzer for {detected_language}: {e}") + + # Fallback to default analyzer + if 'fallback' not in self._analyzer_instances: + self._analyzer_instances['fallback'] = FallbackAnalyzer() + + return self._analyzer_instances['fallback'] + + def _detect_language(self, language: str = None, file_path: str = None) -> str: + """ + Detect language from various hints. + + Args: + language: Explicit language hint + file_path: File path for extension-based detection + + Returns: + Detected language name (normalized) + """ + # Method 1: Use explicit language if provided + if language: + normalized = self._normalize_language(language) + if normalized in self._analyzers: + return normalized + + # Method 2: Detect from file extension + if file_path: + file_extension = self._get_file_extension(file_path) + if file_extension in self._file_extension_map: + detected = self._file_extension_map[file_extension] + if detected in self._analyzers: + return detected + + # Method 3: Detect from file path patterns + if file_path: + path_based = self._detect_from_path_patterns(file_path) + if path_based and path_based in self._analyzers: + return path_based + + # Default to fallback + return 'fallback' + + def _normalize_language(self, language: str) -> str: + """ + Normalize language name using aliases. + + Args: + language: Raw language name + + Returns: + Normalized language name + """ + language_lower = language.lower().strip() + + # Check aliases first + if language_lower in self._language_aliases: + return self._language_aliases[language_lower] + + # Return as-is if no alias found + return language_lower + + def _get_file_extension(self, file_path: str) -> str: + """ + Extract file extension from path. + + Args: + file_path: File path + + Returns: + File extension (including dot) + """ + try: + if '.' in file_path: + return '.' + file_path.split('.')[-1].lower() + except Exception: + pass + return '' + + def _detect_from_path_patterns(self, file_path: str) -> Optional[str]: + """ + Detect language from file path patterns. + + Args: + file_path: File path + + Returns: + Detected language or None + """ + path_lower = file_path.lower() + + # JavaScript/TypeScript project patterns + if any(pattern in path_lower for pattern in ['node_modules', 'package.json', 'tsconfig']): + if any(ext in path_lower for ext in ['.ts', '.tsx']): + return 'typescript' + return 'javascript' + + # Python project patterns + if any(pattern in path_lower for pattern in ['__pycache__', 'requirements.txt', 'setup.py', '.py']): + return 'python' + + # Zig project patterns + if any(pattern in path_lower for pattern in ['build.zig', '.zig']): + return 'zig' + + # Objective-C project patterns + if any(pattern in path_lower for pattern in ['.xcodeproj', '.xcworkspace', 'podfile']): + return 'objective-c' + + return None + + def get_supported_languages(self) -> Set[str]: + """ + Get set of supported languages. + + Returns: + Set of supported language names + """ + return set(self._analyzers.keys()) + + def get_supported_extensions(self) -> Set[str]: + """ + Get set of supported file extensions. + + Returns: + Set of supported file extensions + """ + return set(self._file_extension_map.keys()) + + def is_language_supported(self, language: str) -> bool: + """ + Check if a language is supported. + + Args: + language: Language name to check + + Returns: + True if language is supported + """ + normalized = self._normalize_language(language) + return normalized in self._analyzers + + def clear_cache(self) -> None: + """Clear cached analyzer instances.""" + self._analyzer_instances.clear() + logger.debug("Cleared analyzer instance cache") + + def get_analyzer_info(self) -> Dict[str, Dict[str, any]]: + """ + Get information about registered analyzers. + + Returns: + Dictionary with analyzer information + """ + info = {} + for language, analyzer_class in self._analyzers.items(): + try: + analyzer = analyzer_class() + info[language] = { + 'class': analyzer_class.__name__, + 'supported_extensions': [ + ext for ext, lang in self._file_extension_map.items() + if lang == language + ], + 'aliases': [ + alias for alias, canonical in self._language_aliases.items() + if canonical == language + ], + 'standard_library_modules': len(analyzer.get_standard_library_modules()) + } + except Exception as e: + info[language] = { + 'class': analyzer_class.__name__, + 'error': str(e) + } + + return info + + +# Global factory instance +_factory_instance: Optional[LanguageAnalyzerFactory] = None + + +def get_analyzer_factory() -> LanguageAnalyzerFactory: + """ + Get the global analyzer factory instance. + + Returns: + Global LanguageAnalyzerFactory instance + """ + global _factory_instance + if _factory_instance is None: + _factory_instance = LanguageAnalyzerFactory() + return _factory_instance + + +def get_analyzer(language: str = None, file_path: str = None) -> LanguageAnalyzer: + """ + Convenience function to get a language analyzer. + + Args: + language: Language name (if known) + file_path: File path (for extension-based detection) + + Returns: + Appropriate language analyzer + """ + return get_analyzer_factory().get_analyzer(language, file_path) + + +def register_custom_analyzer(language: str, analyzer_class: Type[LanguageAnalyzer]) -> None: + """ + Register a custom language analyzer. + + Args: + language: Language name + analyzer_class: Custom analyzer class + """ + get_analyzer_factory().register_analyzer(language, analyzer_class) + + +def get_supported_languages() -> Set[str]: + """Get set of all supported languages.""" + return get_analyzer_factory().get_supported_languages() diff --git a/src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py new file mode 100644 index 0000000..72228c4 --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py @@ -0,0 +1,410 @@ +""" +JavaScript/TypeScript language-specific SCIP symbol analyzer. + +This module handles JavaScript and TypeScript specific logic for import parsing, +dependency classification, and symbol metadata extraction. +""" + +import logging +from typing import Dict, List, Optional, Any, Set +from .base import BaseLanguageAnalyzer +from ..symbol_definitions import ImportGroup + +logger = logging.getLogger(__name__) + + +class JavaScriptAnalyzer(BaseLanguageAnalyzer): + """ + JavaScript/TypeScript language-specific SCIP symbol analyzer. + + Handles JavaScript and TypeScript specific import parsing, dependency + classification, and symbol metadata extraction. + """ + + def _get_language_name(self) -> str: + return "javascript" + + def _build_standard_library_modules(self) -> Set[str]: + """Build JavaScript/Node.js built-in modules set.""" + return { + # Node.js built-in modules + 'assert', 'async_hooks', 'buffer', 'child_process', 'cluster', + 'console', 'constants', 'crypto', 'dgram', 'dns', 'domain', + 'events', 'fs', 'http', 'http2', 'https', 'inspector', + 'module', 'net', 'os', 'path', 'perf_hooks', 'process', + 'punycode', 'querystring', 'readline', 'repl', 'stream', + 'string_decoder', 'timers', 'tls', 'trace_events', 'tty', + 'url', 'util', 'v8', 'vm', 'worker_threads', 'zlib', + + # Web APIs (for browser environment) + 'window', 'document', 'navigator', 'location', 'history', + 'localStorage', 'sessionStorage', 'fetch', 'XMLHttpRequest', + 'WebSocket', 'Worker', 'ServiceWorker', 'MessageChannel', + 'BroadcastChannel', 'AbortController', 'URL', 'URLSearchParams', + 'Blob', 'File', 'FileReader', 'FormData', 'Headers', + 'Request', 'Response', 'ReadableStream', 'WritableStream', + 'TransformStream', 'TextEncoder', 'TextDecoder', + 'Intl', 'JSON', 'Math', 'Date', 'RegExp', 'Promise', + 'Proxy', 'Reflect', 'Symbol', 'Map', 'Set', 'WeakMap', + 'WeakSet', 'ArrayBuffer', 'DataView', 'Int8Array', + 'Uint8Array', 'Int16Array', 'Uint16Array', 'Int32Array', + 'Uint32Array', 'Float32Array', 'Float64Array', 'BigInt64Array', + 'BigUint64Array' + } + + def _classify_dependency_impl(self, module_name: str) -> str: + """ + Classify JavaScript/TypeScript dependency based on module patterns. + + Args: + module_name: Module name to classify + + Returns: + Classification: 'standard_library', 'third_party', or 'local' + """ + # Local imports (relative paths) + if module_name.startswith('./') or module_name.startswith('../'): + return 'local' + + # Absolute local imports (no node_modules) + if module_name.startswith('/') or module_name.startswith('~'): + return 'local' + + # Check for common project patterns + if any(pattern in module_name for pattern in ['src/', 'lib/', 'app/', '@/']): + return 'local' + + # Node.js built-in modules + base_module = module_name.split('/')[0] + if base_module in self.get_standard_library_modules(): + return 'standard_library' + + # Check for common scoped packages (third-party) + if module_name.startswith('@'): + return 'third_party' + + # Common third-party indicators + third_party_indicators = { + 'react', 'vue', 'angular', 'jquery', 'lodash', 'moment', + 'express', 'koa', 'fastify', 'webpack', 'babel', 'eslint', + 'typescript', 'jest', 'mocha', 'chai', 'sinon', 'cypress', + 'puppeteer', 'playwright', 'storybook', 'next', 'nuxt', + 'gatsby', 'vite', 'rollup', 'parcel', 'styled-components', + 'emotion', 'material-ui', 'antd', 'bootstrap', 'tailwind' + } + + if base_module in third_party_indicators: + return 'third_party' + + # Everything else is likely third_party in JavaScript ecosystem + return 'third_party' + + def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: + """ + Extract JavaScript/TypeScript imports from SCIP document. + + Args: + document: SCIP document containing symbols and occurrences + imports: ImportGroup to populate with extracted imports + symbol_parser: Optional SCIPSymbolManager for enhanced parsing + """ + try: + seen_modules = set() + + if symbol_parser: + # Extract using symbol parser + for occurrence in document.occurrences: + if not self.is_import_occurrence(occurrence): + continue + + symbol_info = symbol_parser.parse_symbol(occurrence.symbol) + if not symbol_info: + continue + + # Handle different manager types + if symbol_info.manager == 'npm': + # npm packages + package_name = symbol_info.package or self._extract_package_from_descriptors(symbol_info.descriptors) + if package_name and package_name not in seen_modules: + classification = self.classify_dependency(package_name) + imports.add_import(package_name, classification) + seen_modules.add(package_name) + + elif symbol_info.manager in ['builtin', 'node']: + # Node.js built-ins + module_name = self._extract_module_from_descriptors(symbol_info.descriptors) + if module_name and module_name not in seen_modules: + imports.add_import(module_name, 'standard_library') + seen_modules.add(module_name) + + elif symbol_info.manager == 'local': + # Local imports + module_path = self._extract_local_module_path(symbol_info.descriptors) + if module_path and module_path not in seen_modules: + imports.add_import(module_path, 'local') + seen_modules.add(module_path) + + else: + # Fallback: basic extraction without symbol parser + self._extract_imports_fallback(document, imports, seen_modules) + + logger.debug(f"Extracted {len(seen_modules)} JavaScript imports") + + except Exception as e: + logger.debug(f"Error extracting JavaScript imports: {e}") + + def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: + """ + Extract JavaScript/TypeScript specific symbol metadata. + + Args: + symbol_info: SCIP symbol information + document: SCIP document + + Returns: + Dictionary with JavaScript/TypeScript specific metadata + """ + metadata = { + 'language': 'javascript', + 'source': 'javascript_analyzer' + } + + try: + # Extract type information (especially for TypeScript) + if hasattr(symbol_info, 'signature') and symbol_info.signature: + signature = symbol_info.signature + metadata['signature'] = signature + + # Parse TypeScript-specific patterns + if '=>' in signature: + metadata['is_arrow_function'] = True + + if 'async' in signature: + metadata['is_async'] = True + + if 'export' in signature: + metadata['is_exported'] = True + + if 'default' in signature: + metadata['is_default_export'] = True + + # Parse function parameters + if '(' in signature and ')' in signature: + params = self._parse_js_parameters(signature) + if params: + metadata['parameters'] = params + + # Parse return type (TypeScript) + if ':' in signature and '=>' not in signature: + parts = signature.split(':') + if len(parts) > 1: + type_part = parts[-1].strip() + metadata['type'] = type_part + + # Extract symbol characteristics + symbol = getattr(symbol_info, 'symbol', '') + if symbol: + metadata['is_class'] = self._is_js_class(symbol) + metadata['is_interface'] = self._is_ts_interface(symbol) + metadata['is_type'] = self._is_ts_type(symbol) + metadata['is_enum'] = self._is_ts_enum(symbol) + metadata['is_namespace'] = self._is_ts_namespace(symbol) + metadata['scope'] = self._classify_js_scope(symbol) + + # Extract JSDoc documentation + if hasattr(symbol_info, 'documentation') and symbol_info.documentation: + metadata['documentation'] = symbol_info.documentation + metadata['has_jsdoc'] = any('@' in line for line in symbol_info.documentation) + + except Exception as e: + logger.debug(f"Error extracting JavaScript metadata: {e}") + metadata['extraction_error'] = str(e) + + return metadata + + def _extract_package_from_descriptors(self, descriptors: str) -> Optional[str]: + """ + Extract package name from SCIP descriptors for JavaScript. + + Args: + descriptors: SCIP descriptors string + + Returns: + Package name or None + """ + try: + # Handle descriptors like 'react/' or 'lodash/map' + if '/' in descriptors: + package_part = descriptors.split('/')[0] + # Handle scoped packages like @types/node + if package_part.startswith('@'): + parts = descriptors.split('/') + if len(parts) >= 2: + return f"{parts[0]}/{parts[1]}" + return package_part + return descriptors.strip('/') + except Exception: + return None + + def _extract_local_module_path(self, descriptors: str) -> Optional[str]: + """ + Extract local module path from descriptors for JavaScript. + + Args: + descriptors: SCIP descriptors string + + Returns: + Module path or None + """ + try: + # Handle local JavaScript imports + if '/' in descriptors: + parts = descriptors.split('/') + if len(parts) >= 1: + file_part = parts[0] + # Remove common JavaScript extensions + for ext in ['.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs']: + if file_part.endswith(ext): + file_part = file_part[:-len(ext)] + break + return file_part + return None + except Exception: + return None + + def _extract_imports_fallback(self, document, imports: ImportGroup, seen_modules: Set[str]) -> None: + """Fallback import extraction without symbol parser.""" + try: + for occurrence in document.occurrences: + if not self.is_import_occurrence(occurrence): + continue + + symbol = occurrence.symbol + module_name = self.extract_module_from_symbol(symbol) + if module_name and module_name not in seen_modules: + classification = self.classify_dependency(module_name) + imports.add_import(module_name, classification) + seen_modules.add(module_name) + except Exception as e: + logger.debug(f"Error in JavaScript fallback import extraction: {e}") + + def _parse_js_parameters(self, signature: str) -> List[str]: + """ + Parse parameter names from JavaScript/TypeScript function signature. + + Args: + signature: Function signature string + + Returns: + List of parameter names + """ + try: + if '(' in signature and ')' in signature: + # Find the parameter section + start = signature.find('(') + end = signature.find(')', start) + if start < end: + param_section = signature[start + 1:end] + if not param_section.strip(): + return [] + + params = [] + # Split by comma, but be careful of nested parentheses and generics + current_param = "" + paren_depth = 0 + bracket_depth = 0 + + for char in param_section: + if char == '(': + paren_depth += 1 + elif char == ')': + paren_depth -= 1 + elif char == '<': + bracket_depth += 1 + elif char == '>': + bracket_depth -= 1 + elif char == ',' and paren_depth == 0 and bracket_depth == 0: + params.append(current_param.strip()) + current_param = "" + continue + + current_param += char + + if current_param.strip(): + params.append(current_param.strip()) + + # Extract just parameter names (before : or =) + param_names = [] + for param in params: + # Handle destructuring and rest parameters + param = param.strip() + if param.startswith('...'): + param = param[3:].strip() + + # Extract name before type annotation or default value + if ':' in param: + param = param.split(':')[0].strip() + elif '=' in param: + param = param.split('=')[0].strip() + + if param and not param.startswith('{') and not param.startswith('['): + param_names.append(param) + + return param_names + except Exception as e: + logger.debug(f"Error parsing JavaScript parameters: {e}") + + return [] + + def _is_js_class(self, symbol: str) -> bool: + """Check if symbol represents a JavaScript class.""" + try: + return 'class' in symbol.lower() or '/Class' in symbol + except Exception: + return False + + def _is_ts_interface(self, symbol: str) -> bool: + """Check if symbol represents a TypeScript interface.""" + try: + return 'interface' in symbol.lower() or '/Interface' in symbol + except Exception: + return False + + def _is_ts_type(self, symbol: str) -> bool: + """Check if symbol represents a TypeScript type alias.""" + try: + return 'type' in symbol.lower() and not 'typeof' in symbol.lower() + except Exception: + return False + + def _is_ts_enum(self, symbol: str) -> bool: + """Check if symbol represents a TypeScript enum.""" + try: + return 'enum' in symbol.lower() or '/Enum' in symbol + except Exception: + return False + + def _is_ts_namespace(self, symbol: str) -> bool: + """Check if symbol represents a TypeScript namespace.""" + try: + return 'namespace' in symbol.lower() or '/Namespace' in symbol + except Exception: + return False + + def _classify_js_scope(self, symbol: str) -> str: + """ + Classify JavaScript symbol scope. + + Args: + symbol: SCIP symbol string + + Returns: + Scope classification + """ + # Basic scope classification for JavaScript + if '//' in symbol or symbol.count('/') > 2: + return 'nested' + elif '/' in symbol: + return 'module' + else: + return 'global' \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py new file mode 100644 index 0000000..6de9c5c --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py @@ -0,0 +1,366 @@ +""" +Objective-C language-specific SCIP symbol analyzer. + +This module handles Objective-C specific logic extracted from the monolithic +SCIPSymbolAnalyzer, including framework detection and system library classification. +""" + +import logging +from typing import Dict, List, Optional, Any, Set +from .base import BaseLanguageAnalyzer +from ..symbol_definitions import ImportGroup + +logger = logging.getLogger(__name__) + + +class ObjectiveCAnalyzer(BaseLanguageAnalyzer): + """ + Objective-C language-specific SCIP symbol analyzer. + + Handles Objective-C specific framework imports, system library detection, + and symbol metadata extraction. + """ + + def _get_language_name(self) -> str: + return "objective-c" + + def _build_standard_library_modules(self) -> Set[str]: + """Build comprehensive Objective-C system frameworks set.""" + return { + # Core frameworks (iOS and macOS) + 'Foundation', 'CoreFoundation', 'CoreData', 'CoreGraphics', + 'QuartzCore', 'CoreAnimation', 'CoreImage', 'CoreText', + 'Security', 'SystemConfiguration', 'CFNetwork', + + # UI frameworks + 'UIKit', 'AppKit', 'Cocoa', 'SwiftUI', + + # Media frameworks + 'AVFoundation', 'AVKit', 'AudioToolbox', 'AudioUnit', + 'VideoToolbox', 'MediaPlayer', 'Photos', 'PhotosUI', + 'CoreAudio', 'CoreMIDI', 'CoreMedia', 'ImageIO', + + # Graphics and gaming + 'Metal', 'MetalKit', 'GameplayKit', 'SpriteKit', 'SceneKit', + 'GLKit', 'OpenGLES', 'CoreMotion', 'ARKit', 'RealityKit', + + # Location and maps + 'CoreLocation', 'MapKit', 'Contacts', 'ContactsUI', + + # Web and networking + 'WebKit', 'JavaScriptCore', 'NetworkExtension', + + # Data and storage + 'CloudKit', 'CoreSpotlight', 'EventKit', 'EventKitUI', + 'HealthKit', 'HealthKitUI', 'HomeKit', 'HomeKitUI', + + # Device and sensors + 'CoreBluetooth', 'ExternalAccessory', 'CoreNFC', + 'CoreTelephony', 'CallKit', 'PushKit', + + # Machine learning and AI + 'CoreML', 'Vision', 'NaturalLanguage', 'Speech', + 'SoundAnalysis', + + # Development tools + 'XCTest', 'os', 'Accelerate', 'simd', + + # Legacy frameworks + 'AddressBook', 'AddressBookUI', 'AssetsLibrary', + 'MobileCoreServices', 'Social', 'Accounts', + + # watchOS specific + 'WatchKit', 'ClockKit', 'WatchConnectivity', + + # tvOS specific + 'TVUIKit', 'TVMLKit', + + # macOS specific + 'Carbon', 'ApplicationServices', 'CoreServices', + 'IOKit', 'DiskArbitration', 'FSEvents', 'ServiceManagement', + 'LaunchServices', 'SearchKit', 'PreferencePanes', + 'InstantMessage', 'Automator', 'CalendarStore', + 'Collaboration', 'CoreWLAN', 'DiscRecording', + 'DiscRecordingUI', 'DVDPlayback', 'ExceptionHandling', + 'FWAUserLib', 'InstallerPlugins', 'IOBluetooth', + 'IOBluetoothUI', 'Kernel', 'LDAP', 'Message', + 'OpenDirectory', 'OSAKit', 'PubSub', 'QTKit', + 'Quartz', 'QuartzComposer', 'QuickLook', 'ScreenSaver', + 'ScriptingBridge', 'SyncServices', 'Tcl', 'Tk', + 'WebKit', 'XgridFoundation' + } + + def _classify_dependency_impl(self, module_name: str) -> str: + """ + Classify Objective-C dependency based on framework patterns. + + Args: + module_name: Framework/module name to classify + + Returns: + Classification: 'standard_library', 'third_party', or 'local' + """ + # Local imports (project-specific) + if any(pattern in module_name for pattern in ['.', '/', 'Private', 'Internal']): + return 'local' + + # System frameworks check + if module_name in self.get_standard_library_modules(): + return 'standard_library' + + # Third-party framework indicators + third_party_indicators = { + 'AFNetworking', 'Alamofire', 'SDWebImage', 'MBProgressHUD', + 'JSONModel', 'RestKit', 'Firebase', 'ReactiveCocoa', + 'Masonry', 'SnapKit', 'Realm', 'FMDB', 'SQLite', + 'GoogleAnalytics', 'Fabric', 'Crashlytics', 'TestFlight', + 'Facebook', 'Twitter', 'Instagram', 'Pods' + } + + for indicator in third_party_indicators: + if indicator in module_name: + return 'third_party' + + # CocoaPods/Carthage patterns + if any(pattern in module_name for pattern in ['Pod', 'Carthage', 'SPM']): + return 'third_party' + + # Default to standard_library for unknown frameworks + # (Objective-C tends to have many system frameworks) + return 'standard_library' + + def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: + """ + Extract Objective-C imports from SCIP document. + + Args: + document: SCIP document containing symbols and occurrences + imports: ImportGroup to populate with extracted imports + symbol_parser: Optional SCIPSymbolManager for enhanced parsing + """ + try: + seen_modules = set() + + # Method 1: Extract from occurrences with Import role + if symbol_parser: + for occurrence in document.occurrences: + if not self.is_import_occurrence(occurrence): + continue + + symbol_info = symbol_parser.parse_symbol(occurrence.symbol) + if not symbol_info: + continue + + # Handle based on manager type + if symbol_info.manager in ['system', 'framework']: + framework_name = symbol_info.package or self._extract_framework_from_descriptors(symbol_info.descriptors) + if framework_name and framework_name not in seen_modules: + imports.add_import(framework_name, 'standard_library') + seen_modules.add(framework_name) + + elif symbol_info.manager in ['cocoapods', 'carthage', 'third_party']: + package_name = symbol_info.package or self._extract_framework_from_descriptors(symbol_info.descriptors) + if package_name and package_name not in seen_modules: + imports.add_import(package_name, 'third_party') + seen_modules.add(package_name) + + elif symbol_info.manager == 'local': + module_path = self._extract_local_module_path(symbol_info.descriptors) + if module_path and module_path not in seen_modules: + imports.add_import(module_path, 'local') + seen_modules.add(module_path) + + # Method 2: Extract from external symbols (if available in index) + # This handles frameworks detected during indexing but not in occurrences + self._extract_from_external_symbols_if_available(imports, seen_modules, symbol_parser) + + logger.debug(f"Extracted {len(seen_modules)} Objective-C imports/frameworks") + + except Exception as e: + logger.debug(f"Error extracting Objective-C imports: {e}") + + def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: + """ + Extract Objective-C specific symbol metadata. + + Args: + symbol_info: SCIP symbol information + document: SCIP document + + Returns: + Dictionary with Objective-C specific metadata + """ + metadata = { + 'language': 'objective-c', + 'source': 'objc_analyzer' + } + + try: + # Extract method signature patterns + if hasattr(symbol_info, 'signature') and symbol_info.signature: + signature = symbol_info.signature + metadata['signature'] = signature + + # Parse Objective-C method patterns + if signature.startswith('-') or signature.startswith('+'): + metadata['is_method'] = True + metadata['is_instance_method'] = signature.startswith('-') + metadata['is_class_method'] = signature.startswith('+') + + # Parse method parameters (Objective-C style) + if ':' in signature: + metadata['parameter_count'] = signature.count(':') + metadata['method_labels'] = self._extract_method_labels(signature) + + # Parse return type + if ')' in signature and '(' in signature: + return_type_match = signature.split(')') + if len(return_type_match) > 0: + return_type = return_type_match[0].strip('(+-') + if return_type: + metadata['return_type'] = return_type + + # Extract property characteristics + symbol = getattr(symbol_info, 'symbol', '') + if symbol: + metadata['is_property'] = self._is_objc_property(symbol) + metadata['is_protocol'] = self._is_objc_protocol(symbol) + metadata['is_category'] = self._is_objc_category(symbol) + metadata['framework'] = self._extract_framework_from_symbol(symbol) + + # Extract documentation + if hasattr(symbol_info, 'documentation') and symbol_info.documentation: + metadata['documentation'] = symbol_info.documentation + + except Exception as e: + logger.debug(f"Error extracting Objective-C metadata: {e}") + metadata['extraction_error'] = str(e) + + return metadata + + def _extract_framework_from_descriptors(self, descriptors: str) -> Optional[str]: + """ + Extract framework name from SCIP descriptors for Objective-C. + + Args: + descriptors: SCIP descriptors string + + Returns: + Framework name or None + """ + try: + # Handle descriptors like 'Foundation/' or 'UIKit/UIView' + if '/' in descriptors: + return descriptors.split('/')[0] + return descriptors.strip('/') + except Exception: + return None + + def _extract_local_module_path(self, descriptors: str) -> Optional[str]: + """ + Extract local module path from descriptors for Objective-C. + + Args: + descriptors: SCIP descriptors string + + Returns: + Module path or None + """ + try: + # Handle local Objective-C files + if '/' in descriptors: + parts = descriptors.split('/') + if len(parts) >= 2: + file_part = parts[0] + if file_part.endswith('.h') or file_part.endswith('.m'): + return file_part + return file_part + return None + except Exception: + return None + + def _extract_from_external_symbols_if_available(self, imports: ImportGroup, seen_modules: Set[str], symbol_parser) -> None: + """ + Extract additional imports from external symbols if available. + This method would be called with the full SCIP index if available. + """ + # This method would need to be integrated with the main analyzer + # to access external symbols from the SCIP index + pass + + def _extract_method_labels(self, signature: str) -> List[str]: + """ + Extract Objective-C method labels from signature. + + Args: + signature: Method signature string + + Returns: + List of method labels + """ + try: + # Parse Objective-C method signature like: "-(void)setName:(NSString*)name withAge:(int)age" + labels = [] + parts = signature.split(':') + for part in parts[:-1]: # Exclude last part after final : + # Extract the label (word before the colon) + words = part.strip().split() + if words: + label = words[-1] + if label and not label.startswith('(') and not label.startswith('-') and not label.startswith('+'): + labels.append(label) + return labels + except Exception: + return [] + + def _is_objc_property(self, symbol: str) -> bool: + """Check if symbol represents an Objective-C property.""" + try: + # Properties often have specific patterns in SCIP symbols + return '@property' in symbol or 'property' in symbol.lower() + except Exception: + return False + + def _is_objc_protocol(self, symbol: str) -> bool: + """Check if symbol represents an Objective-C protocol.""" + try: + return '@protocol' in symbol or 'protocol' in symbol.lower() + except Exception: + return False + + def _is_objc_category(self, symbol: str) -> bool: + """Check if symbol represents an Objective-C category.""" + try: + # Categories often have + in their symbol representation + return '(' in symbol and ')' in symbol + except Exception: + return False + + def _extract_framework_from_symbol(self, symbol: str) -> Optional[str]: + """ + Extract framework name from SCIP symbol string. + + Args: + symbol: SCIP symbol string + + Returns: + Framework name or None + """ + try: + # Handle various SCIP symbol formats for frameworks + if 'Foundation' in symbol: + return 'Foundation' + elif 'UIKit' in symbol: + return 'UIKit' + # Add more specific framework detection as needed + + # Generic extraction from symbol structure + if ' ' in symbol: + parts = symbol.split() + for part in parts: + if part in self.get_standard_library_modules(): + return part + + return None + except Exception: + return None diff --git a/src/code_index_mcp/tools/scip/analyzers/python_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/python_analyzer.py new file mode 100644 index 0000000..10aea2d --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/python_analyzer.py @@ -0,0 +1,400 @@ +""" +Python language-specific SCIP symbol analyzer. + +This module handles Python-specific logic extracted from the monolithic +SCIPSymbolAnalyzer, following the refactoring plan for modular architecture. +""" + +import logging +from typing import Dict, List, Optional, Any, Set +from .base import BaseLanguageAnalyzer +from ..symbol_definitions import ImportGroup + +logger = logging.getLogger(__name__) + + +class PythonAnalyzer(BaseLanguageAnalyzer): + """ + Python language-specific SCIP symbol analyzer. + + Handles Python-specific import parsing, dependency classification, + and symbol metadata extraction. + """ + + def _get_language_name(self) -> str: + return "python" + + def _build_standard_library_modules(self) -> Set[str]: + """Build comprehensive Python standard library module set.""" + return { + # Core modules + 'os', 'sys', 'json', 'time', 'datetime', 'logging', 'pathlib', + 'typing', 'dataclasses', 'functools', 'itertools', 'collections', + 're', 'math', 'random', 'threading', 'subprocess', 'shutil', + 'contextlib', 'traceback', 'warnings', 'weakref', 'copy', + 'pickle', 'base64', 'hashlib', 'hmac', 'uuid', 'urllib', + 'http', 'socketserver', 'email', 'mimetypes', 'csv', 'configparser', + 'argparse', 'getopt', 'tempfile', 'glob', 'fnmatch', 'linecache', + 'pprint', 'textwrap', 'string', 'struct', 'codecs', 'unicodedata', + 'io', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', + + # Network and web + 'socket', 'ssl', 'ftplib', 'poplib', 'imaplib', 'smtplib', + 'xmlrpc', 'webbrowser', + + # Data formats + 'xml', 'html', 'sqlite3', 'dbm', 'marshal', + + # Development tools + 'unittest', 'doctest', 'pdb', 'profile', 'cProfile', 'timeit', + 'trace', 'cgitb', 'py_compile', 'compileall', 'dis', 'pickletools', + + # System services + 'errno', 'ctypes', 'syslog', 'curses', 'platform', + + # Internationalization + 'locale', 'gettext', + + # Multimedia + 'audioop', 'wave', 'chunk', 'sunau', 'aifc', 'colorsys', + + # Cryptographic services + 'secrets', 'hashlib', 'hmac', + + # File and directory access + 'stat', 'fileinput', 'filecmp', 'shutil', 'macpath', + + # Data persistence + 'shelve', 'copyreg', + + # Data compression and archiving + 'zlib', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', + + # File formats + 'csv', 'netrc', 'xdrlib', 'plistlib', + + # Internet protocols and support + 'ipaddress', 'mailbox', 'mimetypes', + + # Structured markup processing tools + 'html', 'xml', + + # Internet data handling + 'json', 'base64', 'binascii', 'uu', 'quopri', + + # Numeric and mathematical modules + 'numbers', 'decimal', 'fractions', 'statistics', 'cmath', + + # Functional programming modules + 'operator', 'functools', 'itertools', + + # Python language services + 'ast', 'symtable', 'symbol', 'token', 'tokenize', 'keyword', + 'tabnanny', 'pyclbr', 'py_compile', 'compileall', 'dis', + 'pickletools', 'distutils', + + # Importing modules + 'importlib', 'pkgutil', 'modulefinder', 'runpy', + + # Python runtime services + 'atexit', 'gc', 'inspect', 'site', '__future__', '__main__', + + # Custom Python interpreters + 'code', 'codeop', + + # MS Windows specific services + 'msvcrt', 'winreg', 'winsound', + + # Unix specific services + 'posix', 'pwd', 'grp', 'crypt', 'termios', 'tty', 'pty', + 'fcntl', 'pipes', 'resource', 'nis', 'syslog', + + # Superseded modules + 'optparse', 'imp' + } + + def _classify_dependency_impl(self, module_name: str) -> str: + """ + Classify Python dependency based on module patterns. + + Args: + module_name: Python module name to classify + + Returns: + Classification: 'standard_library', 'third_party', or 'local' + """ + # Local imports (relative imports or project-specific patterns) + if module_name.startswith('.'): + return 'local' + + # Check for common project patterns + if any(pattern in module_name for pattern in ['src.', 'lib.', 'app.', 'project.']): + return 'local' + + # Standard library check + base_module = module_name.split('.')[0] + if base_module in self.get_standard_library_modules(): + return 'standard_library' + + # Everything else is third_party + return 'third_party' + + def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: + """ + Extract Python imports from SCIP document. + + Args: + document: SCIP document containing symbols and occurrences + imports: ImportGroup to populate with extracted imports + symbol_parser: Optional SCIPSymbolManager for enhanced parsing + """ + if not symbol_parser: + logger.debug("No symbol parser available for Python import extraction") + return + + try: + seen_modules = set() + + # Extract from occurrences with Import role + for occurrence in document.occurrences: + if not self.is_import_occurrence(occurrence): + continue + + symbol_info = symbol_parser.parse_symbol(occurrence.symbol) + if not symbol_info: + continue + + # Handle based on manager type + if symbol_info.manager == 'stdlib': + module_name = self._extract_module_from_descriptors(symbol_info.descriptors) + if module_name and module_name not in seen_modules: + imports.add_import(module_name, 'standard_library') + seen_modules.add(module_name) + + elif symbol_info.manager == 'pip': + # pip packages: package name is the module name + package_name = symbol_info.package + if package_name and package_name not in seen_modules: + imports.add_import(package_name, 'third_party') + seen_modules.add(package_name) + + elif symbol_info.manager == 'local': + # Local imports: extract module path from descriptors + module_path = self._extract_local_module_path(symbol_info.descriptors) + if module_path and module_path not in seen_modules: + imports.add_import(module_path, 'local') + seen_modules.add(module_path) + + logger.debug(f"Extracted {len(seen_modules)} Python imports") + + except Exception as e: + logger.debug(f"Error extracting Python imports: {e}") + + def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: + """ + Extract Python-specific symbol metadata. + + Args: + symbol_info: SCIP symbol information + document: SCIP document + + Returns: + Dictionary with Python-specific metadata + """ + metadata = { + 'language': 'python', + 'source': 'python_analyzer' + } + + try: + # Extract documentation/docstring + if hasattr(symbol_info, 'documentation') and symbol_info.documentation: + metadata['documentation'] = symbol_info.documentation + + # Parse special documentation markers from Python AST analyzer + for doc_line in symbol_info.documentation: + if doc_line.startswith('Parameters: '): + param_str = doc_line[12:] + metadata['parameters'] = [p.strip() for p in param_str.split(',') if p.strip()] + elif doc_line == 'Async function': + metadata['is_async'] = True + elif doc_line.startswith('Decorators: '): + decorator_str = doc_line[12:] + metadata['decorators'] = [d.strip() for d in decorator_str.split(',') if d.strip()] + + # Extract type information from signature + if hasattr(symbol_info, 'signature') and symbol_info.signature: + signature = symbol_info.signature + metadata['signature'] = signature + + # Parse return type + if '->' in signature: + return_type = signature.split('->')[-1].strip() + metadata['return_type'] = return_type + + # Parse parameters from signature + if '(' in signature and ')' in signature and 'parameters' not in metadata: + metadata['parameters'] = self._parse_signature_parameters(signature) + + # Parse variable type annotation + if ':' in signature and '->' not in signature: + type_part = signature.split(':')[1].strip() + metadata['type'] = type_part + + # Parse constant value + if '=' in signature: + value_part = signature.split('=')[1].strip() + metadata['value'] = value_part + + # Classify symbol role + symbol = getattr(symbol_info, 'symbol', '') + if symbol: + metadata['scope'] = self._classify_symbol_scope(symbol) + metadata['is_private'] = self._is_private_symbol(symbol) + metadata['is_dunder'] = self._is_dunder_method(symbol) + + except Exception as e: + logger.debug(f"Error extracting Python metadata: {e}") + metadata['extraction_error'] = str(e) + + return metadata + + def _extract_module_from_descriptors(self, descriptors: str) -> Optional[str]: + """ + Extract module name from SCIP descriptors for Python. + + Args: + descriptors: SCIP descriptors string + + Returns: + Module name or None + """ + try: + # Handle descriptors like 'os/' or 'pathlib/Path' + if '/' in descriptors: + return descriptors.split('/')[0] + return descriptors.strip('/') + except Exception: + return None + + def _extract_local_module_path(self, descriptors: str) -> Optional[str]: + """ + Extract local module path from descriptors for Python. + + Args: + descriptors: SCIP descriptors string + + Returns: + Module path or None + """ + try: + # Handle descriptors like 'utils.py/helper_function' -> 'utils' + # or 'services/user_service.py/UserService' -> 'services.user_service' + if '/' in descriptors: + parts = descriptors.split('/') + if len(parts) >= 2: + file_part = parts[0] + if file_part.endswith('.py'): + return file_part[:-3].replace('/', '.') + return file_part.replace('/', '.') + return None + except Exception: + return None + + def _parse_signature_parameters(self, signature: str) -> List[str]: + """ + Parse parameter names from Python function signature. + + Args: + signature: Function signature string + + Returns: + List of parameter names + """ + try: + if '(' in signature and ')' in signature: + param_section = signature.split('(')[1].split(')')[0] + if not param_section.strip(): + return [] + + params = [] + for param in param_section.split(','): + param = param.strip() + if param: + # Extract parameter name (before type annotation) + param_name = param.split(':')[0].strip() + if param_name: + params.append(param_name) + + return params + except Exception as e: + logger.debug(f"Error parsing Python signature parameters: {e}") + + return [] + + def _classify_symbol_scope(self, symbol: str) -> str: + """ + Classify Python symbol scope (global, class, function). + + Args: + symbol: SCIP symbol string + + Returns: + Scope classification + """ + if '#' not in symbol: + return 'global' + elif symbol.count('#') == 1: + return 'class' + else: + return 'function' + + def _is_private_symbol(self, symbol: str) -> bool: + """ + Check if symbol is private (starts with underscore). + + Args: + symbol: SCIP symbol string + + Returns: + True if symbol appears to be private + """ + try: + # Extract symbol name from various SCIP formats + if '#' in symbol: + name = symbol.split('#')[-1] + elif '/' in symbol: + name = symbol.split('/')[-1] + else: + name = symbol.split('.')[-1] + + # Clean up name + name = name.rstrip('().#') + return name.startswith('_') and not name.startswith('__') + except Exception: + return False + + def _is_dunder_method(self, symbol: str) -> bool: + """ + Check if symbol is a dunder (double underscore) method. + + Args: + symbol: SCIP symbol string + + Returns: + True if symbol appears to be a dunder method + """ + try: + # Extract symbol name + if '#' in symbol: + name = symbol.split('#')[-1] + elif '/' in symbol: + name = symbol.split('/')[-1] + else: + name = symbol.split('.')[-1] + + # Clean up name + name = name.rstrip('().#') + return name.startswith('__') and name.endswith('__') + except Exception: + return False diff --git a/src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py new file mode 100644 index 0000000..332950a --- /dev/null +++ b/src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py @@ -0,0 +1,300 @@ +""" +Zig language-specific SCIP symbol analyzer. + +This module handles Zig-specific logic extracted from the monolithic +SCIPSymbolAnalyzer, including Zig import classification and standard library detection. +""" + +import logging +from typing import Dict, List, Optional, Any, Set +from .base import BaseLanguageAnalyzer +from ..symbol_definitions import ImportGroup + +logger = logging.getLogger(__name__) + + +class ZigAnalyzer(BaseLanguageAnalyzer): + """ + Zig language-specific SCIP symbol analyzer. + + Handles Zig-specific import parsing, dependency classification, + and symbol metadata extraction. + """ + + def _get_language_name(self) -> str: + return "zig" + + def _build_standard_library_modules(self) -> Set[str]: + """Build comprehensive Zig standard library module set.""" + return { + # Core standard library + 'std', 'builtin', 'testing', + + # Data structures and algorithms + 'math', 'mem', 'sort', 'hash', 'crypto', + + # Text and formatting + 'fmt', 'ascii', 'unicode', 'json', + + # System interaction + 'os', 'fs', 'process', 'thread', 'atomic', + + # Networking and I/O + 'net', 'http', 'io', + + # Compression and encoding + 'compress', 'base64', + + # Development and debugging + 'debug', 'log', 'meta', 'comptime', + + # Utilities + 'rand', 'time', 'zig', + + # Platform-specific + 'c', 'wasm', + + # Build system + 'build', 'target' + } + + def _classify_dependency_impl(self, module_name: str) -> str: + """ + Classify Zig dependency based on module patterns. + + Args: + module_name: Zig module name to classify + + Returns: + Classification: 'standard_library', 'third_party', or 'local' + """ + # Local imports (relative paths or .zig files) + if (module_name.startswith('./') or + module_name.startswith('../') or + module_name.endswith('.zig')): + return 'local' + + # Standard library check + if module_name in self.get_standard_library_modules(): + return 'standard_library' + + # Check for common Zig package patterns + if any(pattern in module_name for pattern in ['zig-', 'pkg/', 'deps/']): + return 'third_party' + + # Everything else is third_party (Zig doesn't have as many stdlib modules as Python) + return 'third_party' + + def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: + """ + Extract Zig imports from SCIP document. + + Args: + document: SCIP document containing symbols and occurrences + imports: ImportGroup to populate with extracted imports + symbol_parser: Optional SCIPSymbolManager for enhanced parsing + """ + if not symbol_parser: + logger.debug("No symbol parser available for Zig import extraction") + return + + try: + seen_modules = set() + + # Extract from occurrences with Import role + for occurrence in document.occurrences: + if not self.is_import_occurrence(occurrence): + continue + + symbol_info = symbol_parser.parse_symbol(occurrence.symbol) + if not symbol_info: + continue + + # Handle Zig-specific patterns + if symbol_info.manager == 'local': + # Local imports: extract from descriptors + module_path = self._extract_zig_local_module_path(symbol_info.descriptors) + if module_path and module_path not in seen_modules: + import_type = self.classify_dependency(module_path) + imports.add_import(module_path, import_type) + seen_modules.add(module_path) + + elif symbol_info.manager in ['system', 'stdlib']: + # Standard library imports + module_name = self._extract_module_from_descriptors(symbol_info.descriptors) + if module_name and module_name not in seen_modules: + imports.add_import(module_name, 'standard_library') + seen_modules.add(module_name) + + elif symbol_info.manager in ['third_party', 'pkg']: + # Third-party packages + package_name = symbol_info.package or self._extract_module_from_descriptors(symbol_info.descriptors) + if package_name and package_name not in seen_modules: + imports.add_import(package_name, 'third_party') + seen_modules.add(package_name) + + logger.debug(f"Extracted {len(seen_modules)} Zig imports") + + except Exception as e: + logger.debug(f"Error extracting Zig imports: {e}") + + def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: + """ + Extract Zig-specific symbol metadata. + + Args: + symbol_info: SCIP symbol information + document: SCIP document + + Returns: + Dictionary with Zig-specific metadata + """ + metadata = { + 'language': 'zig', + 'source': 'zig_analyzer' + } + + try: + # Extract type information from signature + if hasattr(symbol_info, 'signature') and symbol_info.signature: + signature = symbol_info.signature + metadata['signature'] = signature + + # Parse Zig-specific type patterns + if ':' in signature: + # Variable/field type: name: Type + type_part = signature.split(':', 1)[1].strip() + metadata['type'] = type_part + + # Parse function return type (Zig uses different syntax) + if '!' in signature: + # Error union type + metadata['can_error'] = True + + if 'comptime' in signature: + metadata['is_comptime'] = True + + if 'pub' in signature: + metadata['is_public'] = True + else: + metadata['is_private'] = True + + # Extract documentation if available + if hasattr(symbol_info, 'documentation') and symbol_info.documentation: + metadata['documentation'] = symbol_info.documentation + + # Classify symbol characteristics + symbol = getattr(symbol_info, 'symbol', '') + if symbol: + metadata['scope'] = self._classify_zig_symbol_scope(symbol) + metadata['is_test'] = self._is_zig_test_symbol(symbol) + metadata['is_generic'] = self._is_zig_generic_symbol(symbol) + + except Exception as e: + logger.debug(f"Error extracting Zig metadata: {e}") + metadata['extraction_error'] = str(e) + + return metadata + + def _extract_zig_local_module_path(self, descriptors: str) -> Optional[str]: + """ + Extract local module path from descriptors for Zig. + + Args: + descriptors: SCIP descriptors string + + Returns: + Module path or None + """ + try: + # Handle Zig descriptors like: + # 'test/sample-projects/zig/code-index-example/src/main.zig/std.' -> 'std' + # 'src/utils.zig/helper_function' -> 'utils' + if '/' in descriptors: + parts = descriptors.split('/') + if len(parts) >= 2: + # For Zig: if we have a .zig file, the symbol after it is the import + for i, part in enumerate(parts): + if part.endswith('.zig') and i + 1 < len(parts): + # Next part is the imported symbol/module + symbol_name = parts[i + 1].rstrip('.') + return symbol_name + + # Fallback: traditional file-based extraction + file_part = parts[0] + if file_part.endswith('.zig'): + return file_part[:-4] # Remove .zig extension + return file_part + return None + except Exception: + return None + + def _extract_module_from_descriptors(self, descriptors: str) -> Optional[str]: + """ + Extract module name from SCIP descriptors for Zig. + + Args: + descriptors: SCIP descriptors string + + Returns: + Module name or None + """ + try: + # Handle descriptors like 'std/' or 'std/mem' + if '/' in descriptors: + return descriptors.split('/')[0] + return descriptors.strip('/.') + except Exception: + return None + + def _classify_zig_symbol_scope(self, symbol: str) -> str: + """ + Classify Zig symbol scope. + + Args: + symbol: SCIP symbol string + + Returns: + Scope classification + """ + # Zig doesn't use # for scope like other languages + if '/' in symbol: + parts = symbol.count('/') + if parts == 1: + return 'module' + elif parts >= 2: + return 'nested' + return 'global' + + def _is_zig_test_symbol(self, symbol: str) -> bool: + """ + Check if symbol is a Zig test. + + Args: + symbol: SCIP symbol string + + Returns: + True if symbol appears to be a test + """ + try: + # Zig tests often contain 'test' in their symbol path + return 'test' in symbol.lower() + except Exception: + return False + + def _is_zig_generic_symbol(self, symbol: str) -> bool: + """ + Check if symbol is a generic (comptime) function/type. + + Args: + symbol: SCIP symbol string + + Returns: + True if symbol appears to be generic + """ + try: + # This would require more sophisticated analysis + # For now, just check for common generic patterns + return 'comptime' in symbol.lower() or 'generic' in symbol.lower() + except Exception: + return False \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/__init__.py b/src/code_index_mcp/tools/scip/dependencies/__init__.py new file mode 100644 index 0000000..3b31207 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/__init__.py @@ -0,0 +1,33 @@ +""" +Unified dependency classification and management system. + +This package provides the dependency management system that replaces scattered +dependency logic throughout the SCIPSymbolAnalyzer, following the refactoring +plan for centralized and configurable dependency classification. + +Key Components: +- DependencyClassifier: Main dependency classification engine +- DependencyConfig: Abstract base for language-specific configurations +- DependencyRegistry: Centralized registry and caching system +- ImportNormalizer: Import path normalization utilities + +The system supports: +- Configurable classification rules per language +- Caching for performance optimization +- Standard library detection +- Third-party package identification +- Local/project import detection +- Custom classification rules +""" + +from .classifier import DependencyClassifier +from .registry import DependencyRegistry +from .normalizer import ImportNormalizer +from .configs import get_dependency_config + +__all__ = [ + 'DependencyClassifier', + 'DependencyRegistry', + 'ImportNormalizer', + 'get_dependency_config' +] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/classifier.py b/src/code_index_mcp/tools/scip/dependencies/classifier.py new file mode 100644 index 0000000..6a539c9 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/classifier.py @@ -0,0 +1,361 @@ +""" +Main dependency classifier engine. + +This module provides the centralized DependencyClassifier that replaces scattered +dependency logic throughout the SCIPSymbolAnalyzer, supporting configurable +classification rules per language. +""" + +import logging +from typing import Dict, Set, List, Optional, Any +from .configs import get_dependency_config, BaseDependencyConfig +from .registry import DependencyRegistry +from .normalizer import ImportNormalizer + +logger = logging.getLogger(__name__) + + +class DependencyClassifier: + """ + Main dependency classification engine. + + This class provides centralized dependency classification with support for: + - Language-specific classification rules + - Caching for performance optimization + - Context-aware classification + - Custom rule registration + - Batch processing capabilities + """ + + def __init__(self): + """Initialize the dependency classifier.""" + self._configs: Dict[str, BaseDependencyConfig] = {} + self._registry = DependencyRegistry() + self._normalizer = ImportNormalizer() + self._context_cache: Dict[str, Dict[str, Any]] = {} + + def classify_import( + self, + import_path: str, + language: str, + context: Optional[Dict[str, Any]] = None + ) -> str: + """ + Classify an import path based on language-specific rules. + + Args: + import_path: Import path to classify + language: Programming language + context: Optional context information (project structure, etc.) + + Returns: + Classification: 'standard_library', 'third_party', or 'local' + """ + if not import_path: + return 'local' + + # Normalize the import path + normalized_path = self._normalizer.normalize_import_path(import_path, language) + + # Check cache first + cache_key = f"{language}:{normalized_path}" + cached_result = self._registry.get_cached_classification(cache_key) + if cached_result is not None: + return cached_result + + # Get language-specific configuration + config = self._get_config(language) + + # Perform classification + classification = config.classify_import(normalized_path, context) + + # Cache the result + self._registry.cache_classification(cache_key, classification) + + logger.debug(f"Classified {import_path} ({language}) as {classification}") + return classification + + def classify_batch( + self, + imports: List[str], + language: str, + context: Optional[Dict[str, Any]] = None + ) -> Dict[str, str]: + """ + Classify multiple imports efficiently. + + Args: + imports: List of import paths to classify + language: Programming language + context: Optional context information + + Returns: + Dictionary mapping import_path -> classification + """ + results = {} + config = self._get_config(language) + + for import_path in imports: + if not import_path: + results[import_path] = 'local' + continue + + # Normalize the import path + normalized_path = self._normalizer.normalize_import_path(import_path, language) + + # Check cache first + cache_key = f"{language}:{normalized_path}" + cached_result = self._registry.get_cached_classification(cache_key) + + if cached_result is not None: + results[import_path] = cached_result + else: + # Perform classification + classification = config.classify_import(normalized_path, context) + results[import_path] = classification + + # Cache the result + self._registry.cache_classification(cache_key, classification) + + logger.debug(f"Classified {len(imports)} imports for {language}") + return results + + def get_standard_library_modules(self, language: str) -> Set[str]: + """ + Get standard library modules for a language. + + Args: + language: Programming language + + Returns: + Set of standard library module names + """ + config = self._get_config(language) + return config.get_stdlib_modules() + + def is_standard_library(self, import_path: str, language: str) -> bool: + """ + Check if an import is from the standard library. + + Args: + import_path: Import path to check + language: Programming language + + Returns: + True if import is from standard library + """ + return self.classify_import(import_path, language) == 'standard_library' + + def is_third_party(self, import_path: str, language: str) -> bool: + """ + Check if an import is third-party. + + Args: + import_path: Import path to check + language: Programming language + + Returns: + True if import is third-party + """ + return self.classify_import(import_path, language) == 'third_party' + + def is_local(self, import_path: str, language: str) -> bool: + """ + Check if an import is local. + + Args: + import_path: Import path to check + language: Programming language + + Returns: + True if import is local + """ + return self.classify_import(import_path, language) == 'local' + + def register_custom_config(self, language: str, config: BaseDependencyConfig) -> None: + """ + Register a custom dependency configuration for a language. + + Args: + language: Language name + config: Custom configuration instance + """ + self._configs[language.lower()] = config + logger.debug(f"Registered custom dependency config for {language}") + + def update_context(self, project_path: str, context: Dict[str, Any]) -> None: + """ + Update context information for a project. + + Args: + project_path: Path to the project + context: Context information to cache + """ + self._context_cache[project_path] = context + logger.debug(f"Updated context for project: {project_path}") + + def get_context(self, project_path: str) -> Optional[Dict[str, Any]]: + """ + Get cached context information for a project. + + Args: + project_path: Path to the project + + Returns: + Cached context or None + """ + return self._context_cache.get(project_path) + + def extract_dependencies_from_file( + self, + file_path: str, + file_content: str, + language: str + ) -> List[str]: + """ + Extract dependencies from package manager files. + + Args: + file_path: Path to the package manager file + file_content: Content of the file + language: Programming language + + Returns: + List of dependency names + """ + config = self._get_config(language) + return config.extract_dependencies_from_file(file_path, file_content) + + def get_package_manager_files(self, language: str) -> Set[str]: + """ + Get package manager files for a language. + + Args: + language: Programming language + + Returns: + Set of package manager file names + """ + config = self._get_config(language) + return config.get_package_manager_files() + + def detect_package_version( + self, + package_name: str, + language: str, + context: Optional[Dict[str, Any]] = None + ) -> Optional[str]: + """ + Detect version of a package if possible. + + Args: + package_name: Name of the package + language: Programming language + context: Optional context information + + Returns: + Package version or None if not detectable + """ + config = self._get_config(language) + if config.supports_version_detection(): + return config.detect_package_version(package_name, context) + return None + + def get_supported_languages(self) -> Set[str]: + """ + Get set of supported languages. + + Returns: + Set of supported language names + """ + # Languages supported by default configs + supported = {'python', 'zig', 'javascript', 'typescript', 'objective-c'} + # Add custom registered languages + supported.update(self._configs.keys()) + return supported + + def clear_cache(self) -> None: + """Clear all cached data.""" + self._registry.clear_cache() + self._context_cache.clear() + logger.debug("Cleared dependency classifier cache") + + def get_classification_stats(self) -> Dict[str, Any]: + """ + Get statistics about classification operations. + + Returns: + Dictionary with classification statistics + """ + return self._registry.get_stats() + + def _get_config(self, language: str) -> BaseDependencyConfig: + """ + Get or create configuration for a language. + + Args: + language: Programming language + + Returns: + Language-specific dependency configuration + """ + language_lower = language.lower() + + # Check if we have a custom config + if language_lower in self._configs: + return self._configs[language_lower] + + # Get default config + config = get_dependency_config(language_lower) + self._configs[language_lower] = config + + return config + + +# Global classifier instance +_classifier_instance: Optional[DependencyClassifier] = None + + +def get_dependency_classifier() -> DependencyClassifier: + """ + Get the global dependency classifier instance. + + Returns: + Global DependencyClassifier instance + """ + global _classifier_instance + if _classifier_instance is None: + _classifier_instance = DependencyClassifier() + return _classifier_instance + + +def classify_import( + import_path: str, + language: str, + context: Optional[Dict[str, Any]] = None +) -> str: + """ + Convenience function to classify an import. + + Args: + import_path: Import path to classify + language: Programming language + context: Optional context information + + Returns: + Classification string + """ + return get_dependency_classifier().classify_import(import_path, language, context) + + +def get_standard_library_modules(language: str) -> Set[str]: + """ + Convenience function to get standard library modules. + + Args: + language: Programming language + + Returns: + Set of standard library module names + """ + return get_dependency_classifier().get_standard_library_modules(language) diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/__init__.py b/src/code_index_mcp/tools/scip/dependencies/configs/__init__.py new file mode 100644 index 0000000..5cb33d5 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/configs/__init__.py @@ -0,0 +1,74 @@ +""" +Language-specific dependency configuration system. + +This package provides language-specific dependency configurations that define +how imports and dependencies should be classified for each supported language. + +Key Components: +- BaseDependencyConfig: Abstract base class for all configurations +- PythonConfig: Python-specific dependency rules +- ZigConfig: Zig-specific dependency rules +- JavaScriptConfig: JavaScript/TypeScript dependency rules +- ObjectiveCConfig: Objective-C framework classification rules + +Each configuration defines: +- Standard library module sets +- Third-party package detection rules +- Local import patterns +- Package manager integration +- Custom classification logic +""" + +from .base import BaseDependencyConfig +from .python import PythonDependencyConfig +from .zig import ZigDependencyConfig +from .javascript import JavaScriptDependencyConfig +from .objc import ObjectiveCDependencyConfig + +# Configuration registry +_CONFIGS = { + 'python': PythonDependencyConfig, + 'zig': ZigDependencyConfig, + 'javascript': JavaScriptDependencyConfig, + 'typescript': JavaScriptDependencyConfig, # TypeScript uses JS config + 'objective-c': ObjectiveCDependencyConfig, +} + +def get_dependency_config(language: str) -> BaseDependencyConfig: + """ + Get dependency configuration for the specified language. + + Args: + language: Language name + + Returns: + Language-specific dependency configuration + """ + language_lower = language.lower() + config_class = _CONFIGS.get(language_lower) + + if config_class: + return config_class() + + # Return base config for unsupported languages + return BaseDependencyConfig() + +def register_dependency_config(language: str, config_class) -> None: + """ + Register a custom dependency configuration. + + Args: + language: Language name + config_class: Configuration class + """ + _CONFIGS[language.lower()] = config_class + +__all__ = [ + 'BaseDependencyConfig', + 'PythonDependencyConfig', + 'ZigDependencyConfig', + 'JavaScriptDependencyConfig', + 'ObjectiveCDependencyConfig', + 'get_dependency_config', + 'register_dependency_config' +] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/base.py b/src/code_index_mcp/tools/scip/dependencies/configs/base.py new file mode 100644 index 0000000..5972a37 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/configs/base.py @@ -0,0 +1,236 @@ +""" +Base dependency configuration class. + +This module provides the abstract base class for language-specific dependency +configurations, defining the interface and common functionality. +""" + +import logging +from abc import ABC, abstractmethod +from typing import Set, Dict, List, Optional, Pattern +import re + +logger = logging.getLogger(__name__) + + +class BaseDependencyConfig(ABC): + """ + Abstract base class for language-specific dependency configurations. + + Each language configuration defines how to classify imports and dependencies + as standard_library, third_party, or local based on language-specific patterns. + """ + + def __init__(self): + """Initialize the dependency configuration.""" + self._stdlib_modules: Optional[Set[str]] = None + self._third_party_patterns: List[Pattern] = [] + self._local_patterns: List[Pattern] = [] + self._package_manager_indicators: Set[str] = set() + + # Initialize patterns + self._compile_patterns() + + @abstractmethod + def get_language_name(self) -> str: + """Return the language name this configuration handles.""" + pass + + @abstractmethod + def get_stdlib_modules(self) -> Set[str]: + """Return set of standard library modules for this language.""" + pass + + def classify_import(self, import_path: str, context: Dict[str, any] = None) -> str: + """ + Classify import path based on language-specific rules. + + Args: + import_path: Import path to classify + context: Optional context information (file path, project structure, etc.) + + Returns: + Classification: 'standard_library', 'third_party', or 'local' + """ + if not import_path: + return 'local' # Default for empty imports + + # Step 1: Check for obvious local patterns first + if self._is_local_import(import_path, context): + return 'local' + + # Step 2: Check standard library + if self._is_stdlib_import(import_path): + return 'standard_library' + + # Step 3: Check third-party patterns + if self._is_third_party_import(import_path, context): + return 'third_party' + + # Step 4: Language-specific classification + return self._classify_import_impl(import_path, context) + + def normalize_import_path(self, raw_path: str) -> str: + """ + Normalize import path for consistent processing. + Default implementation just strips whitespace. + + Args: + raw_path: Raw import path + + Returns: + Normalized import path + """ + return raw_path.strip() + + def _compile_patterns(self) -> None: + """Compile regex patterns for efficient matching.""" + try: + # Default patterns - subclasses should override + self._third_party_patterns = [ + re.compile(r'^[a-zA-Z][a-zA-Z0-9_-]*$'), # Simple package names + ] + + self._local_patterns = [ + re.compile(r'^\.'), # Relative imports + re.compile(r'^/'), # Absolute local paths + ] + except Exception as e: + logger.warning(f"Error compiling patterns for {self.get_language_name()}: {e}") + + def _is_local_import(self, import_path: str, context: Dict[str, any] = None) -> bool: + """Check if import is local based on patterns.""" + # Relative imports are always local + if import_path.startswith('.'): + return True + + # Check compiled patterns + for pattern in self._local_patterns: + if pattern.match(import_path): + return True + + # Context-based checks + if context: + # Check against project-specific patterns + project_indicators = context.get('project_patterns', []) + for indicator in project_indicators: + if indicator in import_path: + return True + + return False + + def _is_stdlib_import(self, import_path: str) -> bool: + """Check if import is from standard library.""" + if self._stdlib_modules is None: + self._stdlib_modules = self.get_stdlib_modules() + + # Extract base module name + base_module = import_path.split('.')[0].split('/')[0] + return base_module in self._stdlib_modules + + def _is_third_party_import(self, import_path: str, context: Dict[str, any] = None) -> bool: + """Check if import is third-party based on patterns.""" + # Check compiled patterns + for pattern in self._third_party_patterns: + if pattern.match(import_path): + return True + + # Check package manager indicators + if context: + package_indicators = context.get('package_indicators', set()) + for indicator in package_indicators: + if indicator in import_path: + return True + + return False + + def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: + """ + Language-specific import classification implementation. + Default implementation returns 'third_party' for unknown imports. + + Args: + import_path: Import path to classify + context: Optional context information + + Returns: + Classification string + """ + return 'third_party' + + def get_package_manager_files(self) -> Set[str]: + """ + Return set of package manager files for this language. + Used to detect project structure and third-party dependencies. + + Returns: + Set of package manager file names + """ + return set() + + def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: + """ + Extract dependency list from package manager files. + + Args: + file_path: Path to the package manager file + file_content: Content of the file + + Returns: + List of dependency names + """ + # Default implementation returns empty list + # Subclasses should implement language-specific parsing + return [] + + def is_scoped_package(self, import_path: str) -> bool: + """ + Check if import represents a scoped package. + + Args: + import_path: Import path to check + + Returns: + True if import is a scoped package + """ + # Default implementation - no scoped packages + return False + + def get_package_name_from_import(self, import_path: str) -> str: + """ + Extract package name from import path. + + Args: + import_path: Full import path + + Returns: + Package name (first component typically) + """ + # Default implementation: return first component + if '/' in import_path: + return import_path.split('/')[0] + elif '.' in import_path: + return import_path.split('.')[0] + return import_path + + def supports_version_detection(self) -> bool: + """ + Check if this configuration supports version detection. + + Returns: + True if version detection is supported + """ + return False + + def detect_package_version(self, package_name: str, context: Dict[str, any] = None) -> Optional[str]: + """ + Detect version of a package if possible. + + Args: + package_name: Name of the package + context: Optional context (lock files, manifests, etc.) + + Returns: + Package version or None if not detectable + """ + return None diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/javascript.py b/src/code_index_mcp/tools/scip/dependencies/configs/javascript.py new file mode 100644 index 0000000..a2099f5 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/configs/javascript.py @@ -0,0 +1,283 @@ +""" +JavaScript/TypeScript-specific dependency configuration. + +This module provides JavaScript and TypeScript specific dependency classification, +including npm/yarn package management and Node.js built-in modules. +""" + +import json +import re +import logging +from typing import Set, Dict, List, Optional +from .base import BaseDependencyConfig + +logger = logging.getLogger(__name__) + + +class JavaScriptDependencyConfig(BaseDependencyConfig): + """ + JavaScript/TypeScript-specific dependency configuration. + + Handles JavaScript and TypeScript import classification with support for: + - Node.js built-in modules + - npm/yarn package management + - ES6 modules and CommonJS + - Scoped packages (@scope/package) + - Relative and absolute imports + """ + + def get_language_name(self) -> str: + return "javascript" + + def get_stdlib_modules(self) -> Set[str]: + """Return Node.js built-in modules.""" + return { + # Node.js built-in modules + 'assert', 'async_hooks', 'buffer', 'child_process', 'cluster', + 'console', 'constants', 'crypto', 'dgram', 'dns', 'domain', + 'events', 'fs', 'http', 'http2', 'https', 'inspector', + 'module', 'net', 'os', 'path', 'perf_hooks', 'process', + 'punycode', 'querystring', 'readline', 'repl', 'stream', + 'string_decoder', 'timers', 'tls', 'trace_events', 'tty', + 'url', 'util', 'v8', 'vm', 'worker_threads', 'zlib' + } + + def _compile_patterns(self) -> None: + """Compile JavaScript-specific regex patterns.""" + try: + self._third_party_patterns = [ + # Standard npm package names + re.compile(r'^[a-z][a-z0-9-._]*$'), + # Scoped packages + re.compile(r'^@[a-z0-9-]+/[a-z0-9-._]+$'), + # Common frameworks and libraries + re.compile(r'^(react|vue|angular|express|lodash|jquery)'), + ] + + self._local_patterns = [ + # Relative imports + re.compile(r'^\.\.?/'), + # Absolute local paths + re.compile(r'^/[^/]'), + # Webpack aliases + re.compile(r'^@/'), + re.compile(r'^~/'), + # Common local patterns + re.compile(r'^(src|lib|components|utils|helpers)/'), + ] + except Exception as e: + logger.warning(f"Error compiling JavaScript patterns: {e}") + + def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: + """JavaScript-specific import classification.""" + # Handle scoped packages + if import_path.startswith('@'): + return 'third_party' + + # Check for common third-party packages + common_third_party = { + 'react', 'vue', 'angular', 'svelte', 'jquery', 'lodash', + 'express', 'koa', 'fastify', 'next', 'nuxt', 'gatsby', + 'webpack', 'vite', 'rollup', 'parcel', 'babel', 'typescript', + 'eslint', 'prettier', 'jest', 'mocha', 'cypress', 'playwright', + 'axios', 'fetch', 'node-fetch', 'superagent', 'got', + 'moment', 'dayjs', 'date-fns', 'luxon', + 'styled-components', 'emotion', '@emotion/react', + 'material-ui', '@mui/material', 'antd', 'bootstrap', + 'tailwindcss', 'bulma', 'semantic-ui-react', + 'redux', 'mobx', 'zustand', 'recoil', 'rxjs', + 'graphql', 'apollo-client', '@apollo/client', + 'socket.io', 'ws', 'uuid', 'bcrypt', 'jsonwebtoken', + 'mongoose', 'sequelize', 'prisma', 'typeorm' + } + + base_package = self.get_package_name_from_import(import_path) + if base_package in common_third_party: + return 'third_party' + + # Check context for npm/yarn info + if context: + # Check package.json dependencies + npm_deps = context.get('npm_dependencies', set()) + if base_package in npm_deps: + return 'third_party' + + # Check node_modules + node_modules = context.get('node_modules', set()) + if base_package in node_modules: + return 'third_party' + + # Default to third_party for JavaScript ecosystem + return 'third_party' + + def normalize_import_path(self, raw_path: str) -> str: + """Normalize JavaScript import path.""" + normalized = raw_path.strip() + + # Remove file extensions + for ext in ['.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs']: + if normalized.endswith(ext): + normalized = normalized[:-len(ext)] + break + + # Remove /index suffix + if normalized.endswith('/index'): + normalized = normalized[:-6] + + return normalized + + def get_package_manager_files(self) -> Set[str]: + """Return JavaScript package manager files.""" + return { + 'package.json', + 'package-lock.json', + 'yarn.lock', + 'pnpm-lock.yaml', + 'npm-shrinkwrap.json', + 'lerna.json', + 'rush.json' + } + + def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: + """Extract dependencies from JavaScript package manager files.""" + dependencies = [] + + try: + if file_path.endswith('package.json'): + dependencies = self._parse_package_json(file_content) + elif file_path.endswith('package-lock.json'): + dependencies = self._parse_package_lock(file_content) + elif file_path.endswith('yarn.lock'): + dependencies = self._parse_yarn_lock(file_content) + elif file_path.endswith('pnpm-lock.yaml'): + dependencies = self._parse_pnpm_lock(file_content) + except Exception as e: + logger.debug(f"Error parsing JavaScript dependency file {file_path}: {e}") + + return dependencies + + def _parse_package_json(self, content: str) -> List[str]: + """Parse package.json for dependencies.""" + dependencies = [] + try: + data = json.loads(content) + + # Extract from different dependency sections + for section in ['dependencies', 'devDependencies', 'peerDependencies', 'optionalDependencies']: + if section in data and isinstance(data[section], dict): + dependencies.extend(data[section].keys()) + + except Exception as e: + logger.debug(f"Error parsing package.json: {e}") + + return dependencies + + def _parse_package_lock(self, content: str) -> List[str]: + """Parse package-lock.json for dependencies.""" + dependencies = [] + try: + data = json.loads(content) + + # Extract from packages section (npm v7+) + if 'packages' in data: + for package_path in data['packages']: + if package_path.startswith('node_modules/'): + package_name = package_path[13:] # Remove 'node_modules/' prefix + if package_name and not package_name.startswith('@'): + dependencies.append(package_name) + elif package_name.startswith('@'): + # Handle scoped packages + dependencies.append(package_name) + + # Extract from dependencies section (npm v6) + elif 'dependencies' in data: + dependencies.extend(data['dependencies'].keys()) + + except Exception as e: + logger.debug(f"Error parsing package-lock.json: {e}") + + return dependencies + + def _parse_yarn_lock(self, content: str) -> List[str]: + """Parse yarn.lock for dependencies.""" + dependencies = [] + try: + # Parse yarn.lock format + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith('#') and '@' in line and ':' in line: + # Extract package name from yarn.lock entry + package_spec = line.split(':')[0].strip() + if '"' in package_spec: + package_name = package_spec.split('"')[1] + if package_name and package_name not in dependencies: + # Remove version specifier + base_name = package_name.split('@')[0] if not package_name.startswith('@') else '@' + package_name.split('@')[1] + if base_name: + dependencies.append(base_name) + + except Exception as e: + logger.debug(f"Error parsing yarn.lock: {e}") + + return dependencies + + def _parse_pnpm_lock(self, content: str) -> List[str]: + """Parse pnpm-lock.yaml for dependencies.""" + dependencies = [] + try: + # Simple YAML parsing for dependencies + in_deps_section = False + for line in content.splitlines(): + line = line.strip() + if line in ['dependencies:', 'devDependencies:']: + in_deps_section = True + continue + elif line and not line.startswith(' ') and in_deps_section: + in_deps_section = False + elif in_deps_section and ':' in line: + dep_name = line.split(':')[0].strip() + if dep_name and not dep_name.startswith('#'): + dependencies.append(dep_name) + + except Exception as e: + logger.debug(f"Error parsing pnpm-lock.yaml: {e}") + + return dependencies + + def is_scoped_package(self, import_path: str) -> bool: + """Check if import is a scoped npm package.""" + return import_path.startswith('@') and '/' in import_path + + def get_package_name_from_import(self, import_path: str) -> str: + """Extract package name from JavaScript import path.""" + # Handle scoped packages + if import_path.startswith('@'): + parts = import_path.split('/') + if len(parts) >= 2: + return f"{parts[0]}/{parts[1]}" + return parts[0] + + # Regular packages + return import_path.split('/')[0] + + def supports_version_detection(self) -> bool: + """JavaScript supports version detection through package files.""" + return True + + def detect_package_version(self, package_name: str, context: Dict[str, any] = None) -> Optional[str]: + """Detect JavaScript package version from context.""" + if not context: + return None + + # Check package-lock.json or yarn.lock data + lock_data = context.get('lock_file_data', {}) + if package_name in lock_data: + return lock_data[package_name].get('version') + + # Check package.json dependencies + package_json = context.get('package_json', {}) + for dep_section in ['dependencies', 'devDependencies', 'peerDependencies']: + if dep_section in package_json and package_name in package_json[dep_section]: + return package_json[dep_section][package_name] + + return None diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/objc.py b/src/code_index_mcp/tools/scip/dependencies/configs/objc.py new file mode 100644 index 0000000..544d1b3 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/configs/objc.py @@ -0,0 +1,346 @@ +""" +Objective-C-specific dependency configuration. + +This module provides Objective-C specific dependency classification, +including iOS/macOS framework detection and CocoaPods support. +""" + +import re +import logging +from typing import Set, Dict, List, Optional +from .base import BaseDependencyConfig + +logger = logging.getLogger(__name__) + + +class ObjectiveCDependencyConfig(BaseDependencyConfig): + """ + Objective-C-specific dependency configuration. + + Handles Objective-C framework and dependency classification with support for: + - iOS and macOS system frameworks + - CocoaPods package management + - Carthage dependency management + - Swift Package Manager integration + - Private framework detection + """ + + def get_language_name(self) -> str: + return "objective-c" + + def get_stdlib_modules(self) -> Set[str]: + """Return iOS/macOS system frameworks.""" + return { + # Core frameworks (iOS and macOS) + 'Foundation', 'CoreFoundation', 'CoreData', 'CoreGraphics', + 'QuartzCore', 'CoreAnimation', 'CoreImage', 'CoreText', + 'Security', 'SystemConfiguration', 'CFNetwork', + + # UI frameworks + 'UIKit', 'AppKit', 'Cocoa', 'SwiftUI', + + # Media frameworks + 'AVFoundation', 'AVKit', 'AudioToolbox', 'AudioUnit', + 'VideoToolbox', 'MediaPlayer', 'Photos', 'PhotosUI', + 'CoreAudio', 'CoreMIDI', 'CoreMedia', 'ImageIO', + + # Graphics and gaming + 'Metal', 'MetalKit', 'GameplayKit', 'SpriteKit', 'SceneKit', + 'GLKit', 'OpenGLES', 'CoreMotion', 'ARKit', 'RealityKit', + + # Location and maps + 'CoreLocation', 'MapKit', 'Contacts', 'ContactsUI', + + # Web and networking + 'WebKit', 'JavaScriptCore', 'NetworkExtension', + + # Data and storage + 'CloudKit', 'CoreSpotlight', 'EventKit', 'EventKitUI', + 'HealthKit', 'HealthKitUI', 'HomeKit', 'HomeKitUI', + + # Device and sensors + 'CoreBluetooth', 'ExternalAccessory', 'CoreNFC', + 'CoreTelephony', 'CallKit', 'PushKit', + + # Machine learning and AI + 'CoreML', 'Vision', 'NaturalLanguage', 'Speech', + 'SoundAnalysis', + + # Development tools + 'XCTest', 'os', 'Accelerate', 'simd', + + # Legacy frameworks + 'AddressBook', 'AddressBookUI', 'AssetsLibrary', + 'MobileCoreServices', 'Social', 'Accounts', + + # watchOS specific + 'WatchKit', 'ClockKit', 'WatchConnectivity', + + # tvOS specific + 'TVUIKit', 'TVMLKit', + + # macOS specific + 'Carbon', 'ApplicationServices', 'CoreServices', + 'IOKit', 'DiskArbitration', 'FSEvents', 'ServiceManagement', + 'LaunchServices', 'SearchKit', 'PreferencePanes', + 'InstantMessage', 'Automator', 'CalendarStore', + 'Collaboration', 'CoreWLAN', 'DiscRecording', + 'DiscRecordingUI', 'DVDPlayback', 'ExceptionHandling', + 'FWAUserLib', 'InstallerPlugins', 'IOBluetooth', + 'IOBluetoothUI', 'Kernel', 'LDAP', 'Message', + 'OpenDirectory', 'OSAKit', 'PubSub', 'QTKit', + 'Quartz', 'QuartzComposer', 'QuickLook', 'ScreenSaver', + 'ScriptingBridge', 'SyncServices', 'Tcl', 'Tk', + 'WebKit', 'XgridFoundation' + } + + def _compile_patterns(self) -> None: + """Compile Objective-C specific regex patterns.""" + try: + self._third_party_patterns = [ + # CocoaPods patterns + re.compile(r'^[A-Z][a-zA-Z0-9]*$'), # CamelCase frameworks + re.compile(r'^FB[A-Z][a-zA-Z0-9]*'), # Facebook frameworks + re.compile(r'^AF[A-Z][a-zA-Z0-9]*'), # AFNetworking family + re.compile(r'^SD[A-Z][a-zA-Z0-9]*'), # SDWebImage family + re.compile(r'^MB[A-Z][a-zA-Z0-9]*'), # MBProgressHUD family + re.compile(r'^Google[A-Z][a-zA-Z0-9]*'), # Google frameworks + re.compile(r'^Firebase[A-Z][a-zA-Z0-9]*'), # Firebase frameworks + ] + + self._local_patterns = [ + # Private frameworks + re.compile(r'Private'), + re.compile(r'Internal'), + # Local project patterns + re.compile(r'^[a-z]'), # lowercase frameworks are usually local + re.compile(r'\.framework'), + re.compile(r'/'), # Path-based imports + ] + except Exception as e: + logger.warning(f"Error compiling Objective-C patterns: {e}") + + def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: + """Objective-C specific import classification.""" + # Check for common third-party frameworks + common_third_party = { + 'AFNetworking', 'Alamofire', 'SDWebImage', 'MBProgressHUD', + 'JSONModel', 'RestKit', 'Firebase', 'ReactiveCocoa', + 'Masonry', 'SnapKit', 'Realm', 'FMDB', 'SQLite', + 'GoogleAnalytics', 'Fabric', 'Crashlytics', 'TestFlight', + 'Facebook', 'Twitter', 'Instagram', 'FBSDKCoreKit', + 'GoogleMaps', 'GooglePlaces', 'GoogleSignIn', + 'FirebaseCore', 'FirebaseAuth', 'FirebaseFirestore', + 'FirebaseDatabase', 'FirebaseStorage', 'FirebaseAnalytics', + 'Lottie', 'Charts', 'YYKit', 'Pop', 'IGListKit', + 'ComponentKit', 'Texture', 'AsyncDisplayKit' + } + + base_framework = self.get_package_name_from_import(import_path) + if base_framework in common_third_party: + return 'third_party' + + # Check for CocoaPods/Carthage patterns + if any(indicator in import_path for indicator in ['Pods/', 'Carthage/', 'Build/Products']): + return 'third_party' + + # Check context for dependency management info + if context: + # Check Podfile dependencies + pods = context.get('cocoapods_dependencies', set()) + if base_framework in pods: + return 'third_party' + + # Check Cartfile dependencies + carthage_deps = context.get('carthage_dependencies', set()) + if base_framework in carthage_deps: + return 'third_party' + + # Check SPM dependencies + spm_deps = context.get('spm_dependencies', set()) + if base_framework in spm_deps: + return 'third_party' + + # Private or internal frameworks are local + if 'Private' in import_path or 'Internal' in import_path: + return 'local' + + # Default to standard_library for unknown Apple frameworks + # (Objective-C ecosystem has many system frameworks) + return 'standard_library' + + def normalize_import_path(self, raw_path: str) -> str: + """Normalize Objective-C import path.""" + normalized = raw_path.strip() + + # Remove .framework suffix + if normalized.endswith('.framework'): + normalized = normalized[:-10] + + # Remove file extensions + for ext in ['.h', '.m', '.mm']: + if normalized.endswith(ext): + normalized = normalized[:-len(ext)] + break + + # Extract framework name from paths + if '/' in normalized: + # Extract the last component (framework name) + normalized = normalized.split('/')[-1] + + return normalized + + def get_package_manager_files(self) -> Set[str]: + """Return Objective-C package manager files.""" + return { + 'Podfile', + 'Podfile.lock', + 'Cartfile', + 'Cartfile.resolved', + 'Package.swift', + 'Package.resolved', + 'project.pbxproj' # Xcode project file + } + + def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: + """Extract dependencies from Objective-C package manager files.""" + dependencies = [] + + try: + if 'Podfile' in file_path and not file_path.endswith('.lock'): + dependencies = self._parse_podfile(file_content) + elif file_path.endswith('Podfile.lock'): + dependencies = self._parse_podfile_lock(file_content) + elif 'Cartfile' in file_path: + dependencies = self._parse_cartfile(file_content) + elif file_path.endswith('Package.swift'): + dependencies = self._parse_package_swift(file_content) + elif file_path.endswith('project.pbxproj'): + dependencies = self._parse_pbxproj(file_content) + except Exception as e: + logger.debug(f"Error parsing Objective-C dependency file {file_path}: {e}") + + return dependencies + + def _parse_podfile(self, content: str) -> List[str]: + """Parse Podfile for CocoaPods dependencies.""" + dependencies = [] + try: + for line in content.splitlines(): + line = line.strip() + if line.startswith('pod '): + # Extract pod name + match = re.search(r"pod\s+['\"]([^'\"]+)['\"]", line) + if match: + pod_name = match.group(1) + dependencies.append(pod_name) + except Exception as e: + logger.debug(f"Error parsing Podfile: {e}") + + return dependencies + + def _parse_podfile_lock(self, content: str) -> List[str]: + """Parse Podfile.lock for installed pods.""" + dependencies = [] + try: + in_pods_section = False + for line in content.splitlines(): + line = line.strip() + if line.startswith('PODS:'): + in_pods_section = True + continue + elif in_pods_section and line.startswith('DEPENDENCIES:'): + break + elif in_pods_section and line.startswith('- '): + # Extract pod name + pod_spec = line[2:].strip() + if '(' in pod_spec: + pod_name = pod_spec.split('(')[0].strip() + else: + pod_name = pod_spec.split(' ')[0].strip() + if pod_name: + dependencies.append(pod_name) + except Exception as e: + logger.debug(f"Error parsing Podfile.lock: {e}") + + return dependencies + + def _parse_cartfile(self, content: str) -> List[str]: + """Parse Cartfile for Carthage dependencies.""" + dependencies = [] + try: + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith('#'): + # Extract dependency name from Carthage format + parts = line.split() + if len(parts) >= 2: + repo = parts[1] + if '/' in repo: + # Extract framework name from GitHub repo + framework_name = repo.split('/')[-1] + if framework_name: + dependencies.append(framework_name) + except Exception as e: + logger.debug(f"Error parsing Cartfile: {e}") + + return dependencies + + def _parse_package_swift(self, content: str) -> List[str]: + """Parse Package.swift for Swift Package Manager dependencies.""" + dependencies = [] + try: + # Look for .package declarations + for line in content.splitlines(): + line = line.strip() + if '.package(' in line: + # Extract package name or URL + match = re.search(r'url:\s*["\']([^"\']+)["\']', line) + if match: + url = match.group(1) + if '/' in url: + package_name = url.split('/')[-1] + if package_name.endswith('.git'): + package_name = package_name[:-4] + dependencies.append(package_name) + except Exception as e: + logger.debug(f"Error parsing Package.swift: {e}") + + return dependencies + + def _parse_pbxproj(self, content: str) -> List[str]: + """Parse Xcode project file for framework references.""" + dependencies = [] + try: + # Look for framework references in pbxproj + for line in content.splitlines(): + if '.framework' in line: + # Extract framework names + matches = re.findall(r'([A-Za-z0-9_]+)\.framework', line) + for framework in matches: + if framework not in dependencies: + dependencies.append(framework) + except Exception as e: + logger.debug(f"Error parsing project.pbxproj: {e}") + + return dependencies + + def get_package_name_from_import(self, import_path: str) -> str: + """Extract framework name from Objective-C import path.""" + # Remove common prefixes/suffixes + normalized = import_path + + if normalized.endswith('.framework'): + normalized = normalized[:-10] + + # Extract framework name from paths + if '/' in normalized: + normalized = normalized.split('/')[-1] + + # Remove file extensions + for ext in ['.h', '.m', '.mm']: + if normalized.endswith(ext): + normalized = normalized[:-len(ext)] + break + + return normalized diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/python.py b/src/code_index_mcp/tools/scip/dependencies/configs/python.py new file mode 100644 index 0000000..02f0f38 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/configs/python.py @@ -0,0 +1,355 @@ +""" +Python-specific dependency configuration. + +This module provides Python-specific dependency classification rules, +including comprehensive standard library detection and pip package management. +""" + +import json +import re +import logging +from typing import Set, Dict, List, Optional, Pattern +from .base import BaseDependencyConfig + +logger = logging.getLogger(__name__) + + +class PythonDependencyConfig(BaseDependencyConfig): + """ + Python-specific dependency configuration. + + Handles Python import classification with support for: + - Comprehensive standard library detection + - pip/conda package management + - Virtual environment detection + - Relative and absolute import patterns + - PEP 420 namespace packages + """ + + def get_language_name(self) -> str: + return "python" + + def get_stdlib_modules(self) -> Set[str]: + """Return comprehensive Python standard library modules.""" + return { + # Core modules + 'os', 'sys', 'json', 'time', 'datetime', 'logging', 'pathlib', + 'typing', 'dataclasses', 'functools', 'itertools', 'collections', + 're', 'math', 'random', 'threading', 'subprocess', 'shutil', + 'contextlib', 'traceback', 'warnings', 'weakref', 'copy', + 'pickle', 'base64', 'hashlib', 'hmac', 'uuid', 'urllib', + 'http', 'socketserver', 'email', 'mimetypes', 'csv', 'configparser', + 'argparse', 'getopt', 'tempfile', 'glob', 'fnmatch', 'linecache', + 'pprint', 'textwrap', 'string', 'struct', 'codecs', 'unicodedata', + 'io', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', + + # Network and web + 'socket', 'ssl', 'ftplib', 'poplib', 'imaplib', 'smtplib', + 'xmlrpc', 'webbrowser', + + # Data formats + 'xml', 'html', 'sqlite3', 'dbm', 'marshal', + + # Development tools + 'unittest', 'doctest', 'pdb', 'profile', 'cProfile', 'timeit', + 'trace', 'cgitb', 'py_compile', 'compileall', 'dis', 'pickletools', + + # System services + 'errno', 'ctypes', 'syslog', 'curses', 'platform', + + # Internationalization + 'locale', 'gettext', + + # Multimedia + 'audioop', 'wave', 'chunk', 'sunau', 'aifc', 'colorsys', + + # Cryptographic services + 'secrets', 'hashlib', 'hmac', + + # File and directory access + 'stat', 'fileinput', 'filecmp', 'shutil', 'macpath', + + # Data persistence + 'shelve', 'copyreg', + + # Data compression and archiving + 'zlib', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', + + # File formats + 'csv', 'netrc', 'xdrlib', 'plistlib', + + # Internet protocols and support + 'ipaddress', 'mailbox', 'mimetypes', + + # Structured markup processing tools + 'html', 'xml', + + # Internet data handling + 'json', 'base64', 'binascii', 'uu', 'quopri', + + # Numeric and mathematical modules + 'numbers', 'decimal', 'fractions', 'statistics', 'cmath', + + # Functional programming modules + 'operator', 'functools', 'itertools', + + # Python language services + 'ast', 'symtable', 'symbol', 'token', 'tokenize', 'keyword', + 'tabnanny', 'pyclbr', 'py_compile', 'compileall', 'dis', + 'pickletools', 'distutils', + + # Importing modules + 'importlib', 'pkgutil', 'modulefinder', 'runpy', + + # Python runtime services + 'atexit', 'gc', 'inspect', 'site', '__future__', '__main__', + + # Custom Python interpreters + 'code', 'codeop', + + # MS Windows specific services + 'msvcrt', 'winreg', 'winsound', + + # Unix specific services + 'posix', 'pwd', 'grp', 'crypt', 'termios', 'tty', 'pty', + 'fcntl', 'pipes', 'resource', 'nis', 'syslog', + + # Superseded modules + 'optparse', 'imp' + } + + def _compile_patterns(self) -> None: + """Compile Python-specific regex patterns.""" + try: + self._third_party_patterns = [ + # Standard package names + re.compile(r'^[a-zA-Z][a-zA-Z0-9_-]*$'), + # Namespace packages (PEP 420) + re.compile(r'^[a-zA-Z][a-zA-Z0-9_]*(\.[a-zA-Z][a-zA-Z0-9_]*)+$'), + # Common third-party patterns + re.compile(r'^(django|flask|requests|numpy|pandas|matplotlib|scipy|tensorflow|pytorch|sklearn)'), + ] + + self._local_patterns = [ + # Relative imports + re.compile(r'^\.+'), + # Project-specific patterns + re.compile(r'^(src|lib|app|project)\.'), + re.compile(r'^(tests?|test_)'), + # Common local patterns + re.compile(r'^(utils|helpers|common|core|models|views|controllers)$'), + ] + except Exception as e: + logger.warning(f"Error compiling Python patterns: {e}") + + def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: + """Python-specific import classification.""" + # Handle special cases + if import_path.startswith('__'): + # Dunder modules are usually built-in or special + return 'standard_library' + + # Check for common third-party packages + common_third_party = { + 'numpy', 'pandas', 'matplotlib', 'scipy', 'sklearn', 'tensorflow', + 'torch', 'pytorch', 'requests', 'urllib3', 'beautifulsoup4', + 'django', 'flask', 'fastapi', 'sqlalchemy', 'alembic', + 'pytest', 'mock', 'coverage', 'tox', 'black', 'flake8', + 'mypy', 'isort', 'autopep8', 'yapf', 'pylint', 'bandit', + 'click', 'typer', 'pydantic', 'marshmallow', 'cerberus', + 'redis', 'celery', 'kombu', 'amqp', 'boto3', 'botocore', + 'psycopg2', 'pymongo', 'elasticsearch', 'kafka-python', + 'pillow', 'opencv-python', 'imageio', 'plotly', 'seaborn', + 'jupyter', 'ipython', 'notebook', 'jupyterlab' + } + + base_package = self.get_package_name_from_import(import_path) + if base_package in common_third_party: + return 'third_party' + + # Check context for pip indicators + if context: + pip_indicators = context.get('pip_packages', set()) + if base_package in pip_indicators: + return 'third_party' + + # Check for requirements.txt or setup.py dependencies + project_deps = context.get('project_dependencies', set()) + if base_package in project_deps: + return 'third_party' + + # Default to third_party for unknown packages + return 'third_party' + + def normalize_import_path(self, raw_path: str) -> str: + """Normalize Python import path.""" + # Remove common prefixes and suffixes + normalized = raw_path.strip() + + # Handle namespace packages + if normalized.endswith('.__init__'): + normalized = normalized[:-9] + + # Normalize path separators to dots + normalized = normalized.replace('/', '.') + + return normalized + + def get_package_manager_files(self) -> Set[str]: + """Return Python package manager files.""" + return { + 'requirements.txt', + 'requirements-dev.txt', + 'requirements-test.txt', + 'setup.py', + 'setup.cfg', + 'pyproject.toml', + 'Pipfile', + 'Pipfile.lock', + 'poetry.lock', + 'conda.yaml', + 'environment.yml', + 'environment.yaml' + } + + def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: + """Extract dependencies from Python package manager files.""" + dependencies = [] + + try: + if file_path.endswith('requirements.txt'): + dependencies = self._parse_requirements_txt(file_content) + elif file_path.endswith('setup.py'): + dependencies = self._parse_setup_py(file_content) + elif file_path.endswith('pyproject.toml'): + dependencies = self._parse_pyproject_toml(file_content) + elif file_path.endswith('Pipfile'): + dependencies = self._parse_pipfile(file_content) + elif file_path.endswith('.lock'): + dependencies = self._parse_lock_file(file_path, file_content) + except Exception as e: + logger.debug(f"Error parsing {file_path}: {e}") + + return dependencies + + def _parse_requirements_txt(self, content: str) -> List[str]: + """Parse requirements.txt file.""" + dependencies = [] + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith('#'): + # Extract package name (before version specifiers) + package = re.split(r'[><=!]', line)[0].strip() + if package: + dependencies.append(package) + return dependencies + + def _parse_setup_py(self, content: str) -> List[str]: + """Parse setup.py file for dependencies.""" + dependencies = [] + try: + # Look for install_requires or setup() calls + install_requires_match = re.search( + r'install_requires\s*=\s*\[(.*?)\]', + content, + re.DOTALL + ) + if install_requires_match: + deps_str = install_requires_match.group(1) + # Extract quoted strings + for match in re.finditer(r'["\']([^"\']+)["\']', deps_str): + package = re.split(r'[><=!]', match.group(1))[0].strip() + if package: + dependencies.append(package) + except Exception as e: + logger.debug(f"Error parsing setup.py: {e}") + + return dependencies + + def _parse_pyproject_toml(self, content: str) -> List[str]: + """Parse pyproject.toml file.""" + dependencies = [] + try: + # This would require toml parsing library + # For now, use simple regex approach + deps_match = re.search(r'dependencies\s*=\s*\[(.*?)\]', content, re.DOTALL) + if deps_match: + deps_str = deps_match.group(1) + for match in re.finditer(r'["\']([^"\']+)["\']', deps_str): + package = re.split(r'[><=!]', match.group(1))[0].strip() + if package: + dependencies.append(package) + except Exception as e: + logger.debug(f"Error parsing pyproject.toml: {e}") + + return dependencies + + def _parse_pipfile(self, content: str) -> List[str]: + """Parse Pipfile for dependencies.""" + dependencies = [] + try: + # Look for [packages] section + in_packages_section = False + for line in content.splitlines(): + line = line.strip() + if line == '[packages]': + in_packages_section = True + continue + elif line.startswith('[') and in_packages_section: + break + elif in_packages_section and '=' in line: + package = line.split('=')[0].strip().strip('"\'') + if package: + dependencies.append(package) + except Exception as e: + logger.debug(f"Error parsing Pipfile: {e}") + + return dependencies + + def _parse_lock_file(self, file_path: str, content: str) -> List[str]: + """Parse lock files (Pipfile.lock, poetry.lock).""" + dependencies = [] + try: + if 'Pipfile.lock' in file_path: + # JSON format + data = json.loads(content) + if 'default' in data: + dependencies.extend(data['default'].keys()) + if 'develop' in data: + dependencies.extend(data['develop'].keys()) + elif 'poetry.lock' in file_path: + # TOML format - simplified parsing + for line in content.splitlines(): + if line.startswith('name = '): + name = line.split('=')[1].strip().strip('"\'') + if name: + dependencies.append(name) + except Exception as e: + logger.debug(f"Error parsing lock file {file_path}: {e}") + + return dependencies + + def is_scoped_package(self, import_path: str) -> bool: + """Check if import is a namespace package.""" + return '.' in import_path and not import_path.startswith('.') + + def supports_version_detection(self) -> bool: + """Python supports version detection through various methods.""" + return True + + def detect_package_version(self, package_name: str, context: Dict[str, any] = None) -> Optional[str]: + """Detect Python package version from context.""" + if not context: + return None + + # Check lock files first (most reliable) + lock_data = context.get('lock_file_data', {}) + if package_name in lock_data: + return lock_data[package_name].get('version') + + # Check installed packages (if available) + installed_packages = context.get('installed_packages', {}) + if package_name in installed_packages: + return installed_packages[package_name] + + return None diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/zig.py b/src/code_index_mcp/tools/scip/dependencies/configs/zig.py new file mode 100644 index 0000000..e22f553 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/configs/zig.py @@ -0,0 +1,266 @@ +""" +Zig-specific dependency configuration. + +This module provides Zig-specific dependency classification rules, +including standard library detection and package management. +""" + +import re +import logging +from typing import Set, Dict, List, Optional +from .base import BaseDependencyConfig + +logger = logging.getLogger(__name__) + + +class ZigDependencyConfig(BaseDependencyConfig): + """ + Zig-specific dependency configuration. + + Handles Zig import classification with support for: + - Zig standard library detection + - Package manager (zigmod, gyro) support + - Local .zig file imports + - System library detection + """ + + def get_language_name(self) -> str: + return "zig" + + def get_stdlib_modules(self) -> Set[str]: + """Return comprehensive Zig standard library modules.""" + return { + # Core standard library + 'std', 'builtin', 'testing', + + # Data structures and algorithms + 'math', 'mem', 'sort', 'hash', 'crypto', + + # Text and formatting + 'fmt', 'ascii', 'unicode', 'json', + + # System interaction + 'os', 'fs', 'process', 'thread', 'atomic', + + # Networking and I/O + 'net', 'http', 'io', + + # Compression and encoding + 'compress', 'base64', + + # Development and debugging + 'debug', 'log', 'meta', 'comptime', + + # Utilities + 'rand', 'time', 'zig', + + # Platform-specific + 'c', 'wasm', + + # Build system + 'build', 'target' + } + + def _compile_patterns(self) -> None: + """Compile Zig-specific regex patterns.""" + try: + self._third_party_patterns = [ + # Package names (typically lowercase with hyphens) + re.compile(r'^[a-z][a-z0-9-]*$'), + # Zig package patterns + re.compile(r'^zig-'), + # GitHub-style packages + re.compile(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$'), + ] + + self._local_patterns = [ + # Relative paths + re.compile(r'^\.\.?/'), + # .zig files + re.compile(r'\.zig$'), + # Local project paths + re.compile(r'^src/'), + re.compile(r'^lib/'), + ] + except Exception as e: + logger.warning(f"Error compiling Zig patterns: {e}") + + def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: + """Zig-specific import classification.""" + # Handle .zig file extensions + if import_path.endswith('.zig'): + return 'local' + + # Check for common third-party Zig packages + common_third_party = { + 'zigmod', 'gyro', 'known-folders', 'zig-args', 'zig-clap', + 'zig-network', 'zig-sqlite', 'zig-json', 'zig-yaml', + 'raylib-zig', 'mach', 'zls', 'zig-gamedev' + } + + base_package = self.get_package_name_from_import(import_path) + if base_package in common_third_party: + return 'third_party' + + # Check context for package manager info + if context: + # Check build.zig dependencies + build_deps = context.get('build_dependencies', set()) + if base_package in build_deps: + return 'third_party' + + # Check package manager files + pkg_deps = context.get('package_dependencies', set()) + if base_package in pkg_deps: + return 'third_party' + + # If it's not stdlib or clearly local, assume third_party + return 'third_party' + + def normalize_import_path(self, raw_path: str) -> str: + """Normalize Zig import path.""" + normalized = raw_path.strip() + + # Remove .zig extension for consistency + if normalized.endswith('.zig'): + normalized = normalized[:-4] + + # Normalize path separators + normalized = normalized.replace('\\', '/') + + return normalized + + def get_package_manager_files(self) -> Set[str]: + """Return Zig package manager files.""" + return { + 'build.zig', + 'build.zig.zon', + 'zigmod.yml', + 'zigmod.lock', + 'gyro.zzz', + 'deps.zig' + } + + def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: + """Extract dependencies from Zig package manager files.""" + dependencies = [] + + try: + if file_path.endswith('build.zig'): + dependencies = self._parse_build_zig(file_content) + elif file_path.endswith('build.zig.zon'): + dependencies = self._parse_build_zon(file_content) + elif file_path.endswith('zigmod.yml'): + dependencies = self._parse_zigmod_yml(file_content) + elif file_path.endswith('gyro.zzz'): + dependencies = self._parse_gyro_zzz(file_content) + except Exception as e: + logger.debug(f"Error parsing Zig dependency file {file_path}: {e}") + + return dependencies + + def _parse_build_zig(self, content: str) -> List[str]: + """Parse build.zig for dependencies.""" + dependencies = [] + try: + # Look for addPackage or dependency declarations + for line in content.splitlines(): + line = line.strip() + # Simple pattern matching for package declarations + if 'addPackage' in line or 'dependency' in line: + # Extract quoted strings that might be package names + matches = re.findall(r'["\']([a-zA-Z0-9_-]+)["\']', line) + dependencies.extend(matches) + except Exception as e: + logger.debug(f"Error parsing build.zig: {e}") + + return dependencies + + def _parse_build_zon(self, content: str) -> List[str]: + """Parse build.zig.zon file.""" + dependencies = [] + try: + # Look for .dependencies section + in_deps_section = False + for line in content.splitlines(): + line = line.strip() + if '.dependencies' in line: + in_deps_section = True + continue + elif in_deps_section and line.startswith('}'): + break + elif in_deps_section and '=' in line: + # Extract dependency name + dep_name = line.split('=')[0].strip().strip('.') + if dep_name: + dependencies.append(dep_name) + except Exception as e: + logger.debug(f"Error parsing build.zig.zon: {e}") + + return dependencies + + def _parse_zigmod_yml(self, content: str) -> List[str]: + """Parse zigmod.yml file.""" + dependencies = [] + try: + # Simple YAML parsing for dependencies section + in_deps_section = False + for line in content.splitlines(): + line = line.strip() + if line.startswith('dependencies:'): + in_deps_section = True + continue + elif in_deps_section and line.startswith('-'): + # Extract dependency info + if 'src:' in line: + # Extract from src: field + match = re.search(r'src:\s*([^\s]+)', line) + if match: + src = match.group(1) + # Extract package name from URL or path + if '/' in src: + dep_name = src.split('/')[-1] + if dep_name: + dependencies.append(dep_name) + elif in_deps_section and not line.startswith(' ') and not line.startswith('-'): + break + except Exception as e: + logger.debug(f"Error parsing zigmod.yml: {e}") + + return dependencies + + def _parse_gyro_zzz(self, content: str) -> List[str]: + """Parse gyro.zzz file.""" + dependencies = [] + try: + # Look for deps section in gyro format + for line in content.splitlines(): + line = line.strip() + if line.startswith('deps:'): + # Extract dependencies from gyro format + deps_part = line[5:].strip() + if deps_part: + # Simple parsing of dependency list + for dep in deps_part.split(): + if dep: + dependencies.append(dep) + except Exception as e: + logger.debug(f"Error parsing gyro.zzz: {e}") + + return dependencies + + def get_package_name_from_import(self, import_path: str) -> str: + """Extract package name from Zig import path.""" + # Handle different Zig import patterns + if '/' in import_path: + # GitHub-style: owner/repo + parts = import_path.split('/') + if len(parts) >= 2: + return f"{parts[0]}/{parts[1]}" + return parts[0] + + # Remove .zig extension if present + if import_path.endswith('.zig'): + import_path = import_path[:-4] + + return import_path diff --git a/src/code_index_mcp/tools/scip/dependencies/normalizer.py b/src/code_index_mcp/tools/scip/dependencies/normalizer.py new file mode 100644 index 0000000..08c2f01 --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/normalizer.py @@ -0,0 +1,354 @@ +""" +Import path normalization utilities. + +This module provides utilities for normalizing import paths across different +languages and import styles for consistent dependency classification. +""" + +import re +import logging +from typing import Dict, List, Optional, Set, Callable +from urllib.parse import urlparse + +logger = logging.getLogger(__name__) + + +class ImportNormalizer: + """ + Import path normalization system. + + Provides language-specific import path normalization to ensure + consistent classification regardless of import style variations. + """ + + def __init__(self): + """Initialize the import normalizer.""" + self._normalizers: Dict[str, Callable[[str], str]] = {} + self._setup_default_normalizers() + + def _setup_default_normalizers(self) -> None: + """Setup default normalizers for supported languages.""" + self._normalizers.update({ + 'python': self._normalize_python_import, + 'javascript': self._normalize_javascript_import, + 'typescript': self._normalize_javascript_import, # Same as JS + 'zig': self._normalize_zig_import, + 'objective-c': self._normalize_objc_import, + 'java': self._normalize_java_import, + 'swift': self._normalize_swift_import, + 'go': self._normalize_go_import, + 'rust': self._normalize_rust_import, + }) + + def normalize_import_path(self, import_path: str, language: str) -> str: + """ + Normalize an import path based on language-specific rules. + + Args: + import_path: Raw import path to normalize + language: Programming language + + Returns: + Normalized import path + """ + if not import_path: + return import_path + + # Apply basic normalization first + normalized = self._basic_normalize(import_path) + + # Apply language-specific normalization + language_lower = language.lower() + if language_lower in self._normalizers: + normalized = self._normalizers[language_lower](normalized) + + logger.debug(f"Normalized {import_path} -> {normalized} ({language})") + return normalized + + def _basic_normalize(self, import_path: str) -> str: + """Apply basic normalization common to all languages.""" + # Strip whitespace + normalized = import_path.strip() + + # Remove quotes if present + if (normalized.startswith('"') and normalized.endswith('"')) or \ + (normalized.startswith("'") and normalized.endswith("'")): + normalized = normalized[1:-1] + + # Remove semicolons at the end + normalized = normalized.rstrip(';') + + return normalized + + def _normalize_python_import(self, import_path: str) -> str: + """Normalize Python import paths.""" + normalized = import_path + + # Handle namespace packages + if normalized.endswith('.__init__'): + normalized = normalized[:-9] + + # Convert file paths to module paths + normalized = normalized.replace('/', '.') + normalized = normalized.replace('\\', '.') + + # Remove .py extension if present + if normalized.endswith('.py'): + normalized = normalized[:-3] + + # Normalize multiple dots in relative imports + if normalized.startswith('.'): + # Count leading dots + dot_count = 0 + for char in normalized: + if char == '.': + dot_count += 1 + else: + break + + # Reconstruct with normalized dots + remaining = normalized[dot_count:] + if remaining: + normalized = '.' * dot_count + remaining + else: + normalized = '.' * dot_count + + return normalized + + def _normalize_javascript_import(self, import_path: str) -> str: + """Normalize JavaScript/TypeScript import paths.""" + normalized = import_path + + # Handle URL imports (for Deno or web) + if normalized.startswith(('http://', 'https://')): + parsed = urlparse(normalized) + # Extract package name from URL + path_parts = parsed.path.strip('/').split('/') + if path_parts: + normalized = path_parts[0] # Use first path component as package name + + # Remove common file extensions + extensions = ['.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs', '.json'] + for ext in extensions: + if normalized.endswith(ext): + normalized = normalized[:-len(ext)] + break + + # Remove /index suffix (common in Node.js) + if normalized.endswith('/index'): + normalized = normalized[:-6] + + # Handle scoped packages - ensure proper format + if normalized.startswith('@') and '/' in normalized: + parts = normalized.split('/') + if len(parts) >= 2: + # Keep only @scope/package part + normalized = f"{parts[0]}/{parts[1]}" + + # Convert Windows paths to forward slashes + normalized = normalized.replace('\\', '/') + + return normalized + + def _normalize_zig_import(self, import_path: str) -> str: + """Normalize Zig import paths.""" + normalized = import_path + + # Remove .zig extension + if normalized.endswith('.zig'): + normalized = normalized[:-4] + + # Convert Windows paths to forward slashes + normalized = normalized.replace('\\', '/') + + # Handle relative paths + if normalized.startswith('./'): + normalized = normalized[2:] + elif normalized.startswith('../'): + # Keep relative indicator but normalize + pass + + return normalized + + def _normalize_objc_import(self, import_path: str) -> str: + """Normalize Objective-C import paths.""" + normalized = import_path + + # Remove framework suffix + if normalized.endswith('.framework'): + normalized = normalized[:-10] + + # Remove common file extensions + extensions = ['.h', '.m', '.mm'] + for ext in extensions: + if normalized.endswith(ext): + normalized = normalized[:-len(ext)] + break + + # Extract framework name from paths + if '/' in normalized: + parts = normalized.split('/') + # For framework imports, usually want the framework name + # e.g., "UIKit/UIKit.h" -> "UIKit" + if len(parts) >= 2 and parts[0] == parts[-1]: + normalized = parts[0] + else: + # Use the last component + normalized = parts[-1] + + return normalized + + def _normalize_java_import(self, import_path: str) -> str: + """Normalize Java import paths.""" + normalized = import_path + + # Java imports are typically already normalized + # But handle any file extensions that might be present + if normalized.endswith('.java'): + normalized = normalized[:-5] + + # Convert file paths to package notation + normalized = normalized.replace('/', '.') + normalized = normalized.replace('\\', '.') + + return normalized + + def _normalize_swift_import(self, import_path: str) -> str: + """Normalize Swift import paths.""" + normalized = import_path + + # Remove .swift extension if present + if normalized.endswith('.swift'): + normalized = normalized[:-6] + + # Swift imports are typically module names, so minimal normalization needed + return normalized + + def _normalize_go_import(self, import_path: str) -> str: + """Normalize Go import paths.""" + normalized = import_path + + # Go imports are typically already well-formatted + # Remove any .go extension that might be present + if normalized.endswith('.go'): + normalized = normalized[:-3] + + # Convert Windows paths to forward slashes + normalized = normalized.replace('\\', '/') + + return normalized + + def _normalize_rust_import(self, import_path: str) -> str: + """Normalize Rust import paths.""" + normalized = import_path + + # Remove .rs extension if present + if normalized.endswith('.rs'): + normalized = normalized[:-3] + + # Convert :: to / for consistency (though :: is correct Rust syntax) + # This is for classification purposes only + normalized = normalized.replace('::', '/') + + return normalized + + def register_normalizer(self, language: str, normalizer: Callable[[str], str]) -> None: + """ + Register a custom normalizer for a language. + + Args: + language: Language name + normalizer: Function that takes import_path and returns normalized path + """ + self._normalizers[language.lower()] = normalizer + logger.debug(f"Registered custom normalizer for {language}") + + def get_supported_languages(self) -> Set[str]: + """ + Get set of languages with custom normalizers. + + Returns: + Set of supported language names + """ + return set(self._normalizers.keys()) + + def normalize_package_name(self, package_name: str, language: str) -> str: + """ + Normalize a package name for consistent lookup. + + Args: + package_name: Package name to normalize + language: Programming language + + Returns: + Normalized package name + """ + normalized = package_name.strip().lower() + + # Language-specific package name normalization + if language.lower() == 'python': + # Python package names use hyphens and underscores interchangeably + normalized = normalized.replace('_', '-') + elif language.lower() in ['javascript', 'typescript']: + # JavaScript packages typically use hyphens + # But handle scoped packages specially + if normalized.startswith('@'): + pass # Keep scoped packages as-is + else: + normalized = normalized.replace('_', '-') + elif language.lower() == 'zig': + # Zig packages typically use hyphens + normalized = normalized.replace('_', '-') + elif language.lower() == 'objective-c': + # Objective-C frameworks use CamelCase, preserve case + normalized = package_name.strip() + + return normalized + + def extract_base_package_name(self, import_path: str, language: str) -> str: + """ + Extract the base package name from an import path. + + Args: + import_path: Full import path + language: Programming language + + Returns: + Base package name + """ + normalized = self.normalize_import_path(import_path, language) + + if language.lower() in ['javascript', 'typescript']: + # Handle scoped packages + if normalized.startswith('@'): + parts = normalized.split('/') + if len(parts) >= 2: + return f"{parts[0]}/{parts[1]}" + return parts[0] + else: + return normalized.split('/')[0] + + elif language.lower() == 'python': + # Python: first component of dotted path + if normalized.startswith('.'): + # Relative import, return as-is + return normalized + return normalized.split('.')[0] + + elif language.lower() == 'zig': + # Zig: handle different import patterns + if '/' in normalized: + parts = normalized.split('/') + if len(parts) == 2: + # owner/repo pattern + return normalized + return parts[0] + return normalized + + elif language.lower() == 'objective-c': + # Objective-C: framework name + return normalized + + else: + # Default: first component + return normalized.split('/')[0].split('.')[0] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/registry.py b/src/code_index_mcp/tools/scip/dependencies/registry.py new file mode 100644 index 0000000..f74b9df --- /dev/null +++ b/src/code_index_mcp/tools/scip/dependencies/registry.py @@ -0,0 +1,371 @@ +""" +Dependency registry and caching system. + +This module provides centralized caching and registry functionality for +dependency classification results and metadata. +""" + +import time +import logging +from typing import Dict, Optional, Any, Set, List, Tuple +from collections import defaultdict, Counter + +logger = logging.getLogger(__name__) + + +class DependencyRegistry: + """ + Centralized registry and caching system for dependency classification. + + Provides: + - Classification result caching + - Dependency metadata storage + - Performance statistics + - Cache management and cleanup + """ + + def __init__(self, max_cache_size: int = 10000, cache_ttl: int = 3600): + """ + Initialize the dependency registry. + + Args: + max_cache_size: Maximum number of entries to cache + cache_ttl: Cache time-to-live in seconds + """ + self.max_cache_size = max_cache_size + self.cache_ttl = cache_ttl + + # Classification cache: {cache_key: (classification, timestamp)} + self._classification_cache: Dict[str, Tuple[str, float]] = {} + + # Dependency metadata cache: {language: {package: metadata}} + self._metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict) + + # Standard library cache: {language: (modules_set, timestamp)} + self._stdlib_cache: Dict[str, Tuple[Set[str], float]] = {} + + # Package manager file cache: {language: (files_set, timestamp)} + self._package_files_cache: Dict[str, Tuple[Set[str], float]] = {} + + # Statistics + self._stats = { + 'cache_hits': 0, + 'cache_misses': 0, + 'classifications_performed': 0, + 'cache_evictions': 0, + 'last_cleanup': time.time() + } + + # Classification counters + self._classification_counts = Counter() + + def cache_classification(self, cache_key: str, classification: str) -> None: + """ + Cache a dependency classification result. + + Args: + cache_key: Unique cache key for the classification + classification: Classification result to cache + """ + current_time = time.time() + + # Check if cache is full and needs cleanup + if len(self._classification_cache) >= self.max_cache_size: + self._cleanup_cache() + + # Store the classification with timestamp + self._classification_cache[cache_key] = (classification, current_time) + self._classification_counts[classification] += 1 + self._stats['classifications_performed'] += 1 + + logger.debug(f"Cached classification: {cache_key} -> {classification}") + + def get_cached_classification(self, cache_key: str) -> Optional[str]: + """ + Retrieve a cached classification result. + + Args: + cache_key: Cache key to look up + + Returns: + Cached classification or None if not found/expired + """ + if cache_key not in self._classification_cache: + self._stats['cache_misses'] += 1 + return None + + classification, timestamp = self._classification_cache[cache_key] + current_time = time.time() + + # Check if the cache entry has expired + if current_time - timestamp > self.cache_ttl: + del self._classification_cache[cache_key] + self._stats['cache_misses'] += 1 + logger.debug(f"Cache entry expired: {cache_key}") + return None + + self._stats['cache_hits'] += 1 + return classification + + def cache_dependency_metadata( + self, + language: str, + package_name: str, + metadata: Dict[str, Any] + ) -> None: + """ + Cache dependency metadata. + + Args: + language: Programming language + package_name: Package/dependency name + metadata: Metadata to cache + """ + self._metadata_cache[language][package_name] = { + **metadata, + 'cached_at': time.time() + } + logger.debug(f"Cached metadata for {language}:{package_name}") + + def get_cached_metadata( + self, + language: str, + package_name: str + ) -> Optional[Dict[str, Any]]: + """ + Retrieve cached dependency metadata. + + Args: + language: Programming language + package_name: Package/dependency name + + Returns: + Cached metadata or None if not found/expired + """ + if language not in self._metadata_cache: + return None + + if package_name not in self._metadata_cache[language]: + return None + + metadata = self._metadata_cache[language][package_name] + current_time = time.time() + + # Check if metadata has expired + cached_at = metadata.get('cached_at', 0) + if current_time - cached_at > self.cache_ttl: + del self._metadata_cache[language][package_name] + return None + + return metadata + + def cache_standard_library_modules(self, language: str, modules: Set[str]) -> None: + """ + Cache standard library modules for a language. + + Args: + language: Programming language + modules: Set of standard library module names + """ + self._stdlib_cache[language] = (modules, time.time()) + logger.debug(f"Cached {len(modules)} stdlib modules for {language}") + + def get_cached_standard_library_modules(self, language: str) -> Optional[Set[str]]: + """ + Retrieve cached standard library modules. + + Args: + language: Programming language + + Returns: + Set of standard library modules or None if not cached/expired + """ + if language not in self._stdlib_cache: + return None + + modules, timestamp = self._stdlib_cache[language] + current_time = time.time() + + # Stdlib modules rarely change, use longer TTL + if current_time - timestamp > self.cache_ttl * 24: # 24x longer TTL + del self._stdlib_cache[language] + return None + + return modules + + def cache_package_manager_files(self, language: str, files: Set[str]) -> None: + """ + Cache package manager files for a language. + + Args: + language: Programming language + files: Set of package manager file names + """ + self._package_files_cache[language] = (files, time.time()) + logger.debug(f"Cached {len(files)} package manager files for {language}") + + def get_cached_package_manager_files(self, language: str) -> Optional[Set[str]]: + """ + Retrieve cached package manager files. + + Args: + language: Programming language + + Returns: + Set of package manager files or None if not cached/expired + """ + if language not in self._package_files_cache: + return None + + files, timestamp = self._package_files_cache[language] + current_time = time.time() + + # Package manager files rarely change, use longer TTL + if current_time - timestamp > self.cache_ttl * 12: # 12x longer TTL + del self._package_files_cache[language] + return None + + return files + + def get_dependency_list(self, language: str, classification: str) -> List[str]: + """ + Get list of dependencies of a specific classification for a language. + + Args: + language: Programming language + classification: Classification type to filter by + + Returns: + List of dependency names + """ + if language not in self._metadata_cache: + return [] + + dependencies = [] + for package_name, metadata in self._metadata_cache[language].items(): + if metadata.get('classification') == classification: + dependencies.append(package_name) + + return dependencies + + def get_classification_summary(self) -> Dict[str, int]: + """ + Get summary of classification counts. + + Returns: + Dictionary with classification counts + """ + return dict(self._classification_counts) + + def _cleanup_cache(self) -> None: + """Clean up expired cache entries.""" + current_time = time.time() + + # Clean classification cache + expired_keys = [] + for cache_key, (classification, timestamp) in self._classification_cache.items(): + if current_time - timestamp > self.cache_ttl: + expired_keys.append(cache_key) + + for key in expired_keys: + del self._classification_cache[key] + self._stats['cache_evictions'] += 1 + + # Clean metadata cache + for language in list(self._metadata_cache.keys()): + expired_packages = [] + for package, metadata in self._metadata_cache[language].items(): + cached_at = metadata.get('cached_at', 0) + if current_time - cached_at > self.cache_ttl: + expired_packages.append(package) + + for package in expired_packages: + del self._metadata_cache[language][package] + + # Remove empty language entries + if not self._metadata_cache[language]: + del self._metadata_cache[language] + + # Clean stdlib cache + expired_langs = [] + for language, (modules, timestamp) in self._stdlib_cache.items(): + if current_time - timestamp > self.cache_ttl * 24: + expired_langs.append(language) + + for lang in expired_langs: + del self._stdlib_cache[lang] + + # Clean package files cache + expired_langs = [] + for language, (files, timestamp) in self._package_files_cache.items(): + if current_time - timestamp > self.cache_ttl * 12: + expired_langs.append(language) + + for lang in expired_langs: + del self._package_files_cache[lang] + + self._stats['last_cleanup'] = current_time + logger.debug(f"Cache cleanup completed, evicted {len(expired_keys)} classification entries") + + def clear_cache(self) -> None: + """Clear all cached data.""" + self._classification_cache.clear() + self._metadata_cache.clear() + self._stdlib_cache.clear() + self._package_files_cache.clear() + + # Reset stats but keep historical counters + self._stats.update({ + 'cache_hits': 0, + 'cache_misses': 0, + 'cache_evictions': 0, + 'last_cleanup': time.time() + }) + + logger.debug("Cleared all dependency registry cache") + + def get_stats(self) -> Dict[str, Any]: + """ + Get registry statistics. + + Returns: + Dictionary with statistics + """ + current_time = time.time() + + stats = { + **self._stats, + 'cache_size': len(self._classification_cache), + 'metadata_entries': sum(len(packages) for packages in self._metadata_cache.values()), + 'stdlib_languages': len(self._stdlib_cache), + 'package_files_languages': len(self._package_files_cache), + 'classification_counts': dict(self._classification_counts), + 'cache_hit_rate': ( + self._stats['cache_hits'] / + max(1, self._stats['cache_hits'] + self._stats['cache_misses']) + ), + 'uptime': current_time - self._stats['last_cleanup'] + } + + return stats + + def optimize_cache(self) -> None: + """Optimize cache for better performance.""" + # Remove least recently used entries if cache is getting full + if len(self._classification_cache) > self.max_cache_size * 0.8: + current_time = time.time() + + # Sort by timestamp and remove oldest entries + sorted_entries = sorted( + self._classification_cache.items(), + key=lambda x: x[1][1] # Sort by timestamp + ) + + # Remove oldest 20% of entries + remove_count = int(len(sorted_entries) * 0.2) + for i in range(remove_count): + cache_key, (classification, timestamp) = sorted_entries[i] + del self._classification_cache[cache_key] + self._stats['cache_evictions'] += 1 + + logger.debug(f"Optimized cache, removed {remove_count} oldest entries") diff --git a/src/code_index_mcp/tools/scip/position/__init__.py b/src/code_index_mcp/tools/scip/position/__init__.py new file mode 100644 index 0000000..c684147 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/__init__.py @@ -0,0 +1,46 @@ +""" +Position resolution system for SCIP symbols. + +This package provides the modular position resolution system that replaces +complex position detection logic in SCIPSymbolAnalyzer, following the +refactoring plan for better maintainability and accuracy. + +Key Components: +- PositionResolver: Main position resolution engine using strategy pattern +- PositionStrategy: Abstract base for position detection strategies +- SCIPOccurrenceStrategy: SCIP occurrence-based position detection (high confidence) +- TreeSitterStrategy: Tree-sitter AST-based position detection (medium confidence) +- HeuristicStrategy: Fallback heuristic position detection (low confidence) +- PositionCalculator: Utility for position calculations and conversions +- LocationInfo: Enhanced location information with confidence levels + +The system provides: +- Multi-layered position detection with confidence scoring +- Fallback mechanisms for robust symbol location +- Caching for performance optimization +- Integration with SCIPSymbolManager +- Support for different SCIP symbol formats +""" + +from .resolver import PositionResolver, get_position_resolver, resolve_position +from .calculator import PositionCalculator +from .confidence import ConfidenceLevel, LocationInfo +from .strategies import ( + PositionStrategy, + SCIPOccurrenceStrategy, + TreeSitterStrategy, + HeuristicStrategy +) + +__all__ = [ + 'PositionResolver', + 'get_position_resolver', + 'resolve_position', + 'PositionCalculator', + 'ConfidenceLevel', + 'LocationInfo', + 'PositionStrategy', + 'SCIPOccurrenceStrategy', + 'TreeSitterStrategy', + 'HeuristicStrategy' +] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/calculator.py b/src/code_index_mcp/tools/scip/position/calculator.py new file mode 100644 index 0000000..84694fd --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/calculator.py @@ -0,0 +1,394 @@ +""" +Position calculation utilities. + +This module provides utilities for position calculations, conversions, +and position-related operations for SCIP symbol analysis. +""" + +import logging +from typing import Optional, Dict, Any, List, Tuple +from .confidence import LocationInfo, ConfidenceLevel + +logger = logging.getLogger(__name__) + + +class PositionCalculator: + """ + Utility class for position calculations and conversions. + + Provides methods for: + - Converting between different position formats + - Calculating position offsets and distances + - Validating and normalizing positions + - Estimating positions based on context + """ + + def __init__(self): + """Initialize the position calculator.""" + self._line_cache: Dict[str, List[int]] = {} # Cache for line start byte positions + + def convert_byte_to_line_column( + self, + byte_offset: int, + file_content: str, + file_key: Optional[str] = None + ) -> Tuple[int, int]: + """ + Convert byte offset to line and column numbers. + + Args: + byte_offset: Byte offset in file + file_content: File content string + file_key: Optional cache key for the file + + Returns: + Tuple of (line, column) - both 1-based + """ + if byte_offset < 0: + return 1, 1 + + if byte_offset >= len(file_content): + # Return end of file position + lines = file_content.splitlines() + if lines: + return len(lines), len(lines[-1]) + 1 + return 1, 1 + + # Get line start positions (cached) + line_starts = self._get_line_starts(file_content, file_key) + + # Binary search to find line + line_number = self._binary_search_line(line_starts, byte_offset) + + # Calculate column within the line + line_start = line_starts[line_number - 1] # line_number is 1-based + column = byte_offset - line_start + 1 # Convert to 1-based + + return line_number, column + + def convert_line_column_to_byte( + self, + line: int, + column: int, + file_content: str, + file_key: Optional[str] = None + ) -> int: + """ + Convert line and column to byte offset. + + Args: + line: Line number (1-based) + column: Column number (1-based) + file_content: File content string + file_key: Optional cache key for the file + + Returns: + Byte offset in file + """ + if line < 1 or column < 1: + return 0 + + # Get line start positions (cached) + line_starts = self._get_line_starts(file_content, file_key) + + if line > len(line_starts): + # Beyond end of file + return len(file_content) + + line_start = line_starts[line - 1] # Convert to 0-based + byte_offset = line_start + column - 1 # Convert column to 0-based + + # Ensure we don't go beyond file end + return min(byte_offset, len(file_content)) + + def estimate_position_by_symbol_type( + self, + symbol_type: str, + document_info: Optional[Dict[str, Any]] = None + ) -> LocationInfo: + """ + Estimate position based on symbol type characteristics. + + Args: + symbol_type: Type of symbol (class, function, variable, etc.) + document_info: Optional document information for better estimation + + Returns: + LocationInfo with estimated position + """ + # Default positions based on common patterns + type_positions = { + 'class': (1, 1), # Classes usually at file start + 'interface': (1, 1), # Interfaces usually at file start + 'module': (1, 1), # Modules at file start + 'namespace': (1, 1), # Namespaces at file start + 'function': (5, 1), # Functions after imports + 'method': (10, 5), # Methods inside classes + 'variable': (3, 1), # Variables after imports + 'constant': (2, 1), # Constants near file start + 'field': (8, 5), # Fields inside classes/structs + 'property': (12, 5), # Properties inside classes + 'enum': (1, 1), # Enums at file start + 'enum_member': (15, 5), # Enum members inside enums + } + + default_line, default_column = type_positions.get(symbol_type, (1, 1)) + + # Adjust based on document info + if document_info: + # If we have information about document size, adjust positions + estimated_lines = document_info.get('estimated_lines', 100) + symbol_count = document_info.get('symbol_count', 10) + + if symbol_count > 0: + # Distribute symbols throughout the file + if symbol_type in ['method', 'field', 'property']: + # These are typically inside classes, estimate deeper in file + default_line = min(estimated_lines // 2, default_line + symbol_count) + elif symbol_type in ['function', 'variable']: + # These might be distributed throughout + default_line = min(estimated_lines // 3, default_line + (symbol_count // 2)) + + return LocationInfo.from_heuristic( + line=default_line, + column=default_column, + heuristic_type=f"symbol_type_{symbol_type}", + method="position_calculator_estimate" + ) + + def estimate_position_in_class( + self, + class_location: LocationInfo, + member_index: int = 0, + member_type: str = "method" + ) -> LocationInfo: + """ + Estimate position of a class member relative to class location. + + Args: + class_location: Location of the containing class + member_index: Index of the member within the class + member_type: Type of class member + + Returns: + LocationInfo with estimated member position + """ + if not class_location.is_reliable(): + # If class location is unreliable, use basic estimation + return self.estimate_position_by_symbol_type(member_type) + + # Estimate member position based on class location + base_line = class_location.line + base_column = class_location.column + + # Different member types have different typical offsets + member_offsets = { + 'field': (2, 4), + 'property': (3, 4), + 'method': (4, 4), + 'constructor': (1, 4), + 'destructor': (5, 4), + } + + line_offset, column_offset = member_offsets.get(member_type, (3, 4)) + + # Add index-based spacing + estimated_line = base_line + line_offset + (member_index * 2) + estimated_column = base_column + column_offset + + metadata = { + 'class_line': class_location.line, + 'class_column': class_location.column, + 'member_index': member_index, + 'member_type': member_type, + 'based_on_class_location': True + } + + return LocationInfo( + line=estimated_line, + column=estimated_column, + confidence=ConfidenceLevel.LOW, + method="class_member_estimation", + metadata=metadata + ) + + def calculate_distance(self, loc1: LocationInfo, loc2: LocationInfo) -> int: + """ + Calculate distance between two locations (in lines). + + Args: + loc1: First location + loc2: Second location + + Returns: + Distance in lines (absolute value) + """ + return abs(loc1.line - loc2.line) + + def is_within_range( + self, + location: LocationInfo, + start_line: int, + end_line: int + ) -> bool: + """ + Check if location is within a line range. + + Args: + location: Location to check + start_line: Start of range (inclusive) + end_line: End of range (inclusive) + + Returns: + True if location is within range + """ + return start_line <= location.line <= end_line + + def adjust_position_for_language( + self, + location: LocationInfo, + language: str, + symbol_type: str + ) -> LocationInfo: + """ + Adjust position based on language-specific conventions. + + Args: + location: Original location + language: Programming language + symbol_type: Type of symbol + + Returns: + Adjusted LocationInfo + """ + # Language-specific adjustments + adjustments = { + 'python': self._adjust_for_python, + 'javascript': self._adjust_for_javascript, + 'typescript': self._adjust_for_javascript, # Same as JS + 'zig': self._adjust_for_zig, + 'objective-c': self._adjust_for_objc, + } + + adjust_func = adjustments.get(language.lower()) + if adjust_func: + return adjust_func(location, symbol_type) + + return location + + def validate_position( + self, + location: LocationInfo, + max_line: Optional[int] = None, + max_column: Optional[int] = None + ) -> LocationInfo: + """ + Validate and correct position if necessary. + + Args: + location: Location to validate + max_line: Maximum valid line number + max_column: Maximum valid column number + + Returns: + Validated LocationInfo + """ + corrected_line = max(1, location.line) + corrected_column = max(1, location.column) + + if max_line and corrected_line > max_line: + corrected_line = max_line + + if max_column and corrected_column > max_column: + corrected_column = max_column + + if corrected_line != location.line or corrected_column != location.column: + # Position was corrected, update metadata + validated_location = LocationInfo( + line=corrected_line, + column=corrected_column, + confidence=location.confidence, + method=location.method, + metadata=location.metadata.copy() if location.metadata else {} + ) + + validated_location.add_metadata('position_corrected', True) + validated_location.add_metadata('original_line', location.line) + validated_location.add_metadata('original_column', location.column) + + return validated_location + + return location + + def _get_line_starts(self, file_content: str, file_key: Optional[str]) -> List[int]: + """Get cached line start positions.""" + if file_key and file_key in self._line_cache: + return self._line_cache[file_key] + + line_starts = [0] # First line starts at byte 0 + for i, char in enumerate(file_content): + if char == '\n': + line_starts.append(i + 1) + + if file_key: + self._line_cache[file_key] = line_starts + + return line_starts + + def _binary_search_line(self, line_starts: List[int], byte_offset: int) -> int: + """Binary search to find line number for byte offset.""" + left, right = 0, len(line_starts) - 1 + + while left <= right: + mid = (left + right) // 2 + + if mid == len(line_starts) - 1: + # Last line + return mid + 1 + elif line_starts[mid] <= byte_offset < line_starts[mid + 1]: + return mid + 1 # Convert to 1-based + elif byte_offset < line_starts[mid]: + right = mid - 1 + else: + left = mid + 1 + + return len(line_starts) # Fallback to last line + + def _adjust_for_python(self, location: LocationInfo, symbol_type: str) -> LocationInfo: + """Python-specific position adjustments.""" + # Python functions/classes typically have decorators above them + if symbol_type in ['function', 'method', 'class'] and location.line > 1: + # Assume decorators might be present, adjust upward slightly + adjusted_line = max(1, location.line - 1) + if adjusted_line != location.line: + location.add_metadata('python_decorator_adjustment', True) + location.line = adjusted_line + + return location + + def _adjust_for_javascript(self, location: LocationInfo, symbol_type: str) -> LocationInfo: + """JavaScript/TypeScript-specific position adjustments.""" + # No specific adjustments needed for now + return location + + def _adjust_for_zig(self, location: LocationInfo, symbol_type: str) -> LocationInfo: + """Zig-specific position adjustments.""" + # No specific adjustments needed for now + return location + + def _adjust_for_objc(self, location: LocationInfo, symbol_type: str) -> LocationInfo: + """Objective-C specific position adjustments.""" + # Objective-C methods often have + or - prefix + if symbol_type == 'method' and location.column > 1: + # Adjust column to account for method prefix + adjusted_column = max(1, location.column - 1) + if adjusted_column != location.column: + location.add_metadata('objc_method_prefix_adjustment', True) + location.column = adjusted_column + + return location + + def clear_cache(self) -> None: + """Clear the line position cache.""" + self._line_cache.clear() + logger.debug("Cleared position calculator cache") diff --git a/src/code_index_mcp/tools/scip/position/confidence.py b/src/code_index_mcp/tools/scip/position/confidence.py new file mode 100644 index 0000000..f063f82 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/confidence.py @@ -0,0 +1,317 @@ +""" +Confidence level management and enhanced location information. + +This module provides enhanced location information with confidence levels +for position resolution results. +""" + +import logging +from enum import Enum +from typing import Optional, Dict, Any +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + + +class ConfidenceLevel(Enum): + """ + Position detection confidence levels. + + Indicates the reliability of position detection results based on + the method used and available data quality. + """ + HIGH = "high" # SCIP occurrence data with exact positions + MEDIUM = "medium" # Tree-sitter AST analysis or symbol structure inference + LOW = "low" # Heuristic fallback or partial data + UNKNOWN = "unknown" # Default/fallback position with minimal confidence + + def __lt__(self, other): + """Allow confidence level comparison.""" + if not isinstance(other, ConfidenceLevel): + return NotImplemented + order = [ConfidenceLevel.UNKNOWN, ConfidenceLevel.LOW, ConfidenceLevel.MEDIUM, ConfidenceLevel.HIGH] + return order.index(self) < order.index(other) + + def __le__(self, other): + return self < other or self == other + + def __gt__(self, other): + return not self <= other + + def __ge__(self, other): + return not self < other + + +@dataclass +class LocationInfo: + """ + Enhanced location information with confidence and metadata. + + Provides comprehensive location information including confidence levels, + detection method metadata, and optional context information. + """ + line: int + column: int + confidence: ConfidenceLevel = ConfidenceLevel.UNKNOWN + method: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + def __post_init__(self): + """Validate location information after initialization.""" + if self.line < 1: + logger.warning(f"Invalid line number: {self.line}, setting to 1") + self.line = 1 + + if self.column < 1: + logger.warning(f"Invalid column number: {self.column}, setting to 1") + self.column = 1 + + if self.metadata is None: + self.metadata = {} + + @classmethod + def from_scip_occurrence(cls, occurrence, method: str = "scip_occurrence") -> 'LocationInfo': + """ + Create LocationInfo from SCIP occurrence data. + + Args: + occurrence: SCIP occurrence object + method: Detection method name + + Returns: + LocationInfo with high confidence + """ + try: + if not hasattr(occurrence, 'range') or not occurrence.range: + return cls.default_location(method="scip_occurrence_no_range") + + range_obj = occurrence.range + if not hasattr(range_obj, 'start') or not range_obj.start: + return cls.default_location(method="scip_occurrence_no_start") + + start = range_obj.start + if len(start) >= 2: + # SCIP uses 0-based indexing, convert to 1-based + line = start[0] + 1 + column = start[1] + 1 + + metadata = { + 'scip_range_available': True, + 'range_length': len(start), + 'raw_line': start[0], + 'raw_column': start[1] + } + + # Add end position if available + if hasattr(range_obj, 'end') and range_obj.end and len(range_obj.end) >= 2: + metadata.update({ + 'end_line': range_obj.end[0] + 1, + 'end_column': range_obj.end[1] + 1, + 'span_lines': range_obj.end[0] - start[0] + 1 + }) + + return cls( + line=line, + column=column, + confidence=ConfidenceLevel.HIGH, + method=method, + metadata=metadata + ) + + except (AttributeError, IndexError, TypeError) as e: + logger.debug(f"Error creating LocationInfo from SCIP occurrence: {e}") + + return cls.default_location(method="scip_occurrence_error") + + @classmethod + def from_tree_sitter( + cls, + line: int, + column: int, + node_info: Optional[Dict[str, Any]] = None, + method: str = "tree_sitter" + ) -> 'LocationInfo': + """ + Create LocationInfo from Tree-sitter analysis. + + Args: + line: Line number (1-based) + column: Column number (1-based) + node_info: Optional AST node information + method: Detection method name + + Returns: + LocationInfo with medium confidence + """ + metadata = { + 'tree_sitter_analysis': True + } + + if node_info: + metadata.update({ + 'node_type': node_info.get('type'), + 'node_text': node_info.get('text', '')[:50], # Truncate long text + 'node_start_byte': node_info.get('start_byte'), + 'node_end_byte': node_info.get('end_byte'), + 'node_children_count': node_info.get('children_count', 0) + }) + + return cls( + line=max(1, line), + column=max(1, column), + confidence=ConfidenceLevel.MEDIUM, + method=method, + metadata=metadata + ) + + @classmethod + def from_heuristic( + cls, + line: int, + column: int, + heuristic_type: str, + method: str = "heuristic" + ) -> 'LocationInfo': + """ + Create LocationInfo from heuristic analysis. + + Args: + line: Line number (1-based) + column: Column number (1-based) + heuristic_type: Type of heuristic used + method: Detection method name + + Returns: + LocationInfo with low confidence + """ + metadata = { + 'heuristic_type': heuristic_type, + 'estimated': True + } + + return cls( + line=max(1, line), + column=max(1, column), + confidence=ConfidenceLevel.LOW, + method=method, + metadata=metadata + ) + + @classmethod + def default_location(cls, method: str = "default") -> 'LocationInfo': + """ + Create default LocationInfo for fallback cases. + + Args: + method: Detection method name + + Returns: + LocationInfo with unknown confidence at (1,1) + """ + return cls( + line=1, + column=1, + confidence=ConfidenceLevel.UNKNOWN, + method=method, + metadata={'fallback': True} + ) + + def is_reliable(self) -> bool: + """ + Check if the location information is reliable. + + Returns: + True if confidence is medium or high + """ + return self.confidence in (ConfidenceLevel.HIGH, ConfidenceLevel.MEDIUM) + + def is_high_confidence(self) -> bool: + """ + Check if the location has high confidence. + + Returns: + True if confidence is high + """ + return self.confidence == ConfidenceLevel.HIGH + + def update_confidence(self, new_confidence: ConfidenceLevel, reason: str = "") -> None: + """ + Update confidence level with optional reason. + + Args: + new_confidence: New confidence level + reason: Optional reason for the update + """ + old_confidence = self.confidence + self.confidence = new_confidence + + if not self.metadata: + self.metadata = {} + + self.metadata.update({ + 'confidence_updated': True, + 'previous_confidence': old_confidence.value, + 'update_reason': reason + }) + + logger.debug(f"Updated confidence from {old_confidence.value} to {new_confidence.value}: {reason}") + + def add_metadata(self, key: str, value: Any) -> None: + """ + Add metadata information. + + Args: + key: Metadata key + value: Metadata value + """ + if not self.metadata: + self.metadata = {} + self.metadata[key] = value + + def to_dict(self) -> Dict[str, Any]: + """ + Convert LocationInfo to dictionary. + + Returns: + Dictionary representation + """ + return { + 'line': self.line, + 'column': self.column, + 'confidence': self.confidence.value, + 'method': self.method, + 'metadata': self.metadata or {} + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'LocationInfo': + """ + Create LocationInfo from dictionary. + + Args: + data: Dictionary with location data + + Returns: + LocationInfo instance + """ + confidence_str = data.get('confidence', 'unknown') + try: + confidence = ConfidenceLevel(confidence_str) + except ValueError: + confidence = ConfidenceLevel.UNKNOWN + + return cls( + line=data.get('line', 1), + column=data.get('column', 1), + confidence=confidence, + method=data.get('method'), + metadata=data.get('metadata', {}) + ) + + def __str__(self) -> str: + """String representation of LocationInfo.""" + return f"LocationInfo(line={self.line}, column={self.column}, confidence={self.confidence.value})" + + def __repr__(self) -> str: + """Detailed string representation.""" + return f"LocationInfo(line={self.line}, column={self.column}, confidence={self.confidence.value}, method={self.method})" \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/resolver.py b/src/code_index_mcp/tools/scip/position/resolver.py new file mode 100644 index 0000000..fdc32b2 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/resolver.py @@ -0,0 +1,436 @@ +""" +Main position resolution system. + +This module provides the PositionResolver that coordinates different position +detection strategies to find symbol positions with appropriate confidence levels. +""" + +import logging +from typing import Optional, Dict, Any, List +from .confidence import LocationInfo, ConfidenceLevel +from .strategies.scip_occurrence import SCIPOccurrenceStrategy +from .strategies.tree_sitter_strategy import TreeSitterStrategy +from .strategies.heuristic import HeuristicStrategy +from .strategies.base import PositionStrategy + +logger = logging.getLogger(__name__) + + +class PositionResolver: + """ + Main position resolution coordinator. + + This class manages multiple position detection strategies and applies them + in order of confidence level to find the best possible position for SCIP symbols. + + Strategy Order (by confidence): + 1. SCIPOccurrenceStrategy (HIGH) - Uses SCIP occurrence data + 2. TreeSitterStrategy (MEDIUM) - Uses AST analysis + 3. HeuristicStrategy (LOW) - Uses pattern matching and estimation + """ + + def __init__(self): + """Initialize the position resolver with default strategies.""" + self._strategies: List[PositionStrategy] = [] + self._strategy_cache: Dict[str, PositionStrategy] = {} + self._resolution_cache: Dict[str, LocationInfo] = {} + self._setup_default_strategies() + + def _setup_default_strategies(self) -> None: + """Setup default position detection strategies in order of confidence.""" + self._strategies = [ + SCIPOccurrenceStrategy(), # Highest confidence + TreeSitterStrategy(), # Medium confidence + HeuristicStrategy() # Lowest confidence (fallback) + ] + + # Build strategy cache for quick lookup + for strategy in self._strategies: + self._strategy_cache[strategy.name] = strategy + + logger.debug(f"Initialized position resolver with {len(self._strategies)} strategies") + + def resolve_position( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None, + preferred_confidence: Optional[ConfidenceLevel] = None + ) -> Optional[LocationInfo]: + """ + Resolve position for a SCIP symbol using the best available strategy. + + Args: + scip_symbol: SCIP symbol identifier + document: Document containing source text or SCIP data + context: Optional context information (file path, project info, etc.) + preferred_confidence: Minimum confidence level required + + Returns: + LocationInfo with the best confidence available, or None if not found + """ + if not scip_symbol: + return None + + # Check cache first + cache_key = self._create_cache_key(scip_symbol, context) + if cache_key in self._resolution_cache: + cached_result = self._resolution_cache[cache_key] + if self._meets_confidence_requirement(cached_result, preferred_confidence): + return cached_result + + # Try strategies in order of confidence + best_location = None + + for strategy in self._strategies: + try: + # Check if strategy can handle this symbol + if not strategy.can_handle_symbol(scip_symbol, document): + continue + + # Try to resolve position + location = strategy.try_resolve(scip_symbol, document, context) + + if location: + # Add strategy information to metadata + location.add_metadata('strategy_used', strategy.name) + location.add_metadata('strategy_confidence', strategy.get_confidence_level().value) + + # Check if this meets our confidence requirements + if self._meets_confidence_requirement(location, preferred_confidence): + # Cache and return immediately if confidence requirement is met + self._resolution_cache[cache_key] = location + logger.debug(f"Resolved {scip_symbol} using {strategy.name} with {location.confidence.value} confidence") + return location + + # Keep track of best location found so far + if not best_location or location.confidence > best_location.confidence: + best_location = location + + except Exception as e: + logger.debug(f"Strategy {strategy.name} failed for {scip_symbol}: {e}") + continue + + # Cache the best result found (even if it doesn't meet preferred confidence) + if best_location: + self._resolution_cache[cache_key] = best_location + logger.debug(f"Resolved {scip_symbol} using fallback with {best_location.confidence.value} confidence") + + return best_location + + def resolve_multiple_positions( + self, + symbols: List[str], + document, + context: Optional[Dict[str, Any]] = None + ) -> Dict[str, Optional[LocationInfo]]: + """ + Resolve positions for multiple SCIP symbols efficiently. + + Args: + symbols: List of SCIP symbol identifiers + document: Document containing source text or SCIP data + context: Optional context information + + Returns: + Dictionary mapping symbol -> LocationInfo (or None if not found) + """ + results = {} + + for symbol in symbols: + results[symbol] = self.resolve_position(symbol, document, context) + + return results + + def try_strategy( + self, + strategy_name: str, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Optional[LocationInfo]: + """ + Try a specific strategy to resolve a position. + + Args: + strategy_name: Name of the strategy to use + scip_symbol: SCIP symbol identifier + document: Document containing source text or SCIP data + context: Optional context information + + Returns: + LocationInfo if the strategy succeeds, None otherwise + """ + if strategy_name not in self._strategy_cache: + logger.warning(f"Unknown strategy: {strategy_name}") + return None + + strategy = self._strategy_cache[strategy_name] + + if not strategy.can_handle_symbol(scip_symbol, document): + return None + + try: + location = strategy.try_resolve(scip_symbol, document, context) + if location: + location.add_metadata('strategy_used', strategy.name) + location.add_metadata('strategy_confidence', strategy.get_confidence_level().value) + return location + except Exception as e: + logger.debug(f"Strategy {strategy_name} failed for {scip_symbol}: {e}") + return None + + def get_available_strategies(self) -> List[str]: + """ + Get list of available strategy names. + + Returns: + List of strategy names + """ + return [strategy.name for strategy in self._strategies] + + def get_strategy_info(self) -> List[Dict[str, Any]]: + """ + Get information about all available strategies. + + Returns: + List of dictionaries with strategy information + """ + return [ + { + 'name': strategy.name, + 'confidence_level': strategy.get_confidence_level().value, + 'description': strategy.__class__.__doc__.strip().split('\n')[0] if strategy.__class__.__doc__ else '' + } + for strategy in self._strategies + ] + + def add_strategy(self, strategy: PositionStrategy, priority: Optional[int] = None) -> None: + """ + Add a custom position detection strategy. + + Args: + strategy: PositionStrategy instance to add + priority: Optional priority (lower number = higher priority) + If None, adds at appropriate position based on confidence + """ + if priority is not None: + self._strategies.insert(priority, strategy) + else: + # Insert based on confidence level + inserted = False + for i, existing_strategy in enumerate(self._strategies): + if strategy.get_confidence_level() > existing_strategy.get_confidence_level(): + self._strategies.insert(i, strategy) + inserted = True + break + + if not inserted: + self._strategies.append(strategy) + + # Update cache + self._strategy_cache[strategy.name] = strategy + + logger.debug(f"Added strategy {strategy.name} with {strategy.get_confidence_level().value} confidence") + + def remove_strategy(self, strategy_name: str) -> bool: + """ + Remove a strategy by name. + + Args: + strategy_name: Name of the strategy to remove + + Returns: + True if strategy was removed, False if not found + """ + if strategy_name not in self._strategy_cache: + return False + + strategy = self._strategy_cache[strategy_name] + self._strategies.remove(strategy) + del self._strategy_cache[strategy_name] + + logger.debug(f"Removed strategy {strategy_name}") + return True + + def clear_cache(self) -> None: + """Clear all cached resolution results.""" + self._resolution_cache.clear() + logger.debug("Cleared position resolution cache") + + def get_cache_stats(self) -> Dict[str, Any]: + """ + Get cache statistics. + + Returns: + Dictionary with cache statistics + """ + return { + 'cache_size': len(self._resolution_cache), + 'strategies_count': len(self._strategies), + 'strategy_names': self.get_available_strategies() + } + + def find_best_positions( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None, + max_results: int = 3 + ) -> List[LocationInfo]: + """ + Find multiple possible positions for a symbol using different strategies. + + Args: + scip_symbol: SCIP symbol identifier + document: Document containing source text or SCIP data + context: Optional context information + max_results: Maximum number of results to return + + Returns: + List of LocationInfo objects sorted by confidence + """ + positions = [] + + for strategy in self._strategies[:max_results]: + try: + if strategy.can_handle_symbol(scip_symbol, document): + location = strategy.try_resolve(scip_symbol, document, context) + if location: + location.add_metadata('strategy_used', strategy.name) + location.add_metadata('strategy_confidence', strategy.get_confidence_level().value) + positions.append(location) + except Exception as e: + logger.debug(f"Strategy {strategy.name} failed for {scip_symbol}: {e}") + + # Sort by confidence level (highest first) + positions.sort(key=lambda x: x.confidence, reverse=True) + + return positions[:max_results] + + def _create_cache_key(self, scip_symbol: str, context: Optional[Dict[str, Any]]) -> str: + """Create a cache key for resolution results.""" + if not context: + return scip_symbol + + # Include relevant context in cache key + relevant_keys = ['file_path', 'language', 'project_path'] + context_parts = [] + + for key in relevant_keys: + if key in context: + context_parts.append(f"{key}:{context[key]}") + + if context_parts: + return f"{scip_symbol}#{':'.join(context_parts)}" + return scip_symbol + + def _meets_confidence_requirement( + self, + location: LocationInfo, + preferred_confidence: Optional[ConfidenceLevel] + ) -> bool: + """Check if location meets the preferred confidence requirement.""" + if preferred_confidence is None: + return True + return location.confidence >= preferred_confidence + + def diagnose_resolution( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Diagnose position resolution for debugging purposes. + + Args: + scip_symbol: SCIP symbol identifier + document: Document containing source text or SCIP data + context: Optional context information + + Returns: + Dictionary with diagnostic information + """ + diagnosis = { + 'symbol': scip_symbol, + 'strategies_tested': [], + 'successful_strategies': [], + 'failed_strategies': [], + 'best_result': None, + 'context_available': context is not None, + 'document_type': type(document).__name__ + } + + for strategy in self._strategies: + strategy_info = { + 'name': strategy.name, + 'confidence_level': strategy.get_confidence_level().value, + 'can_handle': False, + 'result': None, + 'error': None + } + + try: + strategy_info['can_handle'] = strategy.can_handle_symbol(scip_symbol, document) + + if strategy_info['can_handle']: + location = strategy.try_resolve(scip_symbol, document, context) + if location: + strategy_info['result'] = location.to_dict() + diagnosis['successful_strategies'].append(strategy.name) + + if not diagnosis['best_result'] or location.confidence > ConfidenceLevel(diagnosis['best_result']['confidence']): + diagnosis['best_result'] = location.to_dict() + else: + diagnosis['failed_strategies'].append(strategy.name) + else: + diagnosis['failed_strategies'].append(strategy.name) + + except Exception as e: + strategy_info['error'] = str(e) + diagnosis['failed_strategies'].append(strategy.name) + + diagnosis['strategies_tested'].append(strategy_info) + + return diagnosis + + +# Global resolver instance for convenience +_resolver_instance: Optional[PositionResolver] = None + + +def get_position_resolver() -> PositionResolver: + """ + Get the global position resolver instance. + + Returns: + Global PositionResolver instance + """ + global _resolver_instance + if _resolver_instance is None: + _resolver_instance = PositionResolver() + return _resolver_instance + + +def resolve_position( + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None, + preferred_confidence: Optional[ConfidenceLevel] = None +) -> Optional[LocationInfo]: + """ + Convenience function to resolve a position using the global resolver. + + Args: + scip_symbol: SCIP symbol identifier + document: Document containing source text or SCIP data + context: Optional context information + preferred_confidence: Minimum confidence level required + + Returns: + LocationInfo with the best confidence available, or None if not found + """ + return get_position_resolver().resolve_position( + scip_symbol, document, context, preferred_confidence + ) \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/__init__.py b/src/code_index_mcp/tools/scip/position/strategies/__init__.py new file mode 100644 index 0000000..9d63180 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/strategies/__init__.py @@ -0,0 +1,18 @@ +""" +Position detection strategies. + +This package provides different strategies for detecting symbol positions +with varying levels of confidence and accuracy. +""" + +from .base import PositionStrategy +from .scip_occurrence import SCIPOccurrenceStrategy +from .tree_sitter_strategy import TreeSitterStrategy +from .heuristic import HeuristicStrategy + +__all__ = [ + 'PositionStrategy', + 'SCIPOccurrenceStrategy', + 'TreeSitterStrategy', + 'HeuristicStrategy' +] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/base.py b/src/code_index_mcp/tools/scip/position/strategies/base.py new file mode 100644 index 0000000..c8959c1 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/strategies/base.py @@ -0,0 +1,185 @@ +""" +Base position detection strategy. + +This module provides the abstract base class for all position detection strategies. +""" + +import logging +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any +from ..confidence import LocationInfo, ConfidenceLevel + +logger = logging.getLogger(__name__) + + +class PositionStrategy(ABC): + """ + Abstract base class for position detection strategies. + + Each strategy implements a different approach to detecting symbol positions + with varying levels of accuracy and confidence. + """ + + def __init__(self, name: str): + """ + Initialize the position strategy. + + Args: + name: Human-readable name for this strategy + """ + self.name = name + self._stats = { + 'attempts': 0, + 'successes': 0, + 'failures': 0 + } + + @abstractmethod + def try_resolve( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Optional[LocationInfo]: + """ + Attempt to resolve symbol position using this strategy. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document containing symbols and occurrences + context: Optional context information (symbol parser, etc.) + + Returns: + LocationInfo if position found, None otherwise + """ + pass + + @abstractmethod + def get_confidence_level(self) -> ConfidenceLevel: + """ + Return the confidence level this strategy typically provides. + + Returns: + ConfidenceLevel for this strategy's results + """ + pass + + def get_priority(self) -> int: + """ + Get priority for this strategy (higher = tried first). + + Returns: + Priority value (0-100, where 100 is highest priority) + """ + # Map confidence levels to priorities + confidence_priorities = { + ConfidenceLevel.HIGH: 90, + ConfidenceLevel.MEDIUM: 60, + ConfidenceLevel.LOW: 30, + ConfidenceLevel.UNKNOWN: 10 + } + return confidence_priorities.get(self.get_confidence_level(), 50) + + def can_handle_symbol(self, scip_symbol: str, document) -> bool: + """ + Check if this strategy can handle the given symbol. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document + + Returns: + True if strategy can attempt to resolve this symbol + """ + # Default implementation: can handle any symbol + return True + + def resolve( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Optional[LocationInfo]: + """ + Public method to resolve position with statistics tracking. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document + context: Optional context information + + Returns: + LocationInfo if position found, None otherwise + """ + self._stats['attempts'] += 1 + + try: + if not self.can_handle_symbol(scip_symbol, document): + self._stats['failures'] += 1 + return None + + result = self.try_resolve(scip_symbol, document, context) + + if result is not None: + self._stats['successes'] += 1 + # Ensure the result has proper metadata + if not result.metadata: + result.metadata = {} + result.metadata['strategy'] = self.name + result.metadata['strategy_confidence'] = self.get_confidence_level().value + + logger.debug(f"Strategy '{self.name}' resolved {scip_symbol} at {result.line}:{result.column}") + return result + else: + self._stats['failures'] += 1 + return None + + except Exception as e: + self._stats['failures'] += 1 + logger.debug(f"Strategy '{self.name}' failed for {scip_symbol}: {e}") + return None + + def get_success_rate(self) -> float: + """ + Get success rate for this strategy. + + Returns: + Success rate as a float between 0.0 and 1.0 + """ + if self._stats['attempts'] == 0: + return 0.0 + return self._stats['successes'] / self._stats['attempts'] + + def get_stats(self) -> Dict[str, Any]: + """ + Get statistics for this strategy. + + Returns: + Dictionary with strategy statistics + """ + return { + 'name': self.name, + 'confidence_level': self.get_confidence_level().value, + 'priority': self.get_priority(), + 'success_rate': self.get_success_rate(), + **self._stats + } + + def reset_stats(self) -> None: + """Reset strategy statistics.""" + self._stats = { + 'attempts': 0, + 'successes': 0, + 'failures': 0 + } + logger.debug(f"Reset statistics for strategy '{self.name}'") + + def __str__(self) -> str: + """String representation of the strategy.""" + return f"{self.__class__.__name__}(name='{self.name}', confidence={self.get_confidence_level().value})" + + def __repr__(self) -> str: + """Detailed string representation.""" + return (f"{self.__class__.__name__}(name='{self.name}', " + f"confidence={self.get_confidence_level().value}, " + f"success_rate={self.get_success_rate():.2f})") \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/heuristic.py b/src/code_index_mcp/tools/scip/position/strategies/heuristic.py new file mode 100644 index 0000000..2449e21 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/strategies/heuristic.py @@ -0,0 +1,568 @@ +""" +Heuristic-based position detection strategy. + +This strategy uses heuristic analysis and pattern matching to find symbol +positions with low confidence as a fallback when other methods fail. +""" + +import logging +import re +from typing import Optional, Dict, Any, List, Tuple +from .base import PositionStrategy +from ..confidence import LocationInfo, ConfidenceLevel + +logger = logging.getLogger(__name__) + + +class HeuristicStrategy(PositionStrategy): + """ + Heuristic-based position detection strategy. + + This strategy provides low confidence position detection using + pattern matching, text search, and educated guesses when more + reliable methods are not available. + """ + + def __init__(self): + """Initialize the heuristic strategy.""" + super().__init__("heuristic") + self._common_patterns = self._build_common_patterns() + + def _build_common_patterns(self) -> Dict[str, List[Dict[str, Any]]]: + """Build common symbol detection patterns across languages.""" + return { + 'function_patterns': [ + { + 'pattern': r'\bdef\s+{name}\s*\(', + 'language': 'python', + 'confidence_boost': 0.8, + 'description': 'Python function definition' + }, + { + 'pattern': r'\bfunction\s+{name}\s*\(', + 'language': 'javascript', + 'confidence_boost': 0.8, + 'description': 'JavaScript function declaration' + }, + { + 'pattern': r'\bfn\s+{name}\s*\(', + 'language': 'zig', + 'confidence_boost': 0.8, + 'description': 'Zig function definition' + }, + { + 'pattern': r'\b{name}\s*=\s*function', + 'language': 'javascript', + 'confidence_boost': 0.7, + 'description': 'JavaScript function expression' + }, + { + 'pattern': r'\b{name}\s*=\s*\([^)]*\)\s*=>', + 'language': 'javascript', + 'confidence_boost': 0.7, + 'description': 'JavaScript arrow function' + } + ], + 'class_patterns': [ + { + 'pattern': r'\bclass\s+{name}\s*[:({{]', + 'language': 'python', + 'confidence_boost': 0.9, + 'description': 'Python class definition' + }, + { + 'pattern': r'\bclass\s+{name}\s*\{{', + 'language': 'javascript', + 'confidence_boost': 0.9, + 'description': 'JavaScript class declaration' + }, + { + 'pattern': r'\b@interface\s+{name}\s*[:(]', + 'language': 'objective-c', + 'confidence_boost': 0.9, + 'description': 'Objective-C interface declaration' + } + ], + 'variable_patterns': [ + { + 'pattern': r'\b{name}\s*=', + 'language': 'general', + 'confidence_boost': 0.5, + 'description': 'Variable assignment' + }, + { + 'pattern': r'\bconst\s+{name}\s*=', + 'language': 'javascript', + 'confidence_boost': 0.7, + 'description': 'JavaScript const declaration' + }, + { + 'pattern': r'\blet\s+{name}\s*=', + 'language': 'javascript', + 'confidence_boost': 0.7, + 'description': 'JavaScript let declaration' + }, + { + 'pattern': r'\bvar\s+{name}\s*=', + 'language': 'javascript', + 'confidence_boost': 0.6, + 'description': 'JavaScript var declaration' + } + ], + 'import_patterns': [ + { + 'pattern': r'\bfrom\s+\S+\s+import\s+.*{name}', + 'language': 'python', + 'confidence_boost': 0.6, + 'description': 'Python import statement' + }, + { + 'pattern': r'\bimport\s+.*{name}', + 'language': 'python', + 'confidence_boost': 0.6, + 'description': 'Python import statement' + }, + { + 'pattern': r'\bimport\s+\{{.*{name}.*\}}', + 'language': 'javascript', + 'confidence_boost': 0.6, + 'description': 'JavaScript named import' + } + ] + } + + def get_confidence_level(self) -> ConfidenceLevel: + """Heuristic analysis provides low confidence positions.""" + return ConfidenceLevel.LOW + + def can_handle_symbol(self, scip_symbol: str, document) -> bool: + """ + Check if we can attempt heuristic analysis for this symbol. + + Args: + scip_symbol: SCIP symbol identifier + document: Document context + + Returns: + Always True as this is the fallback strategy + """ + # Heuristic strategy can always attempt to find a symbol + return True + + def try_resolve( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Optional[LocationInfo]: + """ + Try to resolve position using heuristic analysis. + + Args: + scip_symbol: SCIP symbol identifier + document: Document containing source text or metadata + context: Optional context information + + Returns: + LocationInfo with low confidence if found, None otherwise + """ + # Get source text + source_text = self._get_source_text(document, context) + if not source_text: + return None + + # Parse symbol information + symbol_info = self._parse_symbol(scip_symbol) + if not symbol_info: + return None + + # Try different heuristic approaches in order of confidence + strategies = [ + self._find_by_definition_patterns, + self._find_by_usage_patterns, + self._find_by_text_search, + self._find_by_line_estimation + ] + + best_location = None + best_confidence_score = 0.0 + + for strategy_func in strategies: + try: + location = strategy_func(source_text, symbol_info, context) + if location: + confidence_score = location.metadata.get('confidence_score', 0.0) + if confidence_score > best_confidence_score: + best_location = location + best_confidence_score = confidence_score + except Exception as e: + logger.debug(f"Heuristic strategy failed: {strategy_func.__name__}: {e}") + + return best_location + + def _get_source_text(self, document, context: Optional[Dict[str, Any]]) -> Optional[str]: + """Extract source text from document or context.""" + # Try context first + if context: + if 'source_text' in context: + return context['source_text'] + if 'file_content' in context: + return context['file_content'] + + # Try document + if hasattr(document, 'text') and document.text: + return document.text + if hasattr(document, 'content') and document.content: + return document.content + + # Try reading from file path + if context and 'file_path' in context: + try: + with open(context['file_path'], 'r', encoding='utf-8') as f: + return f.read() + except (OSError, UnicodeDecodeError) as e: + logger.debug(f"Failed to read source file: {e}") + + return None + + def _parse_symbol(self, scip_symbol: str) -> Optional[Dict[str, Any]]: + """Parse SCIP symbol to extract useful information.""" + try: + info = { + 'original': scip_symbol, + 'name': None, + 'type': 'unknown', + 'scope': [], + 'language': None + } + + # Extract from SCIP symbol format + if scip_symbol.startswith('local '): + local_part = scip_symbol[6:] + + # Remove descriptor suffix + if local_part.endswith('.'): + local_part = local_part[:-1] + + # Parse different symbol types + if '(' in local_part: + # Function-like symbol + base_name = local_part.split('(')[0] + info['type'] = 'function' + elif local_part.count('.') > 0: + # Nested symbol (method, attribute, etc.) + parts = local_part.split('.') + base_name = parts[-1] + info['scope'] = parts[:-1] + info['type'] = 'method' if len(parts) > 1 else 'attribute' + else: + # Simple identifier + base_name = local_part + info['type'] = 'identifier' + + # Clean up name + if '/' in base_name: + info['name'] = base_name.split('/')[-1] + else: + info['name'] = base_name + + # Try to infer language + info['language'] = self._infer_language(scip_symbol) + + return info + + except Exception as e: + logger.debug(f"Failed to parse symbol {scip_symbol}: {e}") + + return None + + def _infer_language(self, scip_symbol: str) -> Optional[str]: + """Infer programming language from SCIP symbol.""" + symbol_lower = scip_symbol.lower() + + if '.py' in symbol_lower or 'python' in symbol_lower: + return 'python' + elif '.js' in symbol_lower or '.ts' in symbol_lower or 'javascript' in symbol_lower: + return 'javascript' + elif '.zig' in symbol_lower: + return 'zig' + elif '.java' in symbol_lower: + return 'java' + elif '.m' in symbol_lower or '.mm' in symbol_lower or 'objc' in symbol_lower: + return 'objective-c' + elif '.go' in symbol_lower: + return 'go' + elif '.rs' in symbol_lower: + return 'rust' + + return None + + def _find_by_definition_patterns( + self, + source_text: str, + symbol_info: Dict[str, Any], + context: Optional[Dict[str, Any]] + ) -> Optional[LocationInfo]: + """Find symbol using definition patterns.""" + symbol_name = symbol_info['name'] + symbol_type = symbol_info['type'] + language = symbol_info['language'] + + if not symbol_name: + return None + + # Get relevant patterns based on symbol type + pattern_groups = [] + if symbol_type == 'function': + pattern_groups.append(self._common_patterns['function_patterns']) + elif symbol_type in ['class', 'identifier']: + pattern_groups.append(self._common_patterns['class_patterns']) + pattern_groups.append(self._common_patterns['variable_patterns']) + else: + pattern_groups.append(self._common_patterns['variable_patterns']) + + best_match = None + best_confidence = 0.0 + + for patterns in pattern_groups: + for pattern_info in patterns: + # Filter by language if known + if language and pattern_info['language'] != 'general' and pattern_info['language'] != language: + continue + + # Format pattern with symbol name + pattern = pattern_info['pattern'].format(name=re.escape(symbol_name)) + + match = re.search(pattern, source_text, re.MULTILINE | re.IGNORECASE) + if match: + confidence = pattern_info['confidence_boost'] + if confidence > best_confidence: + best_confidence = confidence + best_match = (match, pattern_info) + + if best_match: + match, pattern_info = best_match + line_num = source_text[:match.start()].count('\n') + 1 + line_start = source_text.rfind('\n', 0, match.start()) + 1 + column_num = match.start() - line_start + 1 + + return LocationInfo.from_heuristic( + line=line_num, + column=column_num, + heuristic_type="definition_pattern", + method=f"heuristic_pattern_{pattern_info['language']}" + ) + + return None + + def _find_by_usage_patterns( + self, + source_text: str, + symbol_info: Dict[str, Any], + context: Optional[Dict[str, Any]] + ) -> Optional[LocationInfo]: + """Find symbol by looking for usage patterns.""" + symbol_name = symbol_info['name'] + + if not symbol_name: + return None + + # Look for the symbol in import statements first + import_patterns = self._common_patterns['import_patterns'] + + for pattern_info in import_patterns: + pattern = pattern_info['pattern'].format(name=re.escape(symbol_name)) + match = re.search(pattern, source_text, re.MULTILINE) + + if match: + line_num = source_text[:match.start()].count('\n') + 1 + line_start = source_text.rfind('\n', 0, match.start()) + 1 + column_num = match.start() - line_start + 1 + + metadata = { + 'confidence_score': 0.6, + 'usage_type': 'import', + 'pattern_description': pattern_info['description'] + } + + location = LocationInfo.from_heuristic( + line=line_num, + column=column_num, + heuristic_type="usage_pattern", + method="heuristic_import" + ) + location.metadata.update(metadata) + return location + + return None + + def _find_by_text_search( + self, + source_text: str, + symbol_info: Dict[str, Any], + context: Optional[Dict[str, Any]] + ) -> Optional[LocationInfo]: + """Find symbol using simple text search.""" + symbol_name = symbol_info['name'] + + if not symbol_name or len(symbol_name) < 2: + return None + + # Look for word boundary matches + pattern = rf'\b{re.escape(symbol_name)}\b' + matches = list(re.finditer(pattern, source_text)) + + if matches: + # Use the first match (usually the definition) + match = matches[0] + line_num = source_text[:match.start()].count('\n') + 1 + line_start = source_text.rfind('\n', 0, match.start()) + 1 + column_num = match.start() - line_start + 1 + + metadata = { + 'confidence_score': 0.3, + 'total_matches': len(matches), + 'search_method': 'text_search' + } + + location = LocationInfo.from_heuristic( + line=line_num, + column=column_num, + heuristic_type="text_search", + method="heuristic_text_search" + ) + location.metadata.update(metadata) + return location + + return None + + def _find_by_line_estimation( + self, + source_text: str, + symbol_info: Dict[str, Any], + context: Optional[Dict[str, Any]] + ) -> Optional[LocationInfo]: + """Estimate position based on file structure and symbol type.""" + total_lines = source_text.count('\n') + 1 + + # Make educated guesses based on symbol type and common patterns + estimated_line = 1 + confidence_score = 0.1 + + symbol_type = symbol_info['type'] + + if symbol_type == 'function': + # Functions often appear in the middle of files + estimated_line = max(1, total_lines // 3) + confidence_score = 0.2 + elif symbol_type == 'class': + # Classes often appear early in files + estimated_line = max(1, total_lines // 4) + confidence_score = 0.15 + elif symbol_type == 'import': + # Imports usually at the top + estimated_line = min(10, total_lines // 10) + confidence_score = 0.25 + else: + # Default to somewhere in the first half + estimated_line = max(1, total_lines // 2) + + metadata = { + 'confidence_score': confidence_score, + 'estimation_method': 'line_estimation', + 'total_lines': total_lines, + 'symbol_type': symbol_type + } + + location = LocationInfo.from_heuristic( + line=estimated_line, + column=1, + heuristic_type="line_estimation", + method="heuristic_estimation" + ) + location.metadata.update(metadata) + return location + + def find_all_occurrences( + self, + symbol_name: str, + source_text: str, + context: Optional[Dict[str, Any]] = None + ) -> List[LocationInfo]: + """ + Find all occurrences of a symbol in source text. + + Args: + symbol_name: Name of the symbol to find + source_text: Source code text + context: Optional context information + + Returns: + List of LocationInfo objects for all occurrences + """ + occurrences = [] + + if not symbol_name or len(symbol_name) < 2: + return occurrences + + # Find all word boundary matches + pattern = rf'\b{re.escape(symbol_name)}\b' + matches = re.finditer(pattern, source_text) + + for i, match in enumerate(matches): + line_num = source_text[:match.start()].count('\n') + 1 + line_start = source_text.rfind('\n', 0, match.start()) + 1 + column_num = match.start() - line_start + 1 + + metadata = { + 'occurrence_index': i, + 'confidence_score': 0.3, + 'search_method': 'all_occurrences' + } + + location = LocationInfo.from_heuristic( + line=line_num, + column=column_num, + heuristic_type="occurrence", + method="heuristic_all_occurrences" + ) + location.metadata.update(metadata) + occurrences.append(location) + + return occurrences + + def get_heuristic_confidence( + self, + symbol_info: Dict[str, Any], + context: Optional[Dict[str, Any]] = None + ) -> float: + """ + Calculate heuristic confidence score for a symbol. + + Args: + symbol_info: Parsed symbol information + context: Optional context information + + Returns: + Confidence score between 0.0 and 1.0 + """ + base_confidence = 0.3 # Base confidence for heuristic methods + + # Boost confidence based on symbol characteristics + if symbol_info.get('type') == 'function': + base_confidence += 0.2 + elif symbol_info.get('type') == 'class': + base_confidence += 0.15 + + # Boost if we have language information + if symbol_info.get('language'): + base_confidence += 0.1 + + # Boost if symbol name is longer (less likely to be false positive) + name_length = len(symbol_info.get('name', '')) + if name_length > 5: + base_confidence += 0.1 + elif name_length > 10: + base_confidence += 0.15 + + return min(1.0, base_confidence) \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py b/src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py new file mode 100644 index 0000000..1d1c257 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py @@ -0,0 +1,236 @@ +""" +SCIP occurrence-based position detection strategy. + +This strategy uses SCIP occurrence data to find exact symbol positions +with high confidence. +""" + +import logging +from typing import Optional, Dict, Any +from .base import PositionStrategy +from ..confidence import LocationInfo, ConfidenceLevel + +logger = logging.getLogger(__name__) + +# Try to import SCIP protobuf definitions +try: + from ....scip.proto import scip_pb2 + SCIP_PROTO_AVAILABLE = True +except ImportError: + scip_pb2 = None + SCIP_PROTO_AVAILABLE = False + + +class SCIPOccurrenceStrategy(PositionStrategy): + """ + SCIP occurrence-based position detection strategy. + + This strategy provides the highest confidence position detection by + using SCIP occurrence data which contains exact position information + from the original indexing process. + """ + + def __init__(self): + """Initialize the SCIP occurrence strategy.""" + super().__init__("scip_occurrence") + + def get_confidence_level(self) -> ConfidenceLevel: + """SCIP occurrences provide high confidence positions.""" + return ConfidenceLevel.HIGH + + def can_handle_symbol(self, scip_symbol: str, document) -> bool: + """ + Check if document has occurrences for the symbol. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document + + Returns: + True if document has occurrences we can search + """ + return hasattr(document, 'occurrences') and document.occurrences + + def try_resolve( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Optional[LocationInfo]: + """ + Try to resolve position using SCIP occurrence data. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document containing occurrences + context: Optional context information + + Returns: + LocationInfo with high confidence if found, None otherwise + """ + # Strategy 1: Look for definition occurrence first (most reliable) + location = self._find_definition_occurrence(scip_symbol, document) + if location: + location.add_metadata('occurrence_type', 'definition') + return location + + # Strategy 2: Look for any occurrence with position data + location = self._find_any_occurrence(scip_symbol, document) + if location: + location.add_metadata('occurrence_type', 'reference') + return location + + # No occurrences found for this symbol + return None + + def _find_definition_occurrence(self, scip_symbol: str, document) -> Optional[LocationInfo]: + """ + Find the definition occurrence for a symbol. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document + + Returns: + LocationInfo if definition found, None otherwise + """ + for occurrence in document.occurrences: + if occurrence.symbol == scip_symbol and self._is_definition(occurrence): + location = self._parse_occurrence_location(occurrence) + if location: + location.add_metadata('is_definition', True) + return location + return None + + def _find_any_occurrence(self, scip_symbol: str, document) -> Optional[LocationInfo]: + """ + Find any occurrence with location data for a symbol. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document + + Returns: + LocationInfo if any occurrence found, None otherwise + """ + for occurrence in document.occurrences: + if occurrence.symbol == scip_symbol: + location = self._parse_occurrence_location(occurrence) + if location: + location.add_metadata('is_definition', self._is_definition(occurrence)) + location.add_metadata('symbol_roles', getattr(occurrence, 'symbol_roles', 0)) + return location + return None + + def _is_definition(self, occurrence) -> bool: + """ + Check if an occurrence represents a definition. + + Args: + occurrence: SCIP occurrence object + + Returns: + True if this occurrence is a definition + """ + if not hasattr(occurrence, 'symbol_roles'): + return False + + try: + if SCIP_PROTO_AVAILABLE: + return bool(occurrence.symbol_roles & scip_pb2.SymbolRole.Definition) + else: + # Fallback: Definition role = 1 + return bool(occurrence.symbol_roles & 1) + except (AttributeError, TypeError): + return False + + def _parse_occurrence_location(self, occurrence) -> Optional[LocationInfo]: + """ + Parse location information from SCIP occurrence. + + Args: + occurrence: SCIP occurrence object + + Returns: + LocationInfo if parsing successful, None otherwise + """ + try: + if not hasattr(occurrence, 'range') or not occurrence.range: + return None + + range_obj = occurrence.range + if not hasattr(range_obj, 'start') or not range_obj.start: + return None + + start = range_obj.start + if len(start) >= 2: + # SCIP uses 0-based indexing, convert to 1-based + line = start[0] + 1 + column = start[1] + 1 + + # Create LocationInfo with metadata + metadata = { + 'scip_range_available': True, + 'range_length': len(start), + 'raw_line': start[0], + 'raw_column': start[1] + } + + # Add end position if available + if hasattr(range_obj, 'end') and range_obj.end and len(range_obj.end) >= 2: + metadata.update({ + 'end_line': range_obj.end[0] + 1, + 'end_column': range_obj.end[1] + 1, + 'span_lines': range_obj.end[0] - start[0] + 1 + }) + + return LocationInfo( + line=line, + column=column, + confidence=ConfidenceLevel.HIGH, + method="scip_occurrence", + metadata=metadata + ) + + except (AttributeError, IndexError, TypeError) as e: + logger.debug(f"Error parsing occurrence location: {e}") + + return None + + def get_occurrence_info(self, scip_symbol: str, document) -> Dict[str, Any]: + """ + Get detailed information about occurrences for a symbol. + + Args: + scip_symbol: SCIP symbol identifier + document: SCIP document + + Returns: + Dictionary with occurrence statistics and information + """ + info = { + 'total_occurrences': 0, + 'definition_occurrences': 0, + 'reference_occurrences': 0, + 'occurrences_with_position': 0, + 'role_distribution': {} + } + + for occurrence in document.occurrences: + if occurrence.symbol == scip_symbol: + info['total_occurrences'] += 1 + + if self._is_definition(occurrence): + info['definition_occurrences'] += 1 + else: + info['reference_occurrences'] += 1 + + if self._parse_occurrence_location(occurrence): + info['occurrences_with_position'] += 1 + + # Track role distribution + roles = getattr(occurrence, 'symbol_roles', 0) + role_key = str(roles) + info['role_distribution'][role_key] = info['role_distribution'].get(role_key, 0) + 1 + + return info \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py b/src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py new file mode 100644 index 0000000..8db6fd8 --- /dev/null +++ b/src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py @@ -0,0 +1,523 @@ +""" +Tree-sitter AST-based position detection strategy. + +This strategy uses Tree-sitter AST analysis to find symbol positions +with medium confidence by parsing source code. +""" + +import logging +import re +from typing import Optional, Dict, Any, List, Tuple +from .base import PositionStrategy +from ..confidence import LocationInfo, ConfidenceLevel + +logger = logging.getLogger(__name__) + +# Try to import tree-sitter +try: + import tree_sitter + from tree_sitter import Language, Parser + TREE_SITTER_AVAILABLE = True +except ImportError: + tree_sitter = None + Language = None + Parser = None + TREE_SITTER_AVAILABLE = False + + +class TreeSitterStrategy(PositionStrategy): + """ + Tree-sitter AST-based position detection strategy. + + This strategy provides medium confidence position detection by + parsing source code with Tree-sitter and analyzing the AST structure + to find symbol definitions and references. + """ + + def __init__(self): + """Initialize the Tree-sitter strategy.""" + super().__init__("tree_sitter") + self._parsers: Dict[str, Parser] = {} + self._languages: Dict[str, Language] = {} + self._setup_parsers() + + def _setup_parsers(self) -> None: + """Setup Tree-sitter parsers for supported languages.""" + if not TREE_SITTER_AVAILABLE: + logger.debug("Tree-sitter not available, TreeSitterStrategy will have limited functionality") + return + + # Language configurations with their Tree-sitter names + language_configs = { + 'python': 'python', + 'javascript': 'javascript', + 'typescript': 'typescript', + 'zig': 'zig', + 'java': 'java', + 'objective-c': 'objc', + 'c': 'c', + 'cpp': 'cpp', + 'go': 'go', + 'rust': 'rust', + } + + for lang_name, ts_name in language_configs.items(): + try: + # This would typically load pre-compiled language libraries + # For now, we'll just track which languages we support + self._languages[lang_name] = ts_name + logger.debug(f"Configured Tree-sitter support for {lang_name}") + except Exception as e: + logger.debug(f"Failed to setup Tree-sitter for {lang_name}: {e}") + + def get_confidence_level(self) -> ConfidenceLevel: + """Tree-sitter AST analysis provides medium confidence positions.""" + return ConfidenceLevel.MEDIUM + + def can_handle_symbol(self, scip_symbol: str, document) -> bool: + """ + Check if we can handle this symbol with Tree-sitter analysis. + + Args: + scip_symbol: SCIP symbol identifier + document: Document context (may contain language info) + + Returns: + True if Tree-sitter is available and language is supported + """ + if not TREE_SITTER_AVAILABLE: + return False + + # Try to detect language from symbol or document + language = self._detect_language(scip_symbol, document) + return language is not None and language in self._languages + + def try_resolve( + self, + scip_symbol: str, + document, + context: Optional[Dict[str, Any]] = None + ) -> Optional[LocationInfo]: + """ + Try to resolve position using Tree-sitter AST analysis. + + Args: + scip_symbol: SCIP symbol identifier + document: Document containing source text + context: Optional context information + + Returns: + LocationInfo with medium confidence if found, None otherwise + """ + if not TREE_SITTER_AVAILABLE: + return None + + # Get source text from document or context + source_text = self._get_source_text(document, context) + if not source_text: + return None + + # Detect language + language = self._detect_language(scip_symbol, document) + if not language or language not in self._languages: + return None + + # Parse symbol to extract name and type + symbol_info = self._parse_scip_symbol(scip_symbol) + if not symbol_info: + return None + + # Try different AST-based search strategies + location = self._find_by_ast_analysis(source_text, language, symbol_info) + if location: + location.add_metadata('ast_analysis', True) + location.add_metadata('language', language) + return location + + # Fallback to pattern matching with AST guidance + location = self._find_by_pattern_with_ast(source_text, language, symbol_info) + if location: + location.add_metadata('pattern_with_ast', True) + location.add_metadata('language', language) + return location + + return None + + def _get_source_text(self, document, context: Optional[Dict[str, Any]]) -> Optional[str]: + """ + Extract source text from document or context. + + Args: + document: Document object + context: Optional context information + + Returns: + Source text or None if not available + """ + # Try to get from context first + if context and 'source_text' in context: + return context['source_text'] + + # Try to get from document + if hasattr(document, 'text') and document.text: + return document.text + + if hasattr(document, 'content') and document.content: + return document.content + + # Try file path in context + if context and 'file_path' in context: + try: + with open(context['file_path'], 'r', encoding='utf-8') as f: + return f.read() + except (OSError, UnicodeDecodeError) as e: + logger.debug(f"Failed to read source file: {e}") + + return None + + def _detect_language(self, scip_symbol: str, document) -> Optional[str]: + """ + Detect programming language from symbol or document. + + Args: + scip_symbol: SCIP symbol identifier + document: Document context + + Returns: + Language name or None if not detected + """ + # Try to get from document first + if hasattr(document, 'language') and document.language: + return document.language.lower() + + # Infer from SCIP symbol patterns + if 'python' in scip_symbol or '.py' in scip_symbol: + return 'python' + elif 'javascript' in scip_symbol or '.js' in scip_symbol or 'npm' in scip_symbol: + return 'javascript' + elif 'typescript' in scip_symbol or '.ts' in scip_symbol: + return 'typescript' + elif '.zig' in scip_symbol or 'zig' in scip_symbol: + return 'zig' + elif '.java' in scip_symbol or 'java' in scip_symbol: + return 'java' + elif '.m' in scip_symbol or '.mm' in scip_symbol or 'objc' in scip_symbol: + return 'objective-c' + elif '.go' in scip_symbol: + return 'go' + elif '.rs' in scip_symbol or 'rust' in scip_symbol: + return 'rust' + + return None + + def _parse_scip_symbol(self, scip_symbol: str) -> Optional[Dict[str, Any]]: + """ + Parse SCIP symbol to extract meaningful information. + + Args: + scip_symbol: SCIP symbol identifier + + Returns: + Dictionary with symbol information or None if parsing failed + """ + try: + # Basic SCIP symbol format: "local ." + if scip_symbol.startswith('local '): + local_part = scip_symbol[6:] # Remove "local " + + # Split into local-id and descriptor + if '(' in local_part: + # Function-like symbol + name_part = local_part.split('(')[0] + symbol_type = 'function' + elif '.' in local_part: + # Method or attribute + parts = local_part.split('.') + name_part = parts[-2] if len(parts) > 1 else parts[0] + symbol_type = 'method' if len(parts) > 2 else 'attribute' + else: + # Simple identifier + name_part = local_part.rstrip('.') + symbol_type = 'identifier' + + # Extract base name + if '/' in name_part: + base_name = name_part.split('/')[-1] + else: + base_name = name_part + + return { + 'name': base_name, + 'full_name': name_part, + 'type': symbol_type, + 'scip_symbol': scip_symbol + } + + except (IndexError, AttributeError) as e: + logger.debug(f"Failed to parse SCIP symbol {scip_symbol}: {e}") + + return None + + def _find_by_ast_analysis( + self, + source_text: str, + language: str, + symbol_info: Dict[str, Any] + ) -> Optional[LocationInfo]: + """ + Find symbol position using full AST analysis. + + Args: + source_text: Source code text + language: Programming language + symbol_info: Parsed symbol information + + Returns: + LocationInfo if found, None otherwise + """ + # This would typically involve: + # 1. Parse source code with Tree-sitter + # 2. Traverse AST to find matching symbol definitions + # 3. Extract precise position information + + # For now, we'll simulate this with pattern matching + # In a real implementation, this would use tree-sitter parsing + + symbol_name = symbol_info['name'] + symbol_type = symbol_info['type'] + + # Language-specific AST-guided patterns + patterns = self._get_ast_patterns(language, symbol_type, symbol_name) + + for pattern_info in patterns: + match = re.search(pattern_info['pattern'], source_text, re.MULTILINE) + if match: + line_num = source_text[:match.start()].count('\n') + 1 + line_start = source_text.rfind('\n', 0, match.start()) + 1 + column_num = match.start() - line_start + 1 + + metadata = { + 'pattern_type': pattern_info['type'], + 'confidence_reason': pattern_info['reason'], + 'match_text': match.group()[:50], # Truncate long matches + 'ast_guided': True + } + + return LocationInfo.from_tree_sitter( + line=line_num, + column=column_num, + node_info={ + 'type': pattern_info['type'], + 'text': match.group(), + 'start_byte': match.start(), + 'end_byte': match.end() + }, + method="tree_sitter_ast" + ) + + return None + + def _find_by_pattern_with_ast( + self, + source_text: str, + language: str, + symbol_info: Dict[str, Any] + ) -> Optional[LocationInfo]: + """ + Find symbol position using pattern matching with AST guidance. + + Args: + source_text: Source code text + language: Programming language + symbol_info: Parsed symbol information + + Returns: + LocationInfo if found, None otherwise + """ + symbol_name = symbol_info['name'] + + # Simple pattern matching as fallback + # This would be enhanced with AST context in a full implementation + + # Look for function definitions, class definitions, etc. + basic_patterns = [ + rf'\bdef\s+{re.escape(symbol_name)}\s*\(', # Python function + rf'\bclass\s+{re.escape(symbol_name)}\s*[:(]', # Python class + rf'\bfunction\s+{re.escape(symbol_name)}\s*\(', # JavaScript function + rf'\b{re.escape(symbol_name)}\s*=\s*function', # JS function assignment + rf'\bconst\s+{re.escape(symbol_name)}\s*=', # JS/TS const + rf'\blet\s+{re.escape(symbol_name)}\s*=', # JS/TS let + rf'\bvar\s+{re.escape(symbol_name)}\s*=', # JS var + ] + + for pattern in basic_patterns: + match = re.search(pattern, source_text, re.MULTILINE | re.IGNORECASE) + if match: + line_num = source_text[:match.start()].count('\n') + 1 + line_start = source_text.rfind('\n', 0, match.start()) + 1 + column_num = match.start() - line_start + 1 + + metadata = { + 'pattern_match': True, + 'match_text': match.group()[:50], + 'fallback_pattern': True + } + + return LocationInfo.from_tree_sitter( + line=line_num, + column=column_num, + node_info={ + 'text': match.group(), + 'start_byte': match.start(), + 'end_byte': match.end() + }, + method="tree_sitter_pattern" + ) + + return None + + def _get_ast_patterns(self, language: str, symbol_type: str, symbol_name: str) -> List[Dict[str, Any]]: + """ + Get AST-guided patterns for symbol detection. + + Args: + language: Programming language + symbol_type: Type of symbol (function, class, etc.) + symbol_name: Name of the symbol + + Returns: + List of pattern information dictionaries + """ + escaped_name = re.escape(symbol_name) + patterns = [] + + if language == 'python': + if symbol_type == 'function': + patterns.extend([ + { + 'pattern': rf'^\s*def\s+{escaped_name}\s*\(', + 'type': 'function_definition', + 'reason': 'Python function definition pattern' + }, + { + 'pattern': rf'^\s*async\s+def\s+{escaped_name}\s*\(', + 'type': 'async_function_definition', + 'reason': 'Python async function definition pattern' + } + ]) + elif symbol_type in ['class', 'identifier']: + patterns.append({ + 'pattern': rf'^\s*class\s+{escaped_name}\s*[:(]', + 'type': 'class_definition', + 'reason': 'Python class definition pattern' + }) + + elif language in ['javascript', 'typescript']: + if symbol_type == 'function': + patterns.extend([ + { + 'pattern': rf'\bfunction\s+{escaped_name}\s*\(', + 'type': 'function_declaration', + 'reason': 'JavaScript function declaration' + }, + { + 'pattern': rf'\b{escaped_name}\s*=\s*function', + 'type': 'function_expression', + 'reason': 'JavaScript function expression' + }, + { + 'pattern': rf'\b{escaped_name}\s*=\s*\([^)]*\)\s*=>', + 'type': 'arrow_function', + 'reason': 'JavaScript arrow function' + } + ]) + elif symbol_type in ['class', 'identifier']: + patterns.append({ + 'pattern': rf'\bclass\s+{escaped_name}\s*\{{', + 'type': 'class_declaration', + 'reason': 'JavaScript class declaration' + }) + + elif language == 'zig': + patterns.extend([ + { + 'pattern': rf'\bfn\s+{escaped_name}\s*\(', + 'type': 'function_definition', + 'reason': 'Zig function definition' + }, + { + 'pattern': rf'\bconst\s+{escaped_name}\s*=', + 'type': 'const_declaration', + 'reason': 'Zig constant declaration' + } + ]) + + elif language == 'java': + patterns.extend([ + { + 'pattern': rf'\b(public|private|protected)?\s*(static)?\s*\w+\s+{escaped_name}\s*\(', + 'type': 'method_definition', + 'reason': 'Java method definition' + }, + { + 'pattern': rf'\b(public|private|protected)?\s*class\s+{escaped_name}\s*\{{', + 'type': 'class_definition', + 'reason': 'Java class definition' + } + ]) + + return patterns + + def get_supported_languages(self) -> List[str]: + """ + Get list of languages supported by this strategy. + + Returns: + List of supported language names + """ + return list(self._languages.keys()) + + def get_ast_info( + self, + source_text: str, + language: str, + symbol_name: str + ) -> Dict[str, Any]: + """ + Get detailed AST information for a symbol. + + Args: + source_text: Source code text + language: Programming language + symbol_name: Name of the symbol to analyze + + Returns: + Dictionary with AST analysis information + """ + info = { + 'language': language, + 'symbol_name': symbol_name, + 'tree_sitter_available': TREE_SITTER_AVAILABLE, + 'language_supported': language in self._languages, + 'patterns_found': [], + 'potential_matches': 0 + } + + if language in self._languages: + # Get all potential patterns for this symbol + symbol_info = {'name': symbol_name, 'type': 'identifier'} + patterns = self._get_ast_patterns(language, 'identifier', symbol_name) + + for pattern_info in patterns: + matches = re.finditer(pattern_info['pattern'], source_text, re.MULTILINE) + for match in matches: + line_num = source_text[:match.start()].count('\n') + 1 + info['patterns_found'].append({ + 'type': pattern_info['type'], + 'line': line_num, + 'text': match.group()[:50], + 'reason': pattern_info['reason'] + }) + info['potential_matches'] += 1 + + return info \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/scip_index_tool.py b/src/code_index_mcp/tools/scip/scip_index_tool.py index e3620f7..f9e0c9c 100644 --- a/src/code_index_mcp/tools/scip/scip_index_tool.py +++ b/src/code_index_mcp/tools/scip/scip_index_tool.py @@ -45,7 +45,7 @@ def is_index_available(self) -> bool: def build_index(self, project_path: str) -> int: """ Build SCIP index for the specified project path. - + This is a pure technical operation that unconditionally rebuilds the index. Business logic for deciding when to rebuild should be handled by the caller. @@ -67,11 +67,11 @@ def build_index(self, project_path: str) -> int: try: logger.info(f"Building index for {project_path}") self._project_path = project_path - + # Initialize settings for this project from ...project_settings import ProjectSettings self._settings = ProjectSettings(project_path, skip_load=False) - + self._scip_index = self._builder.build_scip_index(project_path) logger.info(f"Built index with {len(self._scip_index.documents)} files") @@ -83,7 +83,7 @@ def build_index(self, project_path: str) -> int: def save_index(self) -> bool: """ Save the current SCIP index to disk. - + This is a pure technical operation that saves the current in-memory index. Returns: @@ -93,11 +93,11 @@ def save_index(self) -> bool: if self._settings is None: logger.error("No settings available, cannot save index") return False - + if self._scip_index is None: logger.error("No index available to save") return False - + self.save_current_index() logger.info("Index saved successfully") return True From f1b56e5704737f1160646948ca1347572bfced05 Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Thu, 21 Aug 2025 16:19:14 +0800 Subject: [PATCH 5/8] Enhance project management and relationship extraction in SCIP index - Implemented index rebuild check and cleanup of legacy files in ProjectManagementService. - Updated ProjectConfigTool to check for the latest index version and clean up legacy files. - Refactored relationship handling in SCIPRelationshipReader to improve extraction from both symbol relationships and occurrences. - Added cross-document analysis capabilities for called_by relationships. - Simplified relationship data structures and improved deduplication logic. - Removed unnecessary SCIPRelationshipReader dependency from SCIPSymbolAnalyzer, directly utilizing SCIP index for relationship extraction. - Streamlined output format in FileAnalysis for efficiency. --- benchmark_scip_framework.py | 1017 +++++++++++++++ src/code_index_mcp/indexing/scip_builder.py | 369 ++---- src/code_index_mcp/project_settings.py | 160 +-- src/code_index_mcp/scip/__init__.py | 8 +- src/code_index_mcp/scip/factory.py | 200 --- src/code_index_mcp/scip/framework/__init__.py | 157 +++ .../scip/framework/base/__init__.py | 13 + .../scip/framework/base/enum_mapper.py | 38 + .../scip/framework/base/index_factory.py | 206 ++++ .../scip/framework/base/language_analyzer.py | 77 ++ .../framework/base/relationship_extractor.py | 41 + .../scip/framework/caching_system.py | 346 ++++++ .../scip/framework/compliance_validator.py | 319 +++++ .../scip/framework/fallback/__init__.py | 14 + .../scip/framework/fallback/basic_analyzer.py | 156 +++ .../scip/framework/fallback/enum_mapper.py | 102 ++ .../scip/framework/fallback/factory.py | 153 +++ .../fallback/relationship_extractor.py | 85 ++ .../scip/framework/index_factory.py | 337 +++++ .../scip/framework/java/__init__.py | 14 + .../scip/framework/java/enum_mapper.py | 200 +++ .../scip/framework/java/factory.py | 399 ++++++ .../framework/java/relationship_extractor.py | 295 +++++ .../framework/java/tree_sitter_analyzer.py | 327 +++++ .../scip/framework/javascript/__init__.py | 14 + .../scip/framework/javascript/enum_mapper.py | 237 ++++ .../scip/framework/javascript/factory.py | 376 ++++++ .../javascript/relationship_extractor.py | 281 +++++ .../framework/javascript/syntax_analyzer.py | 418 +++++++ .../scip/framework/objective_c/__init__.py | 14 + .../framework/objective_c/clang_analyzer.py | 338 +++++ .../scip/framework/objective_c/enum_mapper.py | 228 ++++ .../scip/framework/objective_c/factory.py | 500 ++++++++ .../objective_c/relationship_extractor.py | 276 +++++ .../scip/framework/position_calculator.py | 225 ++++ .../scip/framework/python/__init__.py | 14 + .../scip/framework/python/ast_analyzer.py | 312 +++++ .../scip/framework/python/enum_mapper.py | 181 +++ .../scip/framework/python/factory.py | 583 +++++++++ .../python/relationship_extractor.py | 205 ++++ .../scip/framework/relationship_manager.py | 406 ++++++ .../scip/framework/standard_framework.py | 354 ++++++ .../scip/framework/streaming_indexer.py | 429 +++++++ .../scip/framework/symbol_generator.py | 144 +++ src/code_index_mcp/scip/framework/types.py | 79 ++ .../scip/framework/unified_api.py | 456 +++++++ .../scip/framework/zig/__init__.py | 14 + .../scip/framework/zig/enum_mapper.py | 217 ++++ .../scip/framework/zig/factory.py | 388 ++++++ .../framework/zig/relationship_extractor.py | 322 +++++ .../framework/zig/tree_sitter_analyzer.py | 357 ++++++ src/code_index_mcp/scip/language_manager.py | 522 ++++++++ .../scip/strategies/__init__.py | 5 - .../scip/strategies/base_strategy.py | 432 ------- .../scip/strategies/fallback_strategy.py | 193 --- .../scip/strategies/java_strategy.py | 624 ---------- .../scip/strategies/javascript_strategy.py | 974 --------------- .../scip/strategies/objective_c_strategy.py | 1083 ---------------- .../scip/strategies/python_strategy.py | 413 ------- .../scip/strategies/zig_strategy.py | 1086 ----------------- .../services/project_management_service.py | 16 + .../tools/config/project_config_tool.py | 17 +- .../tools/scip/relationship_info.py | 499 ++++++-- .../tools/scip/scip_symbol_analyzer.py | 145 ++- .../tools/scip/symbol_definitions.py | 5 +- 65 files changed, 12884 insertions(+), 5531 deletions(-) create mode 100644 benchmark_scip_framework.py delete mode 100644 src/code_index_mcp/scip/factory.py create mode 100644 src/code_index_mcp/scip/framework/__init__.py create mode 100644 src/code_index_mcp/scip/framework/base/__init__.py create mode 100644 src/code_index_mcp/scip/framework/base/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/base/index_factory.py create mode 100644 src/code_index_mcp/scip/framework/base/language_analyzer.py create mode 100644 src/code_index_mcp/scip/framework/base/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/caching_system.py create mode 100644 src/code_index_mcp/scip/framework/compliance_validator.py create mode 100644 src/code_index_mcp/scip/framework/fallback/__init__.py create mode 100644 src/code_index_mcp/scip/framework/fallback/basic_analyzer.py create mode 100644 src/code_index_mcp/scip/framework/fallback/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/fallback/factory.py create mode 100644 src/code_index_mcp/scip/framework/fallback/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/index_factory.py create mode 100644 src/code_index_mcp/scip/framework/java/__init__.py create mode 100644 src/code_index_mcp/scip/framework/java/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/java/factory.py create mode 100644 src/code_index_mcp/scip/framework/java/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py create mode 100644 src/code_index_mcp/scip/framework/javascript/__init__.py create mode 100644 src/code_index_mcp/scip/framework/javascript/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/javascript/factory.py create mode 100644 src/code_index_mcp/scip/framework/javascript/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py create mode 100644 src/code_index_mcp/scip/framework/objective_c/__init__.py create mode 100644 src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py create mode 100644 src/code_index_mcp/scip/framework/objective_c/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/objective_c/factory.py create mode 100644 src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/position_calculator.py create mode 100644 src/code_index_mcp/scip/framework/python/__init__.py create mode 100644 src/code_index_mcp/scip/framework/python/ast_analyzer.py create mode 100644 src/code_index_mcp/scip/framework/python/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/python/factory.py create mode 100644 src/code_index_mcp/scip/framework/python/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/relationship_manager.py create mode 100644 src/code_index_mcp/scip/framework/standard_framework.py create mode 100644 src/code_index_mcp/scip/framework/streaming_indexer.py create mode 100644 src/code_index_mcp/scip/framework/symbol_generator.py create mode 100644 src/code_index_mcp/scip/framework/types.py create mode 100644 src/code_index_mcp/scip/framework/unified_api.py create mode 100644 src/code_index_mcp/scip/framework/zig/__init__.py create mode 100644 src/code_index_mcp/scip/framework/zig/enum_mapper.py create mode 100644 src/code_index_mcp/scip/framework/zig/factory.py create mode 100644 src/code_index_mcp/scip/framework/zig/relationship_extractor.py create mode 100644 src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py create mode 100644 src/code_index_mcp/scip/language_manager.py delete mode 100644 src/code_index_mcp/scip/strategies/__init__.py delete mode 100644 src/code_index_mcp/scip/strategies/base_strategy.py delete mode 100644 src/code_index_mcp/scip/strategies/fallback_strategy.py delete mode 100644 src/code_index_mcp/scip/strategies/java_strategy.py delete mode 100644 src/code_index_mcp/scip/strategies/javascript_strategy.py delete mode 100644 src/code_index_mcp/scip/strategies/objective_c_strategy.py delete mode 100644 src/code_index_mcp/scip/strategies/python_strategy.py delete mode 100644 src/code_index_mcp/scip/strategies/zig_strategy.py diff --git a/benchmark_scip_framework.py b/benchmark_scip_framework.py new file mode 100644 index 0000000..88d05f5 --- /dev/null +++ b/benchmark_scip_framework.py @@ -0,0 +1,1017 @@ +"""SCIP Framework Performance Benchmark Suite - Comprehensive performance testing and analysis.""" + +import os +import time +import tempfile +import statistics +import gc +import psutil +import threading +from pathlib import Path +from typing import Dict, List, Any, Tuple, Optional +from dataclasses import dataclass, asdict +from concurrent.futures import ThreadPoolExecutor, as_completed + +from src.code_index_mcp.scip.framework import ( + SCIPFrameworkAPI, SCIPConfig, create_scip_framework, + PythonSCIPIndexFactory, JavaScriptSCIPIndexFactory, JavaSCIPIndexFactory, + SCIPCacheManager, StreamingIndexer +) + + +@dataclass +class BenchmarkResult: + """Benchmark result data structure.""" + test_name: str + file_count: int + total_time: float + memory_usage_mb: float + symbols_generated: int + occurrences_generated: int + cache_hit_rate: float + throughput_files_per_sec: float + throughput_symbols_per_sec: float + error_count: int + additional_metrics: Dict[str, Any] + + +@dataclass +class SystemMetrics: + """System resource metrics.""" + cpu_percent: float + memory_percent: float + memory_available_mb: float + disk_io_read_mb: float + disk_io_write_mb: float + + +class PerformanceMonitor: + """Real-time performance monitoring during benchmarks.""" + + def __init__(self): + self.monitoring = False + self.metrics_history: List[SystemMetrics] = [] + self.monitor_thread: Optional[threading.Thread] = None + self.process = psutil.Process() + + def start_monitoring(self, interval: float = 0.5): + """Start performance monitoring.""" + self.monitoring = True + self.metrics_history.clear() + self.monitor_thread = threading.Thread(target=self._monitor_loop, args=(interval,)) + self.monitor_thread.daemon = True + self.monitor_thread.start() + + def stop_monitoring(self) -> List[SystemMetrics]: + """Stop monitoring and return collected metrics.""" + self.monitoring = False + if self.monitor_thread: + self.monitor_thread.join(timeout=2.0) + return self.metrics_history.copy() + + def _monitor_loop(self, interval: float): + """Monitor system metrics in a loop.""" + while self.monitoring: + try: + # Get current metrics + memory_info = self.process.memory_info() + + metrics = SystemMetrics( + cpu_percent=self.process.cpu_percent(), + memory_percent=self.process.memory_percent(), + memory_available_mb=memory_info.rss / 1024 / 1024, + disk_io_read_mb=0.0, # Simplified for demo + disk_io_write_mb=0.0 + ) + + self.metrics_history.append(metrics) + time.sleep(interval) + + except Exception as e: + print(f"Monitoring error: {e}") + break + + +class SCIPFrameworkBenchmark: + """Comprehensive benchmark suite for SCIP framework.""" + + def __init__(self): + self.results: List[BenchmarkResult] = [] + self.monitor = PerformanceMonitor() + + def run_all_benchmarks(self) -> Dict[str, Any]: + """Run complete benchmark suite.""" + print("=== SCIP Framework Performance Benchmark Suite ===") + print(f"System: {psutil.cpu_count()} CPUs, {psutil.virtual_memory().total // 1024**3} GB RAM") + + with tempfile.TemporaryDirectory() as temp_dir: + # Create test projects of various sizes + small_project = self.create_test_project(temp_dir, "small", 50) + medium_project = self.create_test_project(temp_dir, "medium", 200) + large_project = self.create_test_project(temp_dir, "large", 1000) + + # Run benchmarks + benchmark_suite = [ + ("Small Project (50 files)", small_project, {'max_workers': 2, 'batch_size': 10}), + ("Medium Project (200 files)", medium_project, {'max_workers': 4, 'batch_size': 50}), + ("Large Project (1000 files)", large_project, {'max_workers': 8, 'batch_size': 100}), + ] + + for test_name, project_path, config_overrides in benchmark_suite: + print(f"\n🏃 Running: {test_name}") + + # Basic index generation benchmark + result = self.benchmark_index_generation(test_name, project_path, config_overrides) + self.results.append(result) + + # Caching performance benchmark + cache_result = self.benchmark_caching_performance(f"{test_name} - Caching", project_path, config_overrides) + self.results.append(cache_result) + + # Streaming performance benchmark + streaming_result = self.benchmark_streaming_performance(f"{test_name} - Streaming", project_path, config_overrides) + self.results.append(streaming_result) + + # Multi-language benchmark + multi_lang_project = self.create_multi_language_project(temp_dir) + multi_result = self.benchmark_multi_language(multi_lang_project) + self.results.append(multi_result) + + # Memory stress test + memory_result = self.benchmark_memory_usage(large_project) + self.results.append(memory_result) + + # Concurrent processing benchmark + concurrent_result = self.benchmark_concurrent_processing(medium_project) + self.results.append(concurrent_result) + + # Generate comprehensive report + return self.generate_benchmark_report() + + def create_test_project(self, base_dir: str, project_name: str, file_count: int) -> str: + """Create test project with specified number of files.""" + project_dir = os.path.join(base_dir, project_name) + os.makedirs(project_dir, exist_ok=True) + + # Generate Python files with varying complexity + for i in range(file_count): + file_path = os.path.join(project_dir, f"module_{i:04d}.py") + content = self.generate_python_file_content(i, file_count) + + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + + return project_dir + + def create_multi_language_project(self, base_dir: str) -> str: + """Create project with multiple programming languages.""" + project_dir = os.path.join(base_dir, "multi_language") + os.makedirs(project_dir, exist_ok=True) + + # Python files + for i in range(30): + file_path = os.path.join(project_dir, f"python_module_{i}.py") + with open(file_path, 'w') as f: + f.write(self.generate_python_file_content(i, 30)) + + # JavaScript files + for i in range(20): + file_path = os.path.join(project_dir, f"js_module_{i}.js") + with open(file_path, 'w') as f: + f.write(self.generate_javascript_file_content(i)) + + # Java files + for i in range(15): + file_path = os.path.join(project_dir, f"JavaClass_{i}.java") + with open(file_path, 'w') as f: + f.write(self.generate_java_file_content(i)) + + return project_dir + + def generate_python_file_content(self, file_index: int, total_files: int) -> str: + """Generate Python file content with realistic complexity.""" + imports_count = min(5, file_index % 8 + 1) + classes_count = file_index % 3 + 1 + functions_count = file_index % 5 + 2 + + content = f'"""Module {file_index} - Generated for performance testing."""\n\n' + + # Add imports + for i in range(imports_count): + import_target = f"module_{(file_index + i) % total_files:04d}" + content += f"from {import_target} import Class{i}, function_{i}\n" + + content += "\nimport os\nimport sys\nfrom typing import List, Dict, Optional\n\n" + + # Add classes + for class_i in range(classes_count): + content += f''' +class Class{file_index}_{class_i}: + """Test class {class_i} in module {file_index}.""" + + def __init__(self, value: int = 0): + self.value = value + self.data: Dict[str, int] = {{}} + self.items: List[str] = [] + + def process_data(self, input_data: List[int]) -> Dict[str, int]: + """Process input data and return results.""" + result = {{}} + for i, item in enumerate(input_data): + key = f"item_{{i}}" + result[key] = item * self.value + return result + + def calculate_total(self, multiplier: float = 1.0) -> float: + """Calculate total value.""" + return sum(self.data.values()) * multiplier + + def add_item(self, item: str) -> None: + """Add item to collection.""" + if item not in self.items: + self.items.append(item) + + @property + def item_count(self) -> int: + """Get number of items.""" + return len(self.items) +''' + + # Add functions + for func_i in range(functions_count): + content += f''' +def function_{file_index}_{func_i}(param1: int, param2: str = "default") -> Tuple[int, str]: + """Function {func_i} in module {file_index}.""" + processed_value = param1 * {func_i + 1} + processed_string = f"{{param2}}_{{processed_value}}" + + # Some processing logic + if processed_value > 100: + processed_value = processed_value // 2 + + return processed_value, processed_string + +def helper_function_{file_index}_{func_i}(data: List[Any]) -> Optional[Any]: + """Helper function for function_{func_i}.""" + if not data: + return None + + return data[0] if len(data) == 1 else data +''' + + # Add module-level variables + content += f''' +# Module-level variables +MODULE_ID = {file_index} +MODULE_NAME = "module_{file_index:04d}" +DEFAULT_CONFIG = {{ + "enabled": True, + "max_items": {file_index * 10 + 100}, + "timeout": {file_index * 2 + 30} +}} +''' + + return content + + def generate_javascript_file_content(self, file_index: int) -> str: + """Generate JavaScript file content.""" + return f''' +// JavaScript module {file_index} for performance testing +const express = require('express'); +const {{ EventEmitter }} = require('events'); + +class Service{file_index} extends EventEmitter {{ + constructor(config = {{}}) {{ + super(); + this.config = config; + this.data = new Map(); + this.active = false; + }} + + async initialize() {{ + this.active = true; + this.emit('initialized', {{ serviceId: {file_index} }}); + }} + + processData(input) {{ + const result = []; + for (const item of input) {{ + result.push({{ + id: item.id, + value: item.value * {file_index}, + timestamp: Date.now() + }}); + }} + return result; + }} + + async asyncOperation(delay = 100) {{ + return new Promise(resolve => {{ + setTimeout(() => {{ + resolve({{ result: 'completed', serviceId: {file_index} }}); + }}, delay); + }}); + }} +}} + +function helper{file_index}(data) {{ + return data.map(item => ({{ + ...item, + processed: true, + serviceId: {file_index} + }})); +}} + +const config{file_index} = {{ + serviceId: {file_index}, + enabled: true, + maxConnections: {file_index * 10 + 50} +}}; + +module.exports = {{ + Service{file_index}, + helper{file_index}, + config{file_index} +}}; +''' + + def generate_java_file_content(self, file_index: int) -> str: + """Generate Java file content.""" + return f''' +package com.benchmark.test; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.time.LocalDateTime; + +/** + * Test class {file_index} for performance benchmarking. + * Demonstrates various Java language features. + */ +public class JavaClass_{file_index} {{ + private final int classId; + private final Map data; + private final List items; + private boolean active; + + /** + * Constructor for JavaClass_{file_index}. + * + * @param classId Unique identifier for this class + */ + public JavaClass_{file_index}(int classId) {{ + this.classId = classId; + this.data = new ConcurrentHashMap<>(); + this.items = new ArrayList<>(); + this.active = false; + }} + + /** + * Initialize the class with default values. + */ + public void initialize() {{ + this.active = true; + this.data.put("initialized", LocalDateTime.now()); + this.data.put("classId", this.classId); + }} + + /** + * Process a list of integers and return results. + * + * @param input List of integers to process + * @return Map of processed results + */ + public Map processNumbers(List input) {{ + Map results = new HashMap<>(); + + for (int i = 0; i < input.size(); i++) {{ + String key = "result_" + i; + Integer value = input.get(i) * {file_index} + i; + results.put(key, value); + }} + + return results; + }} + + /** + * Add item to the collection. + * + * @param item Item to add + * @return true if item was added, false if it already exists + */ + public boolean addItem(String item) {{ + if (item == null || item.trim().isEmpty()) {{ + return false; + }} + + if (!items.contains(item)) {{ + items.add(item); + return true; + }} + + return false; + }} + + /** + * Get total count of items. + * + * @return Number of items in collection + */ + public int getItemCount() {{ + return items.size(); + }} + + /** + * Check if class is active. + * + * @return true if active, false otherwise + */ + public boolean isActive() {{ + return active; + }} + + /** + * Set active status. + * + * @param active New active status + */ + public void setActive(boolean active) {{ + this.active = active; + if (active) {{ + data.put("lastActivated", LocalDateTime.now()); + }} + }} + + @Override + public String toString() {{ + return String.format("JavaClass_%d{{classId=%d, active=%s, items=%d}}", + {file_index}, classId, active, items.size()); + }} + + @Override + public boolean equals(Object obj) {{ + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + JavaClass_{file_index} other = (JavaClass_{file_index}) obj; + return classId == other.classId; + }} + + @Override + public int hashCode() {{ + return Objects.hash(classId); + }} +}} +''' + + def benchmark_index_generation(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult: + """Benchmark basic index generation performance.""" + print(f" 📊 Index generation benchmark...") + + # Configure framework + config = SCIPConfig( + project_root=project_path, + cache_enabled=False, # Disable cache for pure generation benchmark + validate_compliance=True, + **config_overrides + ) + + framework = SCIPFrameworkAPI(config) + + # Count files + file_count = len(list(Path(project_path).rglob("*.py"))) + + # Start monitoring + self.monitor.start_monitoring() + + # Run benchmark + start_time = time.time() + start_memory = psutil.Process().memory_info().rss / 1024 / 1024 + + try: + index = framework.create_complete_index() + + end_time = time.time() + end_memory = psutil.Process().memory_info().rss / 1024 / 1024 + + # Stop monitoring + metrics_history = self.monitor.stop_monitoring() + + # Calculate metrics + total_time = end_time - start_time + memory_usage = end_memory - start_memory + + symbols_count = sum(len(doc.symbols) for doc in index.documents) + occurrences_count = sum(len(doc.occurrences) for doc in index.occurrences) + + throughput_files = file_count / total_time if total_time > 0 else 0 + throughput_symbols = symbols_count / total_time if total_time > 0 else 0 + + # Additional metrics + avg_cpu = statistics.mean([m.cpu_percent for m in metrics_history]) if metrics_history else 0 + peak_memory = max([m.memory_available_mb for m in metrics_history]) if metrics_history else end_memory + + result = BenchmarkResult( + test_name=test_name, + file_count=file_count, + total_time=total_time, + memory_usage_mb=memory_usage, + symbols_generated=symbols_count, + occurrences_generated=occurrences_count, + cache_hit_rate=0.0, # No cache in this test + throughput_files_per_sec=throughput_files, + throughput_symbols_per_sec=throughput_symbols, + error_count=0, + additional_metrics={ + 'avg_cpu_percent': avg_cpu, + 'peak_memory_mb': peak_memory, + 'documents_generated': len(index.documents), + 'external_symbols': len(index.external_symbols) + } + ) + + print(f" ✓ {file_count} files, {symbols_count} symbols in {total_time:.2f}s") + print(f" ✓ {throughput_files:.1f} files/sec, {throughput_symbols:.1f} symbols/sec") + + return result + + except Exception as e: + self.monitor.stop_monitoring() + print(f" ❌ Benchmark failed: {e}") + + return BenchmarkResult( + test_name=f"{test_name} (FAILED)", + file_count=file_count, + total_time=0, + memory_usage_mb=0, + symbols_generated=0, + occurrences_generated=0, + cache_hit_rate=0.0, + throughput_files_per_sec=0, + throughput_symbols_per_sec=0, + error_count=1, + additional_metrics={'error': str(e)} + ) + + def benchmark_caching_performance(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult: + """Benchmark caching system performance.""" + print(f" 🗂️ Caching performance benchmark...") + + config = SCIPConfig( + project_root=project_path, + cache_enabled=True, + **config_overrides + ) + + framework = SCIPFrameworkAPI(config) + file_count = len(list(Path(project_path).rglob("*.py"))) + + # First run to populate cache + start_time = time.time() + index1 = framework.create_complete_index() + first_run_time = time.time() - start_time + + # Second run with cache + start_time = time.time() + index2 = framework.create_complete_index() + second_run_time = time.time() - start_time + + # Get cache statistics + cache_stats = framework.get_cache_statistics() + hit_rate = float(cache_stats.get('hit_rate', '0%').rstrip('%')) / 100.0 + + symbols_count = sum(len(doc.symbols) for doc in index2.documents) + + result = BenchmarkResult( + test_name=test_name, + file_count=file_count, + total_time=second_run_time, + memory_usage_mb=0, # Not measured in this test + symbols_generated=symbols_count, + occurrences_generated=0, + cache_hit_rate=hit_rate, + throughput_files_per_sec=file_count / second_run_time if second_run_time > 0 else 0, + throughput_symbols_per_sec=symbols_count / second_run_time if second_run_time > 0 else 0, + error_count=0, + additional_metrics={ + 'first_run_time': first_run_time, + 'second_run_time': second_run_time, + 'cache_speedup': first_run_time / second_run_time if second_run_time > 0 else 0, + 'cache_entries': cache_stats.get('memory_entries', 0) + } + ) + + speedup = first_run_time / second_run_time if second_run_time > 0 else 0 + print(f" ✓ Cache hit rate: {hit_rate:.1%}, speedup: {speedup:.1f}x") + + return result + + def benchmark_streaming_performance(self, test_name: str, project_path: str, config_overrides: Dict) -> BenchmarkResult: + """Benchmark streaming indexer performance.""" + print(f" 🌊 Streaming performance benchmark...") + + config = SCIPConfig( + project_root=project_path, + cache_enabled=True, + **config_overrides + ) + + framework = SCIPFrameworkAPI(config) + python_files = list(Path(project_path).rglob("*.py")) + file_paths = [str(f) for f in python_files] + + # Create streaming indexer + python_factory = PythonSCIPIndexFactory(project_path) + cache_manager = SCIPCacheManager() + streaming_indexer = StreamingIndexer( + factory=python_factory, + cache_manager=cache_manager, + max_workers=config_overrides.get('max_workers', 4), + chunk_size=config_overrides.get('batch_size', 50) // 2 + ) + + # Track progress + progress_updates = [] + def track_progress(progress): + progress_updates.append({ + 'percentage': progress.progress_percentage, + 'elapsed': progress.elapsed_time + }) + + streaming_indexer.add_progress_callback(track_progress) + + # Run streaming benchmark + start_time = time.time() + + documents = [] + for doc in streaming_indexer.index_files_streaming(file_paths): + documents.append(doc) + + total_time = time.time() - start_time + + symbols_count = sum(len(doc.symbols) for doc in documents) + occurrences_count = sum(len(doc.occurrences) for doc in documents) + + result = BenchmarkResult( + test_name=test_name, + file_count=len(file_paths), + total_time=total_time, + memory_usage_mb=0, + symbols_generated=symbols_count, + occurrences_generated=occurrences_count, + cache_hit_rate=0.0, + throughput_files_per_sec=len(file_paths) / total_time if total_time > 0 else 0, + throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0, + error_count=0, + additional_metrics={ + 'progress_updates': len(progress_updates), + 'avg_chunk_time': total_time / max(1, len(progress_updates)), + 'documents_streamed': len(documents) + } + ) + + print(f" ✓ Streamed {len(documents)} documents in {total_time:.2f}s") + + return result + + def benchmark_multi_language(self, project_path: str) -> BenchmarkResult: + """Benchmark multi-language processing.""" + print(f" 🌐 Multi-language performance benchmark...") + + config = SCIPConfig( + project_root=project_path, + max_workers=6, + supported_languages={'python', 'javascript', 'java'} + ) + + framework = SCIPFrameworkAPI(config) + + # Count files by language + python_files = len(list(Path(project_path).rglob("*.py"))) + js_files = len(list(Path(project_path).rglob("*.js"))) + java_files = len(list(Path(project_path).rglob("*.java"))) + total_files = python_files + js_files + java_files + + # Run benchmark + start_time = time.time() + index = framework.create_complete_index() + total_time = time.time() - start_time + + symbols_count = sum(len(doc.symbols) for doc in index.documents) + + result = BenchmarkResult( + test_name="Multi-Language Processing", + file_count=total_files, + total_time=total_time, + memory_usage_mb=0, + symbols_generated=symbols_count, + occurrences_generated=0, + cache_hit_rate=0.0, + throughput_files_per_sec=total_files / total_time if total_time > 0 else 0, + throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0, + error_count=0, + additional_metrics={ + 'python_files': python_files, + 'javascript_files': js_files, + 'java_files': java_files, + 'languages_processed': 3, + 'documents_generated': len(index.documents) + } + ) + + print(f" ✓ {total_files} files ({python_files} Python, {js_files} JS, {java_files} Java)") + print(f" ✓ {symbols_count} symbols in {total_time:.2f}s") + + return result + + def benchmark_memory_usage(self, project_path: str) -> BenchmarkResult: + """Benchmark memory usage under load.""" + print(f" 🧠 Memory usage benchmark...") + + # Configure for memory stress testing + config = SCIPConfig( + project_root=project_path, + max_workers=1, # Single worker to control memory usage + batch_size=10, # Small batches + cache_enabled=True + ) + + framework = SCIPFrameworkAPI(config) + file_count = len(list(Path(project_path).rglob("*.py"))) + + # Monitor memory throughout the process + self.monitor.start_monitoring(interval=0.1) # High frequency monitoring + + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + + start_time = time.time() + + # Process with memory monitoring + index = framework.create_complete_index() + + total_time = time.time() - start_time + final_memory = process.memory_info().rss / 1024 / 1024 + + # Stop monitoring and analyze + metrics_history = self.monitor.stop_monitoring() + + if metrics_history: + peak_memory = max(m.memory_available_mb for m in metrics_history) + avg_memory = statistics.mean(m.memory_available_mb for m in metrics_history) + else: + peak_memory = final_memory + avg_memory = final_memory + + memory_growth = final_memory - initial_memory + symbols_count = sum(len(doc.symbols) for doc in index.documents) + + result = BenchmarkResult( + test_name="Memory Usage Analysis", + file_count=file_count, + total_time=total_time, + memory_usage_mb=memory_growth, + symbols_generated=symbols_count, + occurrences_generated=0, + cache_hit_rate=0.0, + throughput_files_per_sec=file_count / total_time if total_time > 0 else 0, + throughput_symbols_per_sec=symbols_count / total_time if total_time > 0 else 0, + error_count=0, + additional_metrics={ + 'initial_memory_mb': initial_memory, + 'final_memory_mb': final_memory, + 'peak_memory_mb': peak_memory, + 'avg_memory_mb': avg_memory, + 'memory_efficiency_mb_per_symbol': memory_growth / symbols_count if symbols_count > 0 else 0, + 'monitoring_samples': len(metrics_history) + } + ) + + print(f" ✓ Memory growth: {memory_growth:.1f} MB (peak: {peak_memory:.1f} MB)") + print(f" ✓ {memory_growth/symbols_count:.3f} MB per symbol") + + return result + + def benchmark_concurrent_processing(self, project_path: str) -> BenchmarkResult: + """Benchmark concurrent processing capabilities.""" + print(f" ⚡ Concurrent processing benchmark...") + + python_files = list(Path(project_path).rglob("*.py")) + file_paths = [str(f) for f in python_files] + + # Test different worker counts + worker_counts = [1, 2, 4, 8] + results = {} + + for workers in worker_counts: + config = SCIPConfig( + project_root=project_path, + max_workers=workers, + batch_size=50 + ) + + framework = SCIPFrameworkAPI(config) + + start_time = time.time() + index = framework.create_complete_index() + elapsed_time = time.time() - start_time + + results[workers] = { + 'time': elapsed_time, + 'symbols': sum(len(doc.symbols) for doc in index.documents) + } + + # Find optimal worker count + best_workers = min(results.keys(), key=lambda w: results[w]['time']) + best_time = results[best_workers]['time'] + sequential_time = results[1]['time'] + + speedup = sequential_time / best_time if best_time > 0 else 0 + efficiency = speedup / best_workers if best_workers > 0 else 0 + + result = BenchmarkResult( + test_name="Concurrent Processing Analysis", + file_count=len(file_paths), + total_time=best_time, + memory_usage_mb=0, + symbols_generated=results[best_workers]['symbols'], + occurrences_generated=0, + cache_hit_rate=0.0, + throughput_files_per_sec=len(file_paths) / best_time if best_time > 0 else 0, + throughput_symbols_per_sec=results[best_workers]['symbols'] / best_time if best_time > 0 else 0, + error_count=0, + additional_metrics={ + 'optimal_workers': best_workers, + 'speedup': speedup, + 'efficiency': efficiency, + 'worker_results': results, + 'parallel_efficiency_percent': efficiency * 100 + } + ) + + print(f" ✓ Optimal workers: {best_workers}, speedup: {speedup:.1f}x") + print(f" ✓ Parallel efficiency: {efficiency:.1%}") + + return result + + def generate_benchmark_report(self) -> Dict[str, Any]: + """Generate comprehensive benchmark report.""" + if not self.results: + return {"error": "No benchmark results available"} + + # Calculate aggregate statistics + total_files = sum(r.file_count for r in self.results) + total_symbols = sum(r.symbols_generated for r in self.results) + total_time = sum(r.total_time for r in self.results) + + # Performance metrics + avg_throughput_files = statistics.mean([r.throughput_files_per_sec for r in self.results if r.throughput_files_per_sec > 0]) + avg_throughput_symbols = statistics.mean([r.throughput_symbols_per_sec for r in self.results if r.throughput_symbols_per_sec > 0]) + + # Memory analysis + memory_results = [r for r in self.results if r.memory_usage_mb > 0] + avg_memory_usage = statistics.mean([r.memory_usage_mb for r in memory_results]) if memory_results else 0 + + # Cache performance + cache_results = [r for r in self.results if r.cache_hit_rate > 0] + avg_cache_hit_rate = statistics.mean([r.cache_hit_rate for r in cache_results]) if cache_results else 0 + + # System information + system_info = { + 'cpu_count': psutil.cpu_count(), + 'cpu_freq_mhz': psutil.cpu_freq().current if psutil.cpu_freq() else 0, + 'memory_total_gb': psutil.virtual_memory().total / 1024**3, + 'memory_available_gb': psutil.virtual_memory().available / 1024**3, + 'disk_usage_percent': psutil.disk_usage('/').percent if os.name != 'nt' else psutil.disk_usage('C:\\').percent + } + + # Performance summary + performance_summary = { + 'total_benchmarks': len(self.results), + 'total_files_processed': total_files, + 'total_symbols_generated': total_symbols, + 'total_processing_time': total_time, + 'average_throughput_files_per_sec': avg_throughput_files, + 'average_throughput_symbols_per_sec': avg_throughput_symbols, + 'average_memory_usage_mb': avg_memory_usage, + 'average_cache_hit_rate': avg_cache_hit_rate, + 'failed_benchmarks': len([r for r in self.results if r.error_count > 0]) + } + + # Detailed results + detailed_results = [] + for result in self.results: + detailed_results.append(asdict(result)) + + # Performance recommendations + recommendations = self.generate_performance_recommendations() + + report = { + 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), + 'system_info': system_info, + 'performance_summary': performance_summary, + 'detailed_results': detailed_results, + 'recommendations': recommendations + } + + # Print summary + print("\n" + "="*60) + print("📊 BENCHMARK RESULTS SUMMARY") + print("="*60) + print(f"Total benchmarks: {len(self.results)}") + print(f"Files processed: {total_files:,}") + print(f"Symbols generated: {total_symbols:,}") + print(f"Total time: {total_time:.2f} seconds") + print(f"Average throughput: {avg_throughput_files:.1f} files/sec, {avg_throughput_symbols:.1f} symbols/sec") + print(f"Average memory usage: {avg_memory_usage:.1f} MB") + if avg_cache_hit_rate > 0: + print(f"Average cache hit rate: {avg_cache_hit_rate:.1%}") + print() + + # Print individual results + for result in self.results: + status = "✓" if result.error_count == 0 else "❌" + print(f"{status} {result.test_name}") + print(f" {result.file_count} files → {result.symbols_generated} symbols in {result.total_time:.2f}s") + print(f" {result.throughput_files_per_sec:.1f} files/sec, {result.throughput_symbols_per_sec:.1f} symbols/sec") + if result.cache_hit_rate > 0: + print(f" Cache hit rate: {result.cache_hit_rate:.1%}") + print() + + return report + + def generate_performance_recommendations(self) -> List[str]: + """Generate performance recommendations based on benchmark results.""" + recommendations = [] + + # Analyze results for recommendations + memory_results = [r for r in self.results if r.memory_usage_mb > 0] + if memory_results: + avg_memory = statistics.mean([r.memory_usage_mb for r in memory_results]) + if avg_memory > 500: # More than 500 MB + recommendations.append("Consider reducing batch_size or max_workers to control memory usage") + + # Cache performance + cache_results = [r for r in self.results if r.cache_hit_rate > 0] + if cache_results: + avg_cache_rate = statistics.mean([r.cache_hit_rate for r in cache_results]) + if avg_cache_rate < 0.7: # Less than 70% hit rate + recommendations.append("Cache performance is suboptimal. Consider increasing cache size or optimizing file change detection") + + # Throughput analysis + throughput_results = [r.throughput_files_per_sec for r in self.results if r.throughput_files_per_sec > 0] + if throughput_results: + avg_throughput = statistics.mean(throughput_results) + if avg_throughput < 10: # Less than 10 files per second + recommendations.append("Consider increasing max_workers or batch_size to improve throughput") + + # Concurrent processing + concurrent_results = [r for r in self.results if 'speedup' in r.additional_metrics] + if concurrent_results: + for result in concurrent_results: + efficiency = result.additional_metrics.get('efficiency', 0) + if efficiency < 0.5: # Less than 50% efficiency + recommendations.append("Parallel processing efficiency is low. Consider reducing worker count or optimizing workload distribution") + + # General recommendations + recommendations.extend([ + "Enable caching for repeated operations to improve performance", + "Use SSD storage for cache directory to reduce I/O latency", + "Monitor memory usage during large project processing", + "Consider streaming processing for very large codebases", + "Validate SCIP compliance only when necessary for better performance" + ]) + + return recommendations + + +def run_benchmark_suite(): + """Main function to run the complete benchmark suite.""" + benchmark = SCIPFrameworkBenchmark() + + try: + report = benchmark.run_all_benchmarks() + + # Save report to file + import json + report_path = "scip_framework_benchmark_report.json" + with open(report_path, 'w', encoding='utf-8') as f: + json.dump(report, f, indent=2, ensure_ascii=False) + + print(f"📄 Detailed benchmark report saved to: {report_path}") + + # Print recommendations + print("\n🎯 PERFORMANCE RECOMMENDATIONS:") + for i, rec in enumerate(report['recommendations'], 1): + print(f"{i}. {rec}") + + return report + + except Exception as e: + print(f"❌ Benchmark suite failed: {e}") + import traceback + traceback.print_exc() + return None + + +if __name__ == "__main__": + run_benchmark_suite() \ No newline at end of file diff --git a/src/code_index_mcp/indexing/scip_builder.py b/src/code_index_mcp/indexing/scip_builder.py index 828d378..0a58e13 100644 --- a/src/code_index_mcp/indexing/scip_builder.py +++ b/src/code_index_mcp/indexing/scip_builder.py @@ -10,14 +10,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field -from ..scip.factory import SCIPIndexerFactory, SCIPIndexingError +from ..scip.language_manager import SCIPLanguageManager, LanguageNotSupportedException from ..scip.proto import scip_pb2 logger = logging.getLogger(__name__) - @dataclass class ValidationResult: """Result of SCIP index validation.""" @@ -34,19 +33,21 @@ class ScanResult: class SCIPIndexBuilder: - """Main builder class that orchestrates SCIP-based indexing.""" + """Main builder class that orchestrates SCIP-based indexing with new language manager.""" def __init__(self, max_workers: Optional[int] = None): self.max_workers = max_workers - self.scip_factory = SCIPIndexerFactory() + self.language_manager: Optional[SCIPLanguageManager] = None self.project_path = "" def build_scip_index(self, project_path: str) -> scip_pb2.Index: """Build complete SCIP index for a project.""" - # Build index without timing logs start_time = datetime.now() self.project_path = project_path + # Initialize language manager for this project + self.language_manager = SCIPLanguageManager(project_path) + logger.info("🚀 Starting SCIP index build for project: %s", project_path) logger.debug("Build configuration: max_workers=%s", self.max_workers) @@ -57,21 +58,22 @@ def build_scip_index(self, project_path: str) -> scip_pb2.Index: total_files_considered = len(scan_result.file_list) logger.info("✅ File scan completed, found %d valid files", total_files_considered) - logger.info("🏷️ Phase 2: Grouping files by strategy...") + logger.info("🏷️ Phase 2: Analyzing language distribution...") file_paths = [str(f['path']) for f in scan_result.file_list] - strategy_files = self.scip_factory.group_files_by_strategy(file_paths) + language_stats = self.language_manager.get_language_statistics(file_paths) - for strategy, files in strategy_files.items(): - logger.info(" 📋 %s: %d files", strategy.__class__.__name__, len(files)) - logger.debug("File grouping completed") + for language, count in language_stats.items(): + logger.info(" 📋 %s: %d files", language, count) + logger.debug("Language analysis completed") - logger.info("⚙️ Phase 3: Processing files with strategies...") - all_documents = self._process_files(strategy_files, project_path) - logger.info("✅ File processing completed, generated %d documents", len(all_documents)) + logger.info("⚙️ Phase 3: Processing files with language manager...") + # Use the new language manager to create the complete index directly + scip_index = self.language_manager.create_complete_index(file_paths) + logger.info("✅ File processing completed, generated %d documents", len(scip_index.documents)) - logger.info("🔗 Phase 4: Assembling SCIP index...") - scip_index = self._assemble_scip_index(all_documents, scan_result, start_time) - logger.debug("Index assembly completed") + logger.info("🔗 Phase 4: Adding metadata...") + self._add_build_metadata(scip_index, scan_result, start_time) + logger.debug("Metadata addition completed") logger.info("🎉 SCIP index build completed successfully") @@ -87,6 +89,49 @@ def build_scip_index(self, project_path: str) -> scip_pb2.Index: logger.error("❌ SCIP index build failed: %s", e, exc_info=True) return self._create_fallback_scip_index(project_path, str(e)) + def _add_build_metadata(self, scip_index: scip_pb2.Index, scan_result: ScanResult, start_time: datetime) -> None: + """Add build metadata to the SCIP index.""" + build_time = datetime.now() - start_time + + # Add tool info to metadata if not already present + if not scip_index.metadata.tool_info.name: + scip_index.metadata.tool_info.name = "code-index-mcp" + scip_index.metadata.tool_info.version = "2.1.0" # Version with new architecture + + # Add project info + if not scip_index.metadata.project_root: + scip_index.metadata.project_root = self.project_path + + logger.debug(f"Added build metadata: {len(scip_index.documents)} documents, build time: {build_time}") + + def _create_fallback_scip_index(self, project_path: str, error_message: str) -> scip_pb2.Index: + """Create a minimal fallback SCIP index when build fails.""" + logger.warning("Creating fallback SCIP index due to error: %s", error_message) + + try: + # Use fallback language manager + fallback_manager = SCIPLanguageManager(project_path) + fallback_factory = fallback_manager.get_factory('fallback') + + # Create minimal index with just metadata + index = scip_pb2.Index() + index.metadata.CopyFrom(fallback_factory.create_metadata(project_path)) + + # Add error document + error_doc = scip_pb2.Document() + error_doc.relative_path = "BUILD_ERROR.md" + error_doc.language = "markdown" + error_doc.text = f"# Build Error\n\nSCIP indexing failed: {error_message}\n" + index.documents.append(error_doc) + + logger.info("Created fallback SCIP index with basic metadata") + return index + + except Exception as e: + logger.error(f"Failed to create fallback index: {e}") + # Return completely empty index as last resort + return scip_pb2.Index() + def _scan_project_files(self, project_path: str) -> ScanResult: """Scan project directory to get a list of files and metadata.""" logger.debug("📂 Starting file system scan of: %s", project_path) @@ -94,104 +139,68 @@ def _scan_project_files(self, project_path: str) -> ScanResult: # Use project settings for exclude patterns logger.debug("🚫 Loading exclude patterns...") - ignored_dirs = self._get_exclude_patterns() - logger.debug("Ignored directories: %s", ignored_dirs) - - # Load gitignore patterns - logger.debug("📋 Loading .gitignore patterns...") + default_exclude = self._get_default_exclude_patterns() gitignore_spec = self._load_gitignore_patterns(project_path) - if hasattr(gitignore_spec, 'patterns'): - logger.debug("Found %d gitignore patterns", len(gitignore_spec.patterns)) - elif gitignore_spec: - logger.debug("Loaded gitignore specification") - else: - logger.debug("No gitignore patterns found") - scan_count = 0 - gitignore_skipped = 0 - hidden_files_skipped = 0 - ignored_dir_time = 0 - gitignore_check_time = 0 + total_scanned = 0 + excluded_count = 0 + included_count = 0 - for root, dirs, filenames in os.walk(project_path): - scan_count += 1 - if scan_count % 100 == 0: - logger.debug("📊 Scanned %d directories, found %d files so far...", scan_count, len(files)) + try: + for root, dirs, filenames in os.walk(project_path): + total_scanned += len(filenames) - # Check if current root path contains any ignored directories - ignored_dir_start = datetime.now() - root_parts = Path(root).parts - project_parts = Path(project_path).parts - relative_parts = root_parts[len(project_parts):] - - # Skip if any part of the path is in ignored_dirs - if any(part in ignored_dirs for part in relative_parts): - ignored_dir_time += (datetime.now() - ignored_dir_start).total_seconds() - logger.debug("🚫 Skipping ignored directory: %s", root) - dirs[:] = [] # Don't descend further - continue + # Filter directories to skip excluded ones + dirs[:] = [d for d in dirs if not any(pattern in d for pattern in default_exclude)] - # Modify dirs in-place to prune the search - original_dirs = len(dirs) - dirs[:] = [d for d in dirs if d not in ignored_dirs] - if len(dirs) < original_dirs: - ignored_dir_time += (datetime.now() - ignored_dir_start).total_seconds() - logger.debug("🚫 Filtered %d ignored subdirectories in %s", original_dirs - len(dirs), root) - else: - ignored_dir_time += (datetime.now() - ignored_dir_start).total_seconds() - - # Apply gitignore filtering to directories - gitignore_dir_start = datetime.now() - pre_gitignore_dirs = len(dirs) - dirs[:] = [d for d in dirs if not self._is_gitignored(os.path.join(root, d), project_path, gitignore_spec)] - gitignore_filtered_dirs = pre_gitignore_dirs - len(dirs) - gitignore_check_time += (datetime.now() - gitignore_dir_start).total_seconds() - - if gitignore_filtered_dirs > 0: - logger.debug("📋 .gitignore filtered %d directories in %s", gitignore_filtered_dirs, root) - - for filename in filenames: - file_check_start = datetime.now() - - # Ignore hidden files (but allow .gitignore itself) - if filename.startswith('.') and filename != '.gitignore': - hidden_files_skipped += 1 - gitignore_check_time += (datetime.now() - file_check_start).total_seconds() - continue + for filename in filenames: + file_path = os.path.join(root, filename) - full_path = os.path.join(root, filename) - - # Apply gitignore filtering to files - if self._is_gitignored(full_path, project_path, gitignore_spec): - gitignore_skipped += 1 - gitignore_check_time += (datetime.now() - file_check_start).total_seconds() - continue + # Check default exclude patterns + if any(pattern in file_path for pattern in default_exclude): + excluded_count += 1 + continue + + # Check gitignore patterns + if self._is_gitignored(file_path, project_path, gitignore_spec): + excluded_count += 1 + continue - gitignore_check_time += (datetime.now() - file_check_start).total_seconds() - files.append(full_path) + # Include file + file_info = { + 'path': Path(file_path), + 'relative_path': os.path.relpath(file_path, project_path), + 'size': os.path.getsize(file_path), + 'extension': os.path.splitext(filename)[1].lower() + } + files.append(file_info) + included_count += 1 - logger.info("📊 File scan summary: scanned %d directories, found %d valid files", scan_count, len(files)) - logger.info("🚫 Filtered files: %d gitignored, %d hidden files", gitignore_skipped, hidden_files_skipped) - - file_list = [{'path': f, 'is_binary': False} for f in files] - project_metadata = {"project_name": os.path.basename(project_path)} - return ScanResult(file_list=file_list, project_metadata=project_metadata) + except Exception as e: + logger.error("❌ File scan failed: %s", e) + raise + + logger.debug("📊 File scan results: %d total, %d included, %d excluded", + total_scanned, included_count, excluded_count) + + project_metadata = { + 'project_path': project_path, + 'project_name': os.path.basename(project_path), + 'total_files_scanned': total_scanned, + 'files_included': included_count, + 'files_excluded': excluded_count, + 'scan_timestamp': datetime.now().isoformat() + } + + return ScanResult(file_list=files, project_metadata=project_metadata) - def _get_exclude_patterns(self) -> set: - """Get exclude patterns from project settings.""" - try: - from ..project_settings import ProjectSettings - # Try to get patterns from project settings - settings = ProjectSettings(self.project_path, skip_load=False) - exclude_patterns = settings.config.get("file_watcher", {}).get("exclude_patterns", []) - return set(exclude_patterns) - except Exception: - # Fallback to basic patterns if settings not available - return {'.git', '.svn', '.hg', '__pycache__', 'node_modules', '.venv', 'venv', - 'build', 'dist', 'target', '.idea', '.vscode'} + def _get_default_exclude_patterns(self) -> set: + """Get default patterns to exclude from indexing.""" + return {'.git', '.svn', '.hg', '__pycache__', 'node_modules', '.venv', 'venv', + 'build', 'dist', 'target', '.idea', '.vscode'} def _load_gitignore_patterns(self, project_path: str): - """Load patterns from .gitignore file using pathspec (required).""" + """Load patterns from .gitignore file using pathspec.""" gitignore_path = os.path.join(project_path, '.gitignore') if os.path.exists(gitignore_path): @@ -204,8 +213,6 @@ def _load_gitignore_patterns(self, project_path: str): return None return None - - def _is_gitignored(self, file_path: str, project_path: str, gitignore_spec) -> bool: """Check if a file or directory is ignored by .gitignore patterns using pathspec.""" @@ -221,161 +228,33 @@ def _is_gitignored(self, file_path: str, project_path: str, gitignore_spec) -> b return gitignore_spec.match_file(rel_path) except Exception: return False - - - - def _process_files(self, strategy_files: Dict, project_path: str) -> List[scip_pb2.Document]: - """Process files using appropriate strategies, either sequentially or in parallel.""" - if self.max_workers and self.max_workers > 1: - return self._process_files_parallel(strategy_files, project_path) - return self._process_files_sequential(strategy_files, project_path) - - def _process_files_sequential(self, strategy_files: Dict, project_path: str) -> List[scip_pb2.Document]: - """Process files sequentially.""" - logger.debug("🔄 Processing files sequentially (single-threaded)") - all_documents = [] - - for strategy, files in strategy_files.items(): - strategy_name = strategy.__class__.__name__ - logger.info("⚙️ Processing %d files with %s...", len(files), strategy_name) - - try: - documents = strategy.generate_scip_documents(files, project_path) - logger.info("✅ %s completed, generated %d documents", strategy_name, len(documents)) - all_documents.extend(documents) - except Exception as e: - logger.error("❌ %s failed: %s", strategy_name, e, exc_info=True) - logger.info("🔄 Trying fallback strategies for %d files...", len(files)) - fallback_docs = self._try_fallback_strategies(files, strategy, project_path) - all_documents.extend(fallback_docs) - logger.info("📄 Fallback generated %d documents", len(fallback_docs)) - - return all_documents - - def _process_files_parallel(self, strategy_files: Dict, project_path: str) -> List[scip_pb2.Document]: - """Process files in parallel.""" - all_documents = [] - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - future_to_strategy = { - executor.submit(s.generate_scip_documents, f, project_path): (s, f) - for s, f in strategy_files.items() - } - for future in as_completed(future_to_strategy): - strategy, files = future_to_strategy[future] - try: - documents = future.result() - all_documents.extend(documents) - - except Exception as e: - all_documents.extend(self._try_fallback_strategies(files, strategy, project_path)) - return all_documents - - def _try_fallback_strategies(self, failed_files: List[str], failed_strategy, project_path: str) -> List[scip_pb2.Document]: - """Try fallback strategies for files that failed.""" - fallback_documents = [] - - for file_path in failed_files: - extension = self._get_file_extension(file_path) - strategies = self.scip_factory.get_strategies_for_extension(extension) - fallback_strategies = [s for s in strategies if s != failed_strategy] - - success = False - for fallback in fallback_strategies: - try: - docs = fallback.generate_scip_documents([file_path], project_path) - fallback_documents.extend(docs) - success = True - break - except Exception: - pass - - if not success: - pass - return fallback_documents - - def _assemble_scip_index(self, documents: List[scip_pb2.Document], scan_result: ScanResult, start_time: datetime) -> scip_pb2.Index: - """Assemble the final SCIP index.""" - scip_index = scip_pb2.Index() - scip_index.metadata.CopyFrom(self._create_metadata(scan_result.project_metadata, start_time)) - scip_index.documents.extend(documents) - external_symbols = self._extract_external_symbols(documents) - scip_index.external_symbols.extend(external_symbols) - - return scip_index - - def _create_metadata(self, project_metadata: Dict[str, Any], start_time: datetime) -> scip_pb2.Metadata: - """Create SCIP metadata.""" - metadata = scip_pb2.Metadata() - metadata.version = scip_pb2.ProtocolVersion.UnspecifiedProtocolVersion - metadata.tool_info.name = "code-index-mcp" - metadata.tool_info.version = "1.2.1" - metadata.tool_info.arguments.extend(["scip-indexing"]) - metadata.project_root = self.project_path - metadata.text_document_encoding = scip_pb2.TextDocumentEncoding.UTF8 - return metadata - - def _extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract and deduplicate external symbols from strategies.""" - external_symbols = [] - seen_symbols = set() - - # Collect external symbols from all strategies - for strategy in self.scip_factory.strategies: - try: - strategy_external_symbols = strategy.get_external_symbols() - for symbol_info in strategy_external_symbols: - symbol_id = symbol_info.symbol - if symbol_id not in seen_symbols: - external_symbols.append(symbol_info) - seen_symbols.add(symbol_id) - except Exception as e: - # Strategy might not support external symbols yet - continue - - return external_symbols def _validate_scip_index(self, scip_index: scip_pb2.Index) -> ValidationResult: """Validate the completed SCIP index.""" errors, warnings = [], [] + if not scip_index.metadata.project_root: errors.append("Missing project_root in metadata") if not scip_index.documents: warnings.append("No documents in SCIP index") + for i, doc in enumerate(scip_index.documents): if not doc.relative_path: errors.append(f"Document {i} missing relative_path") if not doc.language: warnings.append(f"Document {i} ({doc.relative_path}) missing language") + if not scip_index.metadata.tool_info.name: warnings.append("Missing tool name in metadata") + return ValidationResult(is_valid=not errors, errors=errors, warnings=warnings) - def _create_fallback_scip_index(self, project_path: str, error_message: str) -> scip_pb2.Index: - """Create a minimal fallback SCIP index on failure.""" - scip_index = scip_pb2.Index() - metadata = scip_pb2.Metadata() - metadata.tool_info.name = "code-index-mcp" - metadata.tool_info.version = "1.2.1" - metadata.project_root = project_path - metadata.text_document_encoding = scip_pb2.TextDocumentEncoding.UTF8 - scip_index.metadata.CopyFrom(metadata) - - error_doc = scip_pb2.Document() - error_doc.relative_path = "BUILD_ERROR.md" - error_doc.language = "markdown" - error_doc.text = f"# Build Error\n\nSCIP indexing failed: {error_message}\n" - scip_index.documents.append(error_doc) - - - return scip_index - - def _get_file_extension(self, file_path: str) -> str: - """Extract file extension.""" - return os.path.splitext(file_path)[1].lower() - - def get_strategy_summary(self) -> Dict[str, Any]: - """Get a summary of available strategies.""" + def get_language_summary(self) -> Dict[str, Any]: + """Get a summary of available languages.""" + if not self.language_manager: + return {"error": "Language manager not initialized"} + return { - 'total_strategies': len(self.scip_factory.strategies), - 'registered_strategies': [s.get_strategy_name() for s in self.scip_factory.strategies] - } + 'supported_languages': list(self.language_manager.get_supported_languages()), + 'project_path': self.project_path + } \ No newline at end of file diff --git a/src/code_index_mcp/project_settings.py b/src/code_index_mcp/project_settings.py index 5ad2c04..ffbf1c1 100644 --- a/src/code_index_mcp/project_settings.py +++ b/src/code_index_mcp/project_settings.py @@ -190,31 +190,10 @@ def get_config_path(self): def get_scip_index_path(self): """Get the path to the SCIP index file""" - try: - path = os.path.join(self.settings_path, SCIP_INDEX_FILE) - # Ensure directory exists - os.makedirs(os.path.dirname(path), exist_ok=True) - return path - except Exception: - # If error occurs, use file in project or home directory as fallback - if self.base_path and os.path.exists(self.base_path): - return os.path.join(self.base_path, SCIP_INDEX_FILE) - else: - return os.path.join(os.path.expanduser("~"), SCIP_INDEX_FILE) - - def get_index_path(self): - """Get the path to the legacy index file (for backward compatibility)""" - try: - path = os.path.join(self.settings_path, INDEX_FILE) - # Ensure directory exists - os.makedirs(os.path.dirname(path), exist_ok=True) - return path - except Exception: - # If error occurs, use file in project or home directory as fallback - if self.base_path and os.path.exists(self.base_path): - return os.path.join(self.base_path, INDEX_FILE) - else: - return os.path.join(os.path.expanduser("~"), INDEX_FILE) + path = os.path.join(self.settings_path, SCIP_INDEX_FILE) + # Ensure directory exists + os.makedirs(os.path.dirname(path), exist_ok=True) + return path # get_cache_path method removed - no longer needed with new indexing system @@ -471,110 +450,47 @@ def load_scip_index(self): # save_cache and load_cache methods removed - no longer needed with new indexing system - def detect_index_version(self): - """Detect the version of the existing index - + def is_latest_index(self) -> bool: + """Check if SCIP index exists and is the latest version. + Returns: - str: Version string ('legacy', '3.0', or None if no index exists) + bool: True if latest SCIP index exists, False if needs rebuild """ try: - # Check for new JSON format first - index_path = self.get_index_path() - if os.path.exists(index_path): - try: - with open(index_path, 'r', encoding='utf-8') as f: - index_data = json.load(f) - - # Check if it has the new structure - if isinstance(index_data, dict) and 'index_metadata' in index_data: - version = index_data.get('index_metadata', {}).get('version', '3.0') - return version - else: - return 'legacy' - except (json.JSONDecodeError, UnicodeDecodeError): - return 'legacy' - - # Check for old pickle format - old_pickle_path = os.path.join(self.settings_path, "file_index.pickle") - if os.path.exists(old_pickle_path): - return 'legacy' - - # Check fallback locations - if self.base_path and os.path.exists(self.base_path): - fallback_json = os.path.join(self.base_path, INDEX_FILE) - fallback_pickle = os.path.join(self.base_path, "file_index.pickle") - else: - fallback_json = os.path.join(os.path.expanduser("~"), INDEX_FILE) - fallback_pickle = os.path.join(os.path.expanduser("~"), "file_index.pickle") - - if os.path.exists(fallback_json): - try: - with open(fallback_json, 'r', encoding='utf-8') as f: - index_data = json.load(f) - if isinstance(index_data, dict) and 'index_metadata' in index_data: - version = index_data.get('index_metadata', {}).get('version', '3.0') - return version - else: - return 'legacy' - except Exception: - return 'legacy' - - if os.path.exists(fallback_pickle): - return 'legacy' - - return None - + # Only check for SCIP index at settings_path + scip_path = os.path.join(self.settings_path, SCIP_INDEX_FILE) + + if not os.path.exists(scip_path): + return False + + # Basic file integrity check + try: + with open(scip_path, 'rb') as f: + # Check if file is readable and has content + return f.read(1) != b'' + except: + return False + except Exception: - return None - - def migrate_legacy_index(self): - """Migrate legacy index format to new format + return False - Returns: - bool: True if migration was successful or not needed, False if failed - """ + def cleanup_legacy_files(self) -> None: + """Clean up any legacy index files found.""" try: - version = self.detect_index_version() - - if version is None: - return True - - if version == '3.0' or (isinstance(version, str) and version >= '3.0'): - return True - - if version == 'legacy': - - # Clean up legacy files - legacy_files = [ - os.path.join(self.settings_path, "file_index.pickle"), - os.path.join(self.settings_path, "content_cache.pickle") - ] - - # Add fallback locations - if self.base_path and os.path.exists(self.base_path): - legacy_files.extend([ - os.path.join(self.base_path, "file_index.pickle"), - os.path.join(self.base_path, "content_cache.pickle") - ]) - else: - legacy_files.extend([ - os.path.join(os.path.expanduser("~"), "file_index.pickle"), - os.path.join(os.path.expanduser("~"), "content_cache.pickle") - ]) - - for legacy_file in legacy_files: - if os.path.exists(legacy_file): - try: - os.remove(legacy_file) - except Exception: - pass - - return False # Indicate that manual rebuild is needed - - return True - + legacy_files = [ + os.path.join(self.settings_path, "file_index.pickle"), + os.path.join(self.settings_path, "content_cache.pickle"), + os.path.join(self.settings_path, INDEX_FILE) # Legacy JSON + ] + + for legacy_file in legacy_files: + if os.path.exists(legacy_file): + try: + os.remove(legacy_file) + except Exception: + pass except Exception: - return False + pass def clear(self): """Clear config and index files""" diff --git a/src/code_index_mcp/scip/__init__.py b/src/code_index_mcp/scip/__init__.py index 30ace0d..47939ef 100644 --- a/src/code_index_mcp/scip/__init__.py +++ b/src/code_index_mcp/scip/__init__.py @@ -1,10 +1,10 @@ """ SCIP (Source Code Intelligence Protocol) indexing module. -This module provides SCIP-based code indexing capabilities using a multi-strategy -approach to support various programming languages and tools. +This module provides SCIP-based code indexing capabilities using a modern +language manager approach to support various programming languages and tools. """ -from .factory import SCIPIndexerFactory, SCIPIndexingError +from .language_manager import SCIPLanguageManager, LanguageNotSupportedException, create_language_manager -__all__ = ['SCIPIndexerFactory', 'SCIPIndexingError'] \ No newline at end of file +__all__ = ['SCIPLanguageManager', 'LanguageNotSupportedException', 'create_language_manager'] \ No newline at end of file diff --git a/src/code_index_mcp/scip/factory.py b/src/code_index_mcp/scip/factory.py deleted file mode 100644 index 1620d8b..0000000 --- a/src/code_index_mcp/scip/factory.py +++ /dev/null @@ -1,200 +0,0 @@ -"""SCIP Indexer Factory - manages and selects appropriate indexing strategies.""" - -import logging -from typing import List, Dict, Set, Optional -from .strategies.base_strategy import SCIPIndexerStrategy, StrategyError -from .strategies.python_strategy import PythonStrategy -from .strategies.javascript_strategy import JavaScriptStrategy -from .strategies.java_strategy import JavaStrategy -from .strategies.objective_c_strategy import ObjectiveCStrategy -# Optional strategies - import only if available -try: - from .strategies.zig_strategy import ZigStrategy - ZIG_AVAILABLE = True -except ImportError: - ZigStrategy = None - ZIG_AVAILABLE = False -from .strategies.fallback_strategy import FallbackStrategy -from ..constants import SUPPORTED_EXTENSIONS - - -logger = logging.getLogger(__name__) - - -class SCIPIndexerFactory: - """Factory for creating and managing SCIP indexing strategies.""" - - def __init__(self): - """Initialize the factory with all available strategies.""" - self.strategies: List[SCIPIndexerStrategy] = [] - self.strategy_cache: Dict[str, SCIPIndexerStrategy] = {} - self._register_all_strategies() - self._validate_coverage() - - def _register_all_strategies(self): - """Register all available strategies in priority order.""" - logger.info("Registering SCIP indexing strategies (SCIP compliant)...") - - # Language-specific strategies (high priority: 95) - strategy_classes = [ - (PythonStrategy, 95), - (JavaScriptStrategy, 95), - (JavaStrategy, 95), - (ObjectiveCStrategy, 95), - ] - - # Add optional strategies if available - if ZIG_AVAILABLE and ZigStrategy: - strategy_classes.append((ZigStrategy, 95)) - - for strategy_class, priority in strategy_classes: - try: - strategy = strategy_class(priority=priority) - if strategy.is_available(): - self.register_strategy(strategy) - logger.debug(f"Registered {strategy.get_strategy_name()}") - else: - logger.warning(f"Strategy {strategy_class.__name__} is not available") - except Exception as e: - logger.warning(f"Failed to initialize {strategy_class.__name__}: {e}") - continue - - # Fallback strategy (lowest priority: 10) - fallback = FallbackStrategy(priority=10) - self.register_strategy(fallback) - logger.debug(f"Registered {fallback.get_strategy_name()}") - - logger.info(f"Registered {len(self.strategies)} strategies") - - def register_strategy(self, strategy: SCIPIndexerStrategy): - """ - Register a new strategy. - - Args: - strategy: The strategy to register - """ - self.strategies.append(strategy) - # Sort strategies by priority (highest first) - self.strategies.sort(key=lambda s: s.get_priority(), reverse=True) - - def get_strategy(self, extension: str, file_path: str = "") -> SCIPIndexerStrategy: - """ - Get the best strategy for a file type. - - Args: - extension: File extension (e.g., '.py') - file_path: Optional full file path for context - - Returns: - Best available strategy for the file type - - Raises: - StrategySelectionError: If no suitable strategy is found - """ - # Check cache first - cache_key = f"{extension}:{file_path}" - if cache_key in self.strategy_cache: - return self.strategy_cache[cache_key] - - # Find the highest priority strategy that can handle this file - for strategy in self.strategies: - if strategy.can_handle(extension, file_path): - self.strategy_cache[cache_key] = strategy - return strategy - - # No strategy found - raise StrategySelectionError(f"No strategy available for extension '{extension}'") - - def get_strategies_for_extension(self, extension: str) -> List[SCIPIndexerStrategy]: - """ - Get all strategies that can handle a file extension. - - Args: - extension: File extension to check - - Returns: - List of strategies, ordered by priority - """ - return [s for s in self.strategies if s.can_handle(extension, "")] - - def list_supported_extensions(self) -> Set[str]: - """ - Get all file extensions supported by registered strategies. - - Returns: - Set of supported file extensions - """ - supported = set() - - # Add extensions from all registered strategies - for strategy in self.strategies: - if isinstance(strategy, PythonStrategy): - supported.update({'.py', '.pyw'}) - elif isinstance(strategy, JavaScriptStrategy): - supported.update({'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'}) - elif isinstance(strategy, JavaStrategy): - supported.update({'.java'}) - elif isinstance(strategy, ObjectiveCStrategy): - supported.update({'.m', '.mm'}) - elif ZIG_AVAILABLE and isinstance(strategy, ZigStrategy): - supported.update({'.zig', '.zon'}) - elif isinstance(strategy, FallbackStrategy): - # Fallback supports everything, but we don't want to list everything here - pass - - return supported - - def group_files_by_strategy(self, file_paths: List[str]) -> Dict[SCIPIndexerStrategy, List[str]]: - """ - Group files by the strategy that should handle them. - - Args: - file_paths: List of file paths to group - - Returns: - Dictionary mapping strategies to their file lists - """ - strategy_files = {} - - for file_path in file_paths: - # Get file extension - extension = self._get_file_extension(file_path) - - try: - strategy = self.get_strategy(extension, file_path) - if strategy not in strategy_files: - strategy_files[strategy] = [] - strategy_files[strategy].append(file_path) - except StrategySelectionError: - # Skip files we can't handle - logger.debug(f"No strategy available for file: {file_path}") - continue - - return strategy_files - - def _get_file_extension(self, file_path: str) -> str: - """Extract file extension from path.""" - if '.' not in file_path: - return '' - return '.' + file_path.split('.')[-1].lower() - - def _validate_coverage(self): - """Validate that we have reasonable coverage of supported file types.""" - if not self.strategies: - logger.warning("No SCIP strategies registered - indexing will not work") - return - - logger.info(f"SCIP factory initialized with {len(self.strategies)} strategies") - - -# Exception classes -class SCIPIndexingError(Exception): - """Base exception for SCIP indexing errors.""" - - -class StrategySelectionError(SCIPIndexingError): - """Raised when no suitable strategy can be found for a file.""" - - -class IndexingFailedError(SCIPIndexingError): - """Raised when indexing fails for a file or project.""" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/__init__.py b/src/code_index_mcp/scip/framework/__init__.py new file mode 100644 index 0000000..bbd2f12 --- /dev/null +++ b/src/code_index_mcp/scip/framework/__init__.py @@ -0,0 +1,157 @@ +"""SCIP Framework Infrastructure - Complete framework for SCIP standard compliance.""" + +# Core framework components +from .types import SCIPSymbolDescriptor, SCIPPositionInfo, SCIPSymbolContext, SCIPSymbolExtractor +from .standard_framework import SCIPStandardFramework +from .symbol_generator import SCIPSymbolGenerator +from .position_calculator import SCIPPositionCalculator +from .compliance_validator import SCIPComplianceValidator +from .relationship_manager import SCIPRelationshipManager, RelationshipType, SymbolRelationship + +# Language-specific implementations (legacy - being phased out) +# NOTE: Old java_factory.py has been removed and replaced with java/ module + +# Base abstract classes for all language implementations +from .base import ( + SCIPIndexFactory as BaseSCIPIndexFactory, + BaseRelationshipExtractor, + BaseEnumMapper, + BaseLanguageAnalyzer +) + +# New modular Python framework components +from .python import ( + PythonSCIPIndexFactory as ModularPythonSCIPIndexFactory, + create_python_scip_factory, + PythonRelationshipExtractor as ModularPythonRelationshipExtractor, + PythonEnumMapper as ModularPythonEnumMapper, + PythonASTAnalyzer +) + +# New modular JavaScript framework components +from .javascript import ( + JavaScriptSCIPIndexFactory as ModularJavaScriptSCIPIndexFactory, + create_javascript_scip_factory, + JavaScriptRelationshipExtractor as ModularJavaScriptRelationshipExtractor, + JavaScriptEnumMapper as ModularJavaScriptEnumMapper, + JavaScriptSyntaxAnalyzer +) + +# New modular Java framework components +from .java import ( + JavaSCIPIndexFactory as ModularJavaSCIPIndexFactory, + create_java_scip_factory, + JavaRelationshipExtractor as ModularJavaRelationshipExtractor, + JavaEnumMapper as ModularJavaEnumMapper, + JavaTreeSitterAnalyzer +) + +# New modular Objective-C framework components +from .objective_c import ( + ObjectiveCSCIPIndexFactory as ModularObjectiveCSCIPIndexFactory, + create_objective_c_scip_factory, + ObjectiveCRelationshipExtractor as ModularObjectiveCRelationshipExtractor, + ObjectiveCEnumMapper as ModularObjectiveCEnumMapper, + ObjectiveCClangAnalyzer +) + +# New modular Zig framework components +from .zig import ( + ZigSCIPIndexFactory as ModularZigSCIPIndexFactory, + create_zig_scip_factory, + ZigRelationshipExtractor as ModularZigRelationshipExtractor, + ZigEnumMapper as ModularZigEnumMapper, + ZigTreeSitterAnalyzer +) + +# New modular Fallback framework components +from .fallback import ( + FallbackSCIPIndexFactory as ModularFallbackSCIPIndexFactory, + create_fallback_scip_factory, + FallbackRelationshipExtractor as ModularFallbackRelationshipExtractor, + FallbackEnumMapper as ModularFallbackEnumMapper, + FallbackBasicAnalyzer +) + +# Advanced features +from .caching_system import SCIPCacheManager, BatchProcessor, CacheEntry +from .streaming_indexer import StreamingIndexer, IndexingProgress, IndexMerger +from .unified_api import SCIPFrameworkAPI, SCIPConfig, create_scip_framework + +__all__ = [ + # Core framework + 'SCIPSymbolDescriptor', + 'SCIPPositionInfo', + 'SCIPSymbolContext', + 'SCIPSymbolExtractor', + 'SCIPStandardFramework', + 'SCIPSymbolGenerator', + 'SCIPPositionCalculator', + 'SCIPComplianceValidator', + 'SCIPRelationshipManager', + 'RelationshipType', + 'SymbolRelationship', + + # Language implementations (legacy - removed) + # 'JavaSCIPIndexFactory', - moved to java/ module + # 'JavaSCIPEnumMapper', - moved to java/ module + + # Base abstract classes + 'BaseSCIPIndexFactory', + 'BaseRelationshipExtractor', + 'BaseEnumMapper', + 'BaseLanguageAnalyzer', + + # New modular Python components + 'ModularPythonSCIPIndexFactory', + 'create_python_scip_factory', + 'ModularPythonRelationshipExtractor', + 'ModularPythonEnumMapper', + 'PythonASTAnalyzer', + + # New modular JavaScript components + 'ModularJavaScriptSCIPIndexFactory', + 'create_javascript_scip_factory', + 'ModularJavaScriptRelationshipExtractor', + 'ModularJavaScriptEnumMapper', + 'JavaScriptSyntaxAnalyzer', + + # New modular Java components + 'ModularJavaSCIPIndexFactory', + 'create_java_scip_factory', + 'ModularJavaRelationshipExtractor', + 'ModularJavaEnumMapper', + 'JavaTreeSitterAnalyzer', + + # New modular Objective-C components + 'ModularObjectiveCSCIPIndexFactory', + 'create_objective_c_scip_factory', + 'ModularObjectiveCRelationshipExtractor', + 'ModularObjectiveCEnumMapper', + 'ObjectiveCClangAnalyzer', + + # New modular Zig components + 'ModularZigSCIPIndexFactory', + 'create_zig_scip_factory', + 'ModularZigRelationshipExtractor', + 'ModularZigEnumMapper', + 'ZigTreeSitterAnalyzer', + + # New modular Fallback components + 'ModularFallbackSCIPIndexFactory', + 'create_fallback_scip_factory', + 'ModularFallbackRelationshipExtractor', + 'ModularFallbackEnumMapper', + 'FallbackBasicAnalyzer', + + # Advanced features + 'SCIPCacheManager', + 'BatchProcessor', + 'CacheEntry', + 'StreamingIndexer', + 'IndexingProgress', + 'IndexMerger', + 'SCIPFrameworkAPI', + 'SCIPConfig', + 'create_scip_framework' +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/__init__.py b/src/code_index_mcp/scip/framework/base/__init__.py new file mode 100644 index 0000000..65456c8 --- /dev/null +++ b/src/code_index_mcp/scip/framework/base/__init__.py @@ -0,0 +1,13 @@ +"""Base classes for SCIP framework components.""" + +from .index_factory import SCIPIndexFactory +from .relationship_extractor import BaseRelationshipExtractor +from .enum_mapper import BaseEnumMapper +from .language_analyzer import BaseLanguageAnalyzer + +__all__ = [ + 'SCIPIndexFactory', + 'BaseRelationshipExtractor', + 'BaseEnumMapper', + 'BaseLanguageAnalyzer', +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/enum_mapper.py b/src/code_index_mcp/scip/framework/base/enum_mapper.py new file mode 100644 index 0000000..c929bee --- /dev/null +++ b/src/code_index_mcp/scip/framework/base/enum_mapper.py @@ -0,0 +1,38 @@ +"""Base enum mapper class for SCIP compliance.""" + +from abc import ABC, abstractmethod + + +class BaseEnumMapper(ABC): + """Base enum mapper class - mandatory implementation for all languages.""" + + @abstractmethod + def map_symbol_kind(self, language_kind: str) -> int: + """Map language-specific type to SCIP SymbolKind.""" + pass + + @abstractmethod + def map_syntax_kind(self, language_syntax: str) -> int: + """Map language-specific syntax to SCIP SyntaxKind.""" + pass + + @abstractmethod + def map_symbol_role(self, language_role: str) -> int: + """Map language-specific role to SCIP SymbolRole.""" + pass + + def validate_enum_value(self, enum_value: int, enum_type: str) -> bool: + """Validate enum value validity.""" + valid_ranges = { + 'SymbolKind': range(0, 65), # Updated range based on actual protobuf + 'SyntaxKind': range(0, 30), # 0-29 according to SCIP standard + 'SymbolRole': [1, 2, 4, 8, 16, 32] # Bit flags + } + + if enum_type in valid_ranges: + if enum_type == 'SymbolRole': + return enum_value in valid_ranges[enum_type] or any(enum_value & flag for flag in valid_ranges[enum_type]) + else: + return enum_value in valid_ranges[enum_type] + + return False \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/index_factory.py b/src/code_index_mcp/scip/framework/base/index_factory.py new file mode 100644 index 0000000..068c3d9 --- /dev/null +++ b/src/code_index_mcp/scip/framework/base/index_factory.py @@ -0,0 +1,206 @@ +"""Abstract factory base class for SCIP index generation with guaranteed completeness.""" + +from abc import ABC, abstractmethod +from typing import Set, List, Iterator +from ..types import SCIPContext +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from .relationship_extractor import BaseRelationshipExtractor +from .enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 +from ...core.relationship_types import InternalRelationshipType + + +class SCIPIndexFactory(ABC): + """Abstract factory for SCIP index generation with guaranteed completeness.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """ + Constructor injection ensures all required components are provided. + + Args: + project_root: Root directory of the project + symbol_generator: SCIP symbol ID generator + relationship_extractor: Language-specific relationship extractor + enum_mapper: Language-specific enum mapper + position_calculator: UTF-8 compliant position calculator + """ + self.project_root = project_root + self.symbol_generator = symbol_generator + self.relationship_extractor = relationship_extractor + self.enum_mapper = enum_mapper + self.position_calculator = position_calculator + + @abstractmethod + def get_language(self) -> str: + """Return the language identifier.""" + pass + + @abstractmethod + def get_supported_extensions(self) -> Set[str]: + """Return supported file extensions.""" + pass + + @abstractmethod + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract symbol definitions from source code.""" + pass + + @abstractmethod + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract symbol occurrences from source code.""" + pass + + def create_document(self, file_path: str, content: str) -> scip_pb2.Document: + """ + Create complete SCIP document with all essential components. + + This method is final and ensures all components are used. + """ + document = scip_pb2.Document() + document.relative_path = self._get_relative_path(file_path) + document.language = self.get_language() + + # Create processing context + context = SCIPContext(file_path, content, [], {}) + + # Extract symbols (guaranteed to be implemented) + symbols = list(self._extract_symbols(context)) + document.symbols.extend(symbols) + + # Extract occurrences (guaranteed to be implemented) + occurrences = list(self._extract_occurrences(context)) + document.occurrences.extend(occurrences) + + # Extract relationships (guaranteed to be available) + relationships = list(self.relationship_extractor.extract_all_relationships(context)) + self._add_relationships_to_document(document, relationships) + + return document + + def build_complete_index(self, files: List[str]) -> scip_pb2.Index: + """Build complete SCIP index with all 6 essential content categories.""" + index = scip_pb2.Index() + + # 1. Create metadata + index.metadata.CopyFrom(self.create_metadata()) + + # 2. Process all documents + documents = [] + for file_path in files: + if self.can_handle_file(file_path): + document = self.create_document(file_path, self._read_file(file_path)) + documents.append(document) + + index.documents.extend(documents) + + # 3. Extract external symbols + external_symbols = self.extract_external_symbols(documents) + index.external_symbols.extend(external_symbols) + + return index + + def create_metadata(self) -> scip_pb2.Metadata: + """Create standard SCIP metadata.""" + metadata = scip_pb2.Metadata() + metadata.version = scip_pb2.UnspecifiedProtocolVersion + metadata.tool_info.name = "code-index-mcp" + metadata.tool_info.version = "2.1.1" + metadata.tool_info.arguments.extend(["scip-indexing", self.get_language()]) + metadata.project_root = self.project_root + metadata.text_document_encoding = scip_pb2.UTF8 + return metadata + + @abstractmethod + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract external symbols from imports and dependencies.""" + pass + + @abstractmethod + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build cross-document relationships for language-specific processing. + + This method should analyze the provided documents and create relationships + between symbols across different files, taking into account the language's + specific module system and import semantics. + + Args: + documents: List of SCIP documents for this language + full_index: Complete SCIP index with all documents and symbols + + Returns: + Number of cross-document relationships added + """ + pass + + def can_handle_file(self, file_path: str) -> bool: + """Check if this factory can handle the file.""" + import os + extension = os.path.splitext(file_path)[1].lower() + return extension in self.get_supported_extensions() + + def _get_relative_path(self, file_path: str) -> str: + """Get relative path from project root.""" + import os + return os.path.relpath(file_path, self.project_root) + + def _read_file(self, file_path: str) -> str: + """Read file content.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except Exception: + return "" + + def _add_relationships_to_document(self, document: scip_pb2.Document, relationships): + """Add relationships to document symbols.""" + # Build a map of symbol_id -> SymbolInformation for quick lookup + symbol_map = {} + for symbol_info in document.symbols: + symbol_map[symbol_info.symbol] = symbol_info + + # Process each relationship + for rel in relationships: + # Add forward relationship (source -> target) + if rel.source_symbol in symbol_map: + source_symbol_info = symbol_map[rel.source_symbol] + + # Create SCIP Relationship + scip_rel = scip_pb2.Relationship() + scip_rel.symbol = rel.target_symbol + + # Map relationship type to SCIP flags + if rel.relationship_type == InternalRelationshipType.CALLS: + scip_rel.is_reference = True + elif rel.relationship_type == InternalRelationshipType.INHERITS: + scip_rel.is_reference = True + elif rel.relationship_type == InternalRelationshipType.IMPLEMENTS: + scip_rel.is_implementation = True + elif rel.relationship_type == InternalRelationshipType.IMPORTS: + scip_rel.is_reference = True + elif rel.relationship_type == InternalRelationshipType.CONTAINS: + scip_rel.is_definition = True + else: + scip_rel.is_reference = True # Default + + # Add to source symbol's relationships + source_symbol_info.relationships.append(scip_rel) + + # Add reverse relationship for called_by (target -> source) + if rel.relationship_type == InternalRelationshipType.CALLS: + if rel.target_symbol in symbol_map: + target_symbol_info = symbol_map[rel.target_symbol] + + # Create reverse relationship for called_by + reverse_rel = scip_pb2.Relationship() + reverse_rel.symbol = rel.source_symbol + reverse_rel.is_reference = True # called_by is a reference + + # Add to target symbol's relationships + target_symbol_info.relationships.append(reverse_rel) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/language_analyzer.py b/src/code_index_mcp/scip/framework/base/language_analyzer.py new file mode 100644 index 0000000..358cbd0 --- /dev/null +++ b/src/code_index_mcp/scip/framework/base/language_analyzer.py @@ -0,0 +1,77 @@ +"""Base language analyzer class for different parsing approaches.""" + +from abc import ABC, abstractmethod +from typing import Dict, List, Any, Optional + + +class BaseLanguageAnalyzer(ABC): + """Base class for language-specific analyzers (AST, regex, tree-sitter, etc.).""" + + @abstractmethod + def parse(self, content: str, filename: str = ""): + """Parse source code content into an internal representation.""" + pass + + @abstractmethod + def is_symbol_definition(self, node) -> bool: + """Check if a node represents a symbol definition.""" + pass + + @abstractmethod + def is_symbol_reference(self, node) -> bool: + """Check if a node represents a symbol reference.""" + pass + + @abstractmethod + def get_symbol_name(self, node) -> Optional[str]: + """Extract symbol name from a node.""" + pass + + @abstractmethod + def get_node_position(self, node) -> tuple: + """Get position information from a node.""" + pass + + def extract_symbols(self, content: str) -> List[Dict[str, Any]]: + """Extract all symbols from content - default implementation.""" + symbols = [] + try: + parsed = self.parse(content) + nodes = self.walk(parsed) if hasattr(self, 'walk') else [parsed] + + for node in nodes: + if self.is_symbol_definition(node): + symbol_name = self.get_symbol_name(node) + if symbol_name: + position = self.get_node_position(node) + symbols.append({ + 'name': symbol_name, + 'position': position, + 'node': node + }) + except Exception: + pass + + return symbols + + def extract_references(self, content: str) -> List[Dict[str, Any]]: + """Extract all symbol references from content - default implementation.""" + references = [] + try: + parsed = self.parse(content) + nodes = self.walk(parsed) if hasattr(self, 'walk') else [parsed] + + for node in nodes: + if self.is_symbol_reference(node): + symbol_name = self.get_symbol_name(node) + if symbol_name: + position = self.get_node_position(node) + references.append({ + 'name': symbol_name, + 'position': position, + 'node': node + }) + except Exception: + pass + + return references \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/relationship_extractor.py b/src/code_index_mcp/scip/framework/base/relationship_extractor.py new file mode 100644 index 0000000..1a851dd --- /dev/null +++ b/src/code_index_mcp/scip/framework/base/relationship_extractor.py @@ -0,0 +1,41 @@ +"""Base class for all language-specific relationship extractors.""" + +from abc import ABC, abstractmethod +from typing import Iterator +from ..types import SCIPContext, Relationship + + +class BaseRelationshipExtractor(ABC): + """Base class for all language-specific relationship extractors.""" + + @abstractmethod + def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract inheritance relationships - required for all OOP languages.""" + pass + + @abstractmethod + def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract call relationships - required for all languages.""" + pass + + @abstractmethod + def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract import/dependency relationships - required for all languages.""" + pass + + def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract composition relationships - optional implementation.""" + return iter([]) + + def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract interface relationships - optional implementation.""" + return iter([]) + + def extract_all_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract all relationships using implemented methods.""" + # Yield from all relationship extraction methods + yield from self.extract_inheritance_relationships(context) + yield from self.extract_call_relationships(context) + yield from self.extract_import_relationships(context) + yield from self.extract_composition_relationships(context) + yield from self.extract_interface_relationships(context) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/caching_system.py b/src/code_index_mcp/scip/framework/caching_system.py new file mode 100644 index 0000000..eaa0392 --- /dev/null +++ b/src/code_index_mcp/scip/framework/caching_system.py @@ -0,0 +1,346 @@ +"""SCIP Framework Caching System - Performance optimization with intelligent caching.""" + +import logging +import hashlib +import pickle +import os +import time +from typing import Dict, Any, Optional, List, Tuple +from datetime import datetime, timedelta +from dataclasses import dataclass +from pathlib import Path + +from ..proto import scip_pb2 + +logger = logging.getLogger(__name__) + + +@dataclass +class CacheEntry: + """Cache entry with metadata.""" + data: Any + created_at: datetime + file_hash: str + access_count: int = 0 + last_accessed: Optional[datetime] = None + + +class SCIPCacheManager: + """Advanced caching system for SCIP framework with intelligent invalidation.""" + + def __init__(self, cache_dir: Optional[str] = None, max_memory_entries: int = 1000): + """Initialize cache manager.""" + self.cache_dir = Path(cache_dir) if cache_dir else Path.cwd() / ".scip_cache" + self.cache_dir.mkdir(exist_ok=True) + + # In-memory cache for frequently accessed items + self._memory_cache: Dict[str, CacheEntry] = {} + self.max_memory_entries = max_memory_entries + + # File modification tracking + self._file_hashes: Dict[str, str] = {} + + # Performance metrics + self._cache_hits = 0 + self._cache_misses = 0 + self._cache_invalidations = 0 + + logger.debug(f"Initialized SCIP cache manager with directory: {self.cache_dir}") + + def get_document_cache(self, file_path: str) -> Optional[scip_pb2.Document]: + """Get cached document if valid.""" + cache_key = self._get_cache_key("document", file_path) + + # Check if file has been modified + if self._is_file_modified(file_path): + self._invalidate_file_cache(file_path) + return None + + # Try memory cache first + if cache_key in self._memory_cache: + entry = self._memory_cache[cache_key] + entry.access_count += 1 + entry.last_accessed = datetime.now() + self._cache_hits += 1 + logger.debug(f"Memory cache hit for document: {file_path}") + return entry.data + + # Try disk cache + disk_entry = self._load_from_disk(cache_key) + if disk_entry: + # Move to memory cache for faster access + self._memory_cache[cache_key] = disk_entry + self._cache_hits += 1 + logger.debug(f"Disk cache hit for document: {file_path}") + return disk_entry.data + + self._cache_misses += 1 + return None + + def cache_document(self, file_path: str, document: scip_pb2.Document) -> None: + """Cache document with file modification tracking.""" + cache_key = self._get_cache_key("document", file_path) + file_hash = self._calculate_file_hash(file_path) + + entry = CacheEntry( + data=document, + created_at=datetime.now(), + file_hash=file_hash + ) + + # Store in memory cache + self._memory_cache[cache_key] = entry + self._file_hashes[file_path] = file_hash + + # Evict old entries if memory cache is full + self._evict_old_entries() + + # Store on disk for persistence + self._save_to_disk(cache_key, entry) + + logger.debug(f"Cached document: {file_path}") + + def get_symbol_cache(self, symbol_id: str) -> Optional[scip_pb2.SymbolInformation]: + """Get cached symbol information.""" + cache_key = self._get_cache_key("symbol", symbol_id) + + if cache_key in self._memory_cache: + entry = self._memory_cache[cache_key] + entry.access_count += 1 + entry.last_accessed = datetime.now() + self._cache_hits += 1 + return entry.data + + disk_entry = self._load_from_disk(cache_key) + if disk_entry: + self._memory_cache[cache_key] = disk_entry + self._cache_hits += 1 + return disk_entry.data + + self._cache_misses += 1 + return None + + def cache_symbol(self, symbol_id: str, symbol_info: scip_pb2.SymbolInformation) -> None: + """Cache symbol information.""" + cache_key = self._get_cache_key("symbol", symbol_id) + + entry = CacheEntry( + data=symbol_info, + created_at=datetime.now(), + file_hash="" # Symbols don't have associated files directly + ) + + self._memory_cache[cache_key] = entry + self._save_to_disk(cache_key, entry) + + logger.debug(f"Cached symbol: {symbol_id}") + + def get_relationship_cache(self, source_symbol: str, target_symbol: str) -> Optional[List[str]]: + """Get cached relationships between symbols.""" + cache_key = self._get_cache_key("relationship", f"{source_symbol}::{target_symbol}") + + if cache_key in self._memory_cache: + entry = self._memory_cache[cache_key] + entry.access_count += 1 + self._cache_hits += 1 + return entry.data + + self._cache_misses += 1 + return None + + def cache_relationships(self, source_symbol: str, target_symbol: str, relationships: List[str]) -> None: + """Cache relationships between symbols.""" + cache_key = self._get_cache_key("relationship", f"{source_symbol}::{target_symbol}") + + entry = CacheEntry( + data=relationships, + created_at=datetime.now(), + file_hash="" + ) + + self._memory_cache[cache_key] = entry + logger.debug(f"Cached relationships: {source_symbol} -> {target_symbol}") + + def invalidate_file_cache(self, file_path: str) -> None: + """Invalidate all cache entries related to a file.""" + self._invalidate_file_cache(file_path) + + def invalidate_all_cache(self) -> None: + """Clear all caches.""" + self._memory_cache.clear() + self._file_hashes.clear() + + # Clear disk cache + for cache_file in self.cache_dir.glob("*.cache"): + try: + cache_file.unlink() + except OSError as e: + logger.warning(f"Failed to delete cache file {cache_file}: {e}") + + self._cache_invalidations += 1 + logger.info("Invalidated all caches") + + def get_cache_statistics(self) -> Dict[str, Any]: + """Get cache performance statistics.""" + total_requests = self._cache_hits + self._cache_misses + hit_rate = (self._cache_hits / total_requests) if total_requests > 0 else 0 + + return { + "cache_hits": self._cache_hits, + "cache_misses": self._cache_misses, + "hit_rate": f"{hit_rate:.2%}", + "memory_entries": len(self._memory_cache), + "max_memory_entries": self.max_memory_entries, + "cache_invalidations": self._cache_invalidations, + "tracked_files": len(self._file_hashes), + "cache_directory": str(self.cache_dir) + } + + def _get_cache_key(self, cache_type: str, identifier: str) -> str: + """Generate cache key for identifier.""" + return f"{cache_type}_{hashlib.md5(identifier.encode()).hexdigest()}" + + def _calculate_file_hash(self, file_path: str) -> str: + """Calculate hash of file content.""" + try: + with open(file_path, 'rb') as f: + return hashlib.md5(f.read()).hexdigest() + except (OSError, IOError) as e: + logger.warning(f"Failed to calculate hash for {file_path}: {e}") + return "" + + def _is_file_modified(self, file_path: str) -> bool: + """Check if file has been modified since last cache.""" + if file_path not in self._file_hashes: + return True + + current_hash = self._calculate_file_hash(file_path) + return current_hash != self._file_hashes[file_path] + + def _invalidate_file_cache(self, file_path: str) -> None: + """Invalidate cache entries for a specific file.""" + # Remove from file hash tracking + if file_path in self._file_hashes: + del self._file_hashes[file_path] + + # Find and remove related cache entries + document_key = self._get_cache_key("document", file_path) + if document_key in self._memory_cache: + del self._memory_cache[document_key] + + # Remove from disk cache + cache_file = self.cache_dir / f"{document_key}.cache" + if cache_file.exists(): + try: + cache_file.unlink() + except OSError as e: + logger.warning(f"Failed to delete cache file {cache_file}: {e}") + + self._cache_invalidations += 1 + logger.debug(f"Invalidated cache for file: {file_path}") + + def _evict_old_entries(self) -> None: + """Evict least recently used entries when memory cache is full.""" + if len(self._memory_cache) <= self.max_memory_entries: + return + + # Sort by last accessed time (least recent first) + sorted_entries = sorted( + self._memory_cache.items(), + key=lambda x: x[1].last_accessed or x[1].created_at + ) + + # Remove oldest 10% of entries + entries_to_remove = max(1, len(sorted_entries) // 10) + for i in range(entries_to_remove): + key_to_remove = sorted_entries[i][0] + del self._memory_cache[key_to_remove] + + logger.debug(f"Evicted {entries_to_remove} cache entries") + + def _save_to_disk(self, cache_key: str, entry: CacheEntry) -> None: + """Save cache entry to disk.""" + try: + cache_file = self.cache_dir / f"{cache_key}.cache" + with open(cache_file, 'wb') as f: + pickle.dump(entry, f) + except (OSError, IOError, pickle.PickleError) as e: + logger.warning(f"Failed to save cache entry {cache_key}: {e}") + + def _load_from_disk(self, cache_key: str) -> Optional[CacheEntry]: + """Load cache entry from disk.""" + try: + cache_file = self.cache_dir / f"{cache_key}.cache" + if not cache_file.exists(): + return None + + # Check if cache file is too old (older than 24 hours) + if time.time() - cache_file.stat().st_mtime > 86400: # 24 hours + cache_file.unlink() + return None + + with open(cache_file, 'rb') as f: + entry = pickle.load(f) + entry.last_accessed = datetime.now() + return entry + + except (OSError, IOError, pickle.PickleError) as e: + logger.warning(f"Failed to load cache entry {cache_key}: {e}") + return None + + +class BatchProcessor: + """Batch processing system for optimized SCIP index generation.""" + + def __init__(self, cache_manager: SCIPCacheManager, batch_size: int = 50): + """Initialize batch processor.""" + self.cache_manager = cache_manager + self.batch_size = batch_size + self._pending_documents: List[Tuple[str, str]] = [] # (file_path, content) + self._processed_count = 0 + + def add_file(self, file_path: str, content: str) -> None: + """Add file to processing batch.""" + self._pending_documents.append((file_path, content)) + + # Process batch when it reaches the target size + if len(self._pending_documents) >= self.batch_size: + self.process_batch() + + def process_batch(self) -> List[scip_pb2.Document]: + """Process current batch of files.""" + if not self._pending_documents: + return [] + + logger.info(f"Processing batch of {len(self._pending_documents)} files") + documents = [] + + for file_path, content in self._pending_documents: + # Check cache first + cached_doc = self.cache_manager.get_document_cache(file_path) + if cached_doc: + documents.append(cached_doc) + logger.debug(f"Using cached document for {file_path}") + else: + # Process file (this would be implemented by the specific factory) + logger.debug(f"Processing file {file_path}") + # Placeholder for actual processing + documents.append(scip_pb2.Document()) + + self._processed_count += len(self._pending_documents) + self._pending_documents.clear() + + logger.info(f"Completed batch processing. Total processed: {self._processed_count}") + return documents + + def finalize(self) -> List[scip_pb2.Document]: + """Process any remaining files in the batch.""" + return self.process_batch() + + def get_stats(self) -> Dict[str, int]: + """Get batch processing statistics.""" + return { + "processed_files": self._processed_count, + "pending_files": len(self._pending_documents), + "batch_size": self.batch_size + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/compliance_validator.py b/src/code_index_mcp/scip/framework/compliance_validator.py new file mode 100644 index 0000000..dca9eb2 --- /dev/null +++ b/src/code_index_mcp/scip/framework/compliance_validator.py @@ -0,0 +1,319 @@ +"""SCIP Compliance Validator - Runtime verification for SCIP standard compliance.""" + +import logging +import re +from typing import List, Dict, Optional, Tuple, Any +from .types import SCIPPositionInfo +from ..proto import scip_pb2 + + +logger = logging.getLogger(__name__) + + +class SCIPComplianceValidator: + """SCIP compliance validator for runtime verification of generated content.""" + + # SCIP symbol ID format patterns + LOCAL_SYMBOL_PATTERN = re.compile(r'^local\s+.+$') + GLOBAL_SYMBOL_PATTERN = re.compile(r'^[^\s]+\s+[^\s]+\s+[^\s]+(\s+[^\s]+)?\s+.+$') + + def __init__(self): + """Initialize compliance validator.""" + self.validation_errors = [] + self.validation_warnings = [] + + def validate_document(self, document: scip_pb2.Document) -> bool: + """ + Validate complete SCIP document for compliance. + + Args: + document: SCIP Document to validate + + Returns: + True if document is compliant, False otherwise + """ + self.clear_validation_results() + + try: + # Validate document structure + self._validate_document_structure(document) + + # Validate all symbol occurrences + for occurrence in document.occurrences: + self._validate_occurrence(occurrence) + + # Validate all symbol information + for symbol_info in document.symbols: + self._validate_symbol_information(symbol_info) + + # Check for consistency between occurrences and symbols + self._validate_occurrence_symbol_consistency(document) + + # Log validation results + if self.validation_errors: + logger.error(f"Document validation failed with {len(self.validation_errors)} errors") + for error in self.validation_errors: + logger.error(f" - {error}") + return False + + if self.validation_warnings: + logger.warning(f"Document validation completed with {len(self.validation_warnings)} warnings") + for warning in self.validation_warnings: + logger.warning(f" - {warning}") + + logger.debug("Document validation passed") + return True + + except Exception as e: + self._add_error(f"Validation exception: {e}") + return False + + def validate_index(self, index: scip_pb2.Index) -> bool: + """ + Validate complete SCIP index for compliance. + + Args: + index: SCIP Index to validate + + Returns: + True if index is compliant, False otherwise + """ + self.clear_validation_results() + + try: + # Validate index metadata + if index.HasField('metadata'): + self._validate_metadata(index.metadata) + else: + self._add_error("Index missing required metadata") + + # Validate all documents + for document in index.documents: + if not self.validate_document(document): + self._add_error(f"Document validation failed: {document.relative_path}") + + # Validate external symbols + for external_symbol in index.external_symbols: + self._validate_symbol_information(external_symbol) + + return len(self.validation_errors) == 0 + + except Exception as e: + self._add_error(f"Index validation exception: {e}") + return False + + def validate_symbol_id(self, symbol_id: str) -> bool: + """ + Validate symbol ID against SCIP grammar. + + Args: + symbol_id: Symbol ID to validate + + Returns: + True if valid, False otherwise + """ + if not symbol_id: + return False + + if symbol_id.startswith('local '): + return self._validate_local_symbol(symbol_id[6:]) + else: + return self._validate_global_symbol(symbol_id) + + def validate_position(self, position: SCIPPositionInfo, content: str) -> bool: + """ + Validate position information against content. + + Args: + position: Position to validate + content: Source content + + Returns: + True if position is valid, False otherwise + """ + try: + # Basic position validation + if not position.validate(): + return False + + # Document bounds validation + if not self._is_within_document_bounds(position, content): + return False + + # UTF-8 compliance validation + if not self._is_utf8_compliant(position, content): + return False + + return True + + except Exception as e: + logger.error(f"Position validation error: {e}") + return False + + def _validate_document_structure(self, document: scip_pb2.Document) -> None: + """Validate basic document structure.""" + if not document.relative_path: + self._add_error("Document missing relative_path") + + if not document.language: + self._add_warning("Document missing language specification") + + # Check path format + if '\\' in document.relative_path: + self._add_warning("Document path should use forward slashes") + + def _validate_occurrence(self, occurrence: scip_pb2.Occurrence) -> None: + """Validate SCIP occurrence.""" + # Validate symbol ID + if not self.validate_symbol_id(occurrence.symbol): + self._add_error(f"Invalid symbol ID in occurrence: {occurrence.symbol}") + + # Validate symbol roles + if not self._validate_symbol_roles(occurrence.symbol_roles): + self._add_error(f"Invalid symbol roles: {occurrence.symbol_roles}") + + # Validate syntax kind + if not self._validate_syntax_kind(occurrence.syntax_kind): + self._add_error(f"Invalid syntax kind: {occurrence.syntax_kind}") + + # Validate range + if occurrence.HasField('range'): + self._validate_range(occurrence.range) + + def _validate_symbol_information(self, symbol_info: scip_pb2.SymbolInformation) -> None: + """Validate SCIP symbol information.""" + # Validate symbol ID + if not self.validate_symbol_id(symbol_info.symbol): + self._add_error(f"Invalid symbol ID in symbol info: {symbol_info.symbol}") + + # Validate symbol kind + if not self._validate_symbol_kind(symbol_info.kind): + self._add_error(f"Invalid symbol kind: {symbol_info.kind}") + + # Validate display name + if not symbol_info.display_name: + self._add_warning(f"Symbol missing display name: {symbol_info.symbol}") + + def _validate_metadata(self, metadata: scip_pb2.Metadata) -> None: + """Validate SCIP metadata.""" + if not metadata.HasField('tool_info'): + self._add_error("Metadata missing tool_info") + else: + if not metadata.tool_info.name: + self._add_error("Metadata tool_info missing name") + if not metadata.tool_info.version: + self._add_warning("Metadata tool_info missing version") + + if not metadata.project_root: + self._add_error("Metadata missing project_root") + + # Validate text encoding + if metadata.text_document_encoding == scip_pb2.UnspecifiedTextDocumentEncoding: + self._add_warning("Metadata has unspecified text encoding") + + def _validate_range(self, range_obj: scip_pb2.Range) -> None: + """Validate SCIP range object.""" + if len(range_obj.start) < 2 or len(range_obj.end) < 2: + self._add_error("Range missing start or end positions (need [line, character])") + return + + start_line, start_char = range_obj.start[0], range_obj.start[1] + end_line, end_char = range_obj.end[0], range_obj.end[1] + + # Validate position ordering + if start_line > end_line or (start_line == end_line and start_char > end_char): + self._add_error(f"Invalid range: start position after end position") + + # Validate non-negative positions + if start_line < 0 or start_char < 0 or end_line < 0 or end_char < 0: + self._add_error("Range positions cannot be negative") + + def _validate_occurrence_symbol_consistency(self, document: scip_pb2.Document) -> None: + """Validate consistency between occurrences and symbol definitions.""" + defined_symbols = {symbol.symbol for symbol in document.symbols} + referenced_symbols = {occ.symbol for occ in document.occurrences} + + # Check for undefined symbols (warnings, not errors) + undefined_refs = referenced_symbols - defined_symbols + for undefined_symbol in undefined_refs: + if undefined_symbol.startswith('local '): + self._add_warning(f"Reference to undefined local symbol: {undefined_symbol}") + + def _validate_local_symbol(self, local_id: str) -> bool: + """Validate local symbol format.""" + return bool(local_id and not local_id.startswith(' ') and not local_id.endswith(' ')) + + def _validate_global_symbol(self, symbol_id: str) -> bool: + """Validate global symbol format.""" + parts = symbol_id.split(' ') + return len(parts) >= 3 and all(part.strip() for part in parts) + + def _validate_symbol_kind(self, kind: int) -> bool: + """Validate SymbolKind enum value.""" + return 0 <= kind <= 64 # SCIP SymbolKind range (updated to match actual protobuf) + + def _validate_syntax_kind(self, kind: int) -> bool: + """Validate SyntaxKind enum value.""" + return 0 <= kind <= 29 # SCIP SyntaxKind range + + def _validate_symbol_roles(self, roles: int) -> bool: + """Validate SymbolRole bit flags.""" + valid_flags = [1, 2, 4, 8, 16, 32] # Definition, Import, WriteAccess, ReadAccess, Generated, Test + + if roles in valid_flags: + return True + + # Check if it's a valid combination of flags + return (roles & ~sum(valid_flags)) == 0 and roles > 0 + + def _is_within_document_bounds(self, position: SCIPPositionInfo, content: str) -> bool: + """Check if position is within document boundaries.""" + lines = content.split('\n') + return ( + 0 <= position.start_line < len(lines) and + 0 <= position.end_line < len(lines) and + 0 <= position.start_column <= len(lines[position.start_line]) and + 0 <= position.end_column <= len(lines[position.end_line]) + ) + + def _is_utf8_compliant(self, position: SCIPPositionInfo, content: str) -> bool: + """Validate UTF-8 character position accuracy.""" + try: + lines = content.split('\n') + + # Test encoding/decoding at position boundaries + if position.start_line < len(lines): + start_line_text = lines[position.start_line][:position.start_column] + start_line_text.encode('utf-8').decode('utf-8') + + if position.end_line < len(lines): + end_line_text = lines[position.end_line][:position.end_column] + end_line_text.encode('utf-8').decode('utf-8') + + return True + + except (UnicodeEncodeError, UnicodeDecodeError, IndexError): + return False + + def _add_error(self, message: str) -> None: + """Add validation error.""" + self.validation_errors.append(message) + + def _add_warning(self, message: str) -> None: + """Add validation warning.""" + self.validation_warnings.append(message) + + def clear_validation_results(self) -> None: + """Clear previous validation results.""" + self.validation_errors.clear() + self.validation_warnings.clear() + + def get_validation_summary(self) -> dict: + """Get summary of validation results.""" + return { + 'errors': len(self.validation_errors), + 'warnings': len(self.validation_warnings), + 'error_messages': self.validation_errors.copy(), + 'warning_messages': self.validation_warnings.copy(), + 'is_valid': len(self.validation_errors) == 0 + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/__init__.py b/src/code_index_mcp/scip/framework/fallback/__init__.py new file mode 100644 index 0000000..e9cce6e --- /dev/null +++ b/src/code_index_mcp/scip/framework/fallback/__init__.py @@ -0,0 +1,14 @@ +"""Fallback SCIP Framework Module - For unsupported languages and files.""" + +from .factory import FallbackSCIPIndexFactory, create_fallback_scip_factory +from .relationship_extractor import FallbackRelationshipExtractor +from .enum_mapper import FallbackEnumMapper +from .basic_analyzer import FallbackBasicAnalyzer + +__all__ = [ + 'FallbackSCIPIndexFactory', + 'create_fallback_scip_factory', + 'FallbackRelationshipExtractor', + 'FallbackEnumMapper', + 'FallbackBasicAnalyzer' +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/basic_analyzer.py b/src/code_index_mcp/scip/framework/fallback/basic_analyzer.py new file mode 100644 index 0000000..f561e08 --- /dev/null +++ b/src/code_index_mcp/scip/framework/fallback/basic_analyzer.py @@ -0,0 +1,156 @@ +"""Fallback basic analyzer implementation.""" + +from typing import Iterator, Optional, Set, List, Dict, Any +from ..types import SCIPContext +from ..base.language_analyzer import BaseLanguageAnalyzer +from pathlib import Path + + +class FallbackBasicAnalyzer(BaseLanguageAnalyzer): + """Fallback analyzer for basic file analysis without parsing.""" + + def __init__(self): + """Initialize the fallback basic analyzer.""" + self._processed_files: Set[str] = set() + + def parse(self, content: str, filename: str = ""): + """Parse content (no-op for fallback, returns file info).""" + return { + 'filename': filename, + 'content_length': len(content), + 'line_count': content.count('\n') + 1, + 'type': 'fallback_file' + } + + def walk(self, tree) -> Iterator: + """Walk tree nodes (returns single file node for fallback).""" + yield tree # Return the entire file as a single "node" + + def is_symbol_definition(self, node) -> bool: + """Check if node represents a symbol definition (file-level only).""" + return isinstance(node, dict) and node.get('type') == 'fallback_file' + + def is_symbol_reference(self, node) -> bool: + """Check if node represents a symbol reference (none for fallback).""" + return False # Fallback doesn't analyze references + + def get_symbol_name(self, node) -> Optional[str]: + """Extract symbol name from node (filename for fallback).""" + if isinstance(node, dict) and 'filename' in node: + return Path(node['filename']).stem + return None + + def get_node_position(self, node) -> tuple: + """Get position information from node.""" + if isinstance(node, dict): + line_count = node.get('line_count', 1) + return (0, 0, line_count - 1, 0) # Start to end of file + return (0, 0, 0, 0) + + def extract_file_info(self, content: str, filename: str) -> Dict[str, Any]: + """Extract basic file information.""" + path = Path(filename) + + return { + 'filename': filename, + 'basename': path.name, + 'stem': path.stem, + 'suffix': path.suffix, + 'content_length': len(content), + 'line_count': content.count('\n') + 1, + 'language': self.detect_language_from_extension(path.suffix), + 'is_binary': self._is_likely_binary(content), + 'encoding': 'utf-8' # Assume UTF-8 for text files + } + + def detect_language_from_extension(self, extension: str) -> str: + """Detect specific language from file extension.""" + extension_mapping = { + # Programming languages + '.c': 'c', + '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.c++': 'cpp', + '.h': 'c', '.hpp': 'cpp', '.hh': 'cpp', '.hxx': 'cpp', + '.js': 'javascript', '.mjs': 'javascript', '.jsx': 'javascript', + '.ts': 'typescript', '.tsx': 'typescript', + '.py': 'python', '.pyi': 'python', '.pyx': 'python', + '.java': 'java', + '.go': 'go', + '.rs': 'rust', + '.rb': 'ruby', + '.cs': 'csharp', + '.php': 'php', + '.swift': 'swift', + '.kt': 'kotlin', '.kts': 'kotlin', + '.scala': 'scala', + '.r': 'r', + '.lua': 'lua', + '.perl': 'perl', '.pl': 'perl', + '.zig': 'zig', + '.dart': 'dart', + '.m': 'objective-c', '.mm': 'objective-c', + + # Web and markup + '.html': 'html', '.htm': 'html', + '.css': 'css', + '.scss': 'scss', '.sass': 'sass', + '.less': 'less', + '.vue': 'vue', + '.svelte': 'svelte', + '.astro': 'astro', + + # Data and config + '.json': 'json', + '.xml': 'xml', + '.yaml': 'yaml', '.yml': 'yaml', + '.toml': 'toml', + '.ini': 'ini', + '.cfg': 'ini', + '.conf': 'ini', + + # Documentation + '.md': 'markdown', '.markdown': 'markdown', + '.mdx': 'mdx', + '.tex': 'latex', + '.rst': 'rst', + + # Database and query + '.sql': 'sql', + '.cql': 'cql', + '.cypher': 'cypher', + '.sparql': 'sparql', + '.graphql': 'graphql', '.gql': 'graphql', + + # Shell and scripts + '.sh': 'shell', '.bash': 'bash', + '.zsh': 'zsh', '.fish': 'fish', + '.ps1': 'powershell', + '.bat': 'batch', '.cmd': 'batch', + + # Template languages + '.handlebars': 'handlebars', '.hbs': 'handlebars', + '.ejs': 'ejs', + '.pug': 'pug', + '.mustache': 'mustache', + + # Other + '.dockerfile': 'dockerfile', + '.gitignore': 'gitignore', + '.env': 'dotenv', + } + + return extension_mapping.get(extension.lower(), 'text') + + def get_file_statistics(self, content: str) -> Dict[str, int]: + """Get basic file statistics.""" + return { + 'total_characters': len(content), + 'total_lines': content.count('\n') + 1, + 'non_empty_lines': len([line for line in content.split('\n') if line.strip()]), + 'blank_lines': content.count('\n') + 1 - len([line for line in content.split('\n') if line.strip()]), + 'estimated_words': len(content.split()) if content.strip() else 0 + } + + def _is_likely_binary(self, content: str, sample_size: int = 1024) -> bool: + """Check if content is likely binary based on null bytes.""" + sample = content[:sample_size] + return '\x00' in sample or any(ord(c) > 127 for c in sample[:100]) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/enum_mapper.py b/src/code_index_mcp/scip/framework/fallback/enum_mapper.py new file mode 100644 index 0000000..08d338f --- /dev/null +++ b/src/code_index_mcp/scip/framework/fallback/enum_mapper.py @@ -0,0 +1,102 @@ +"""Fallback enum mapper implementation.""" + +from typing import Dict, Optional +from ..base.enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 + + +class FallbackEnumMapper(BaseEnumMapper): + """Fallback enum mapper for basic SCIP enum mappings.""" + + def __init__(self): + """Initialize fallback enum mapper with minimal mappings.""" + super().__init__() + + # Minimal symbol kind mappings for fallback + self._symbol_kind_map = { + 'file': scip_pb2.File, + 'text': scip_pb2.File, + 'unknown': scip_pb2.UnspecifiedSymbolKind, + } + + # Minimal symbol role mappings + self._symbol_role_map = { + 'definition': scip_pb2.Definition, + 'reference': scip_pb2.Read, + } + + # Minimal syntax kind mappings + self._syntax_kind_map = { + 'file': scip_pb2.UnspecifiedSyntaxKind, + 'text': scip_pb2.UnspecifiedSyntaxKind, + 'identifier': scip_pb2.IdentifierKeyword, + } + + def map_symbol_kind(self, fallback_kind: str) -> int: + """Map fallback symbol kind to SCIP SymbolKind enum.""" + kind = self._symbol_kind_map.get(fallback_kind.lower()) + if kind is not None: + return kind + + # Default to File for fallback + return scip_pb2.File + + def map_symbol_role(self, fallback_role: str) -> int: + """Map fallback symbol role to SCIP SymbolRole enum.""" + role = self._symbol_role_map.get(fallback_role.lower()) + if role is not None: + return role + + # Default to Definition for fallback + return scip_pb2.Definition + + def map_syntax_kind(self, fallback_syntax: str) -> int: + """Map fallback syntax kind to SCIP SyntaxKind enum.""" + syntax = self._syntax_kind_map.get(fallback_syntax.lower()) + if syntax is not None: + return syntax + + # Default to UnspecifiedSyntaxKind for fallback + return scip_pb2.UnspecifiedSyntaxKind + + def get_symbol_kind_name(self, kind: int) -> Optional[str]: + """Get human-readable name for symbol kind.""" + reverse_map = {v: k for k, v in self._symbol_kind_map.items()} + return reverse_map.get(kind) + + def get_symbol_role_name(self, role: int) -> Optional[str]: + """Get human-readable name for symbol role.""" + reverse_map = {v: k for k, v in self._symbol_role_map.items()} + return reverse_map.get(role) + + def get_syntax_kind_name(self, syntax: int) -> Optional[str]: + """Get human-readable name for syntax kind.""" + reverse_map = {v: k for k, v in self._syntax_kind_map.items()} + return reverse_map.get(syntax) + + def validate_symbol_kind(self, kind: int) -> bool: + """Validate if symbol kind is valid.""" + # Accept all valid SCIP symbol kinds + return 0 <= kind <= 64 + + def validate_symbol_role(self, role: int) -> bool: + """Validate if symbol role is valid.""" + # Accept all valid SCIP symbol roles + return 0 <= role <= 32 + + def validate_syntax_kind(self, syntax: int) -> bool: + """Validate if syntax kind is valid.""" + # Accept all valid SCIP syntax kinds + return 0 <= syntax <= 1000 + + def get_supported_symbol_kinds(self) -> Dict[str, int]: + """Get all supported symbol kinds.""" + return self._symbol_kind_map.copy() + + def get_supported_symbol_roles(self) -> Dict[str, int]: + """Get all supported symbol roles.""" + return self._symbol_role_map.copy() + + def get_supported_syntax_kinds(self) -> Dict[str, int]: + """Get all supported syntax kinds.""" + return self._syntax_kind_map.copy() \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/factory.py b/src/code_index_mcp/scip/framework/fallback/factory.py new file mode 100644 index 0000000..4d57f4e --- /dev/null +++ b/src/code_index_mcp/scip/framework/fallback/factory.py @@ -0,0 +1,153 @@ +"""Fallback SCIP Index Factory implementation.""" + +import os +from pathlib import Path +from typing import Set, List, Iterator, Optional +from ..base.index_factory import SCIPIndexFactory +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..base.enum_mapper import BaseEnumMapper +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from ..types import SCIPContext +from .relationship_extractor import FallbackRelationshipExtractor +from .enum_mapper import FallbackEnumMapper +from .basic_analyzer import FallbackBasicAnalyzer +from ...proto import scip_pb2 +from ....constants import SUPPORTED_EXTENSIONS + + +class FallbackSCIPIndexFactory(SCIPIndexFactory): + """Fallback SCIP Index factory for unsupported languages and files.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """Initialize Fallback factory with required components via constructor injection.""" + super().__init__(project_root, symbol_generator, relationship_extractor, + enum_mapper, position_calculator) + self.basic_analyzer = FallbackBasicAnalyzer() + + def get_language(self) -> str: + """Return language identifier.""" + return "text" + + def get_supported_extensions(self) -> Set[str]: + """Return all supported file extensions as fallback handles everything.""" + return SUPPORTED_EXTENSIONS + + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract minimal symbol information (file-level only).""" + try: + # Only create a file-level symbol for fallback + file_name = Path(context.file_path).stem + if file_name: + symbol_info = self._create_file_symbol(context, file_name) + if symbol_info: + yield symbol_info + + except Exception as e: + # Silently handle errors in fallback + pass + + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract minimal occurrences (file-level only).""" + try: + # Create single occurrence for the entire file + file_name = Path(context.file_path).stem + if file_name: + occurrence = self._create_file_occurrence(context, file_name) + if occurrence: + yield occurrence + + except Exception as e: + # Silently handle errors in fallback + pass + + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract external symbols (none for fallback).""" + return [] # Fallback doesn't analyze external dependencies + + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build cross-document relationships for fallback (no relationships). + + Fallback factory doesn't create cross-document relationships as it handles + unsupported languages with minimal symbol information. + """ + return 0 # No cross-document relationships for fallback + + def _create_file_symbol(self, context: SCIPContext, file_name: str) -> Optional[scip_pb2.SymbolInformation]: + """Create SCIP symbol information for the file itself.""" + symbol_info = scip_pb2.SymbolInformation() + + # Detect language from file extension + language = self.basic_analyzer.detect_language_from_extension( + Path(context.file_path).suffix + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol( + language=language, + file_path=context.file_path, + symbol_path=[file_name], + descriptor="" + ) + symbol_info.display_name = file_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('file') + symbol_info.documentation.append( + f"File: {context.file_path} ({language})" + ) + + return symbol_info + + def _create_file_occurrence(self, context: SCIPContext, file_name: str) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence for the file itself.""" + occurrence = scip_pb2.Occurrence() + + # Set range to cover entire file (0,0) to (lines, 0) + lines = context.content.count('\n') + occurrence.range.start.extend([0, 0]) + occurrence.range.end.extend([lines, 0]) + + # Detect language from file extension + language = self.basic_analyzer.detect_language_from_extension( + Path(context.file_path).suffix + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol( + language=language, + file_path=context.file_path, + symbol_path=[file_name], + descriptor="" + ) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('file') + + return occurrence + + +def create_fallback_scip_factory(project_root: str) -> FallbackSCIPIndexFactory: + """ + Factory creator for Fallback SCIP factory. + Ensures all required components are properly assembled via constructor injection. + """ + symbol_generator = SCIPSymbolGenerator( + scheme="scip-fallback", + package_manager="generic", + package_name=Path(project_root).name, + version="HEAD" + ) + + relationship_extractor = FallbackRelationshipExtractor() + enum_mapper = FallbackEnumMapper() + position_calculator = SCIPPositionCalculator() + + return FallbackSCIPIndexFactory( + project_root=project_root, + symbol_generator=symbol_generator, + relationship_extractor=relationship_extractor, # Guaranteed to be provided + enum_mapper=enum_mapper, + position_calculator=position_calculator + ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/relationship_extractor.py b/src/code_index_mcp/scip/framework/fallback/relationship_extractor.py new file mode 100644 index 0000000..facc4d4 --- /dev/null +++ b/src/code_index_mcp/scip/framework/fallback/relationship_extractor.py @@ -0,0 +1,85 @@ +"""Fallback relationship extractor implementation.""" + +from typing import List, Dict, Set, Optional, Any +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..relationship_manager import SymbolRelationship, RelationshipType +from ..types import SCIPContext + + +class FallbackRelationshipExtractor(BaseRelationshipExtractor): + """Fallback relationship extractor - minimal relationship analysis.""" + + def __init__(self): + """Initialize fallback relationship extractor.""" + super().__init__() + + def extract_symbol_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: + """Extract symbol relationships from fallback context (minimal analysis).""" + relationships = [] + + # For fallback, we only create minimal file-level relationships + try: + file_symbol = self._create_file_symbol_id(context.file_path) + + # Create self-relationship for the file + relationships.append(SymbolRelationship( + source_symbol=file_symbol, + target_symbol=file_symbol, + relationship_type=RelationshipType.CONTAINS, + source_location=(0, 0), + target_location=(0, 0), + context_info={ + "type": "file_self_reference", + "description": f"File contains itself: {context.file_path}" + } + )) + + except Exception: + # Silently handle any errors in fallback mode + pass + + return relationships + + def extract_import_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: + """Extract import relationships (none for fallback).""" + return [] # Fallback doesn't analyze imports + + def extract_inheritance_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: + """Extract inheritance relationships (none for fallback).""" + return [] # Fallback doesn't analyze inheritance + + def extract_call_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: + """Extract call relationships (none for fallback).""" + return [] # Fallback doesn't analyze function calls + + def extract_field_access_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: + """Extract field access relationships (none for fallback).""" + return [] # Fallback doesn't analyze field access + + def extract_type_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: + """Extract type relationships (none for fallback).""" + return [] # Fallback doesn't analyze types + + def resolve_cross_file_references(self, + local_relationships: List[SymbolRelationship], + global_symbol_map: Dict[str, Any]) -> List[SymbolRelationship]: + """Resolve cross-file references (none for fallback).""" + return local_relationships # No cross-file analysis in fallback + + def get_relationship_statistics(self) -> Dict[str, int]: + """Get relationship extraction statistics.""" + return { + "total_relationships": 0, + "import_relationships": 0, + "inheritance_relationships": 0, + "call_relationships": 0, + "field_access_relationships": 0, + "type_relationships": 0, + "cross_file_relationships": 0 + } + + def _create_file_symbol_id(self, file_path: str) -> str: + """Create a simple symbol ID for the file.""" + from pathlib import Path + file_name = Path(file_path).stem + return f"local {file_name}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/index_factory.py b/src/code_index_mcp/scip/framework/index_factory.py new file mode 100644 index 0000000..b78f343 --- /dev/null +++ b/src/code_index_mcp/scip/framework/index_factory.py @@ -0,0 +1,337 @@ +"""SCIP Index Factory - Abstract factory ensuring complete SCIP Index generation.""" + +import logging +import os +from abc import ABC, abstractmethod +from pathlib import Path +from typing import List, Dict, Optional, Tuple, Any + +from .types import SCIPSymbolDescriptor, SCIPPositionInfo +from .compliance_validator import SCIPComplianceValidator +from ..proto import scip_pb2 + + +logger = logging.getLogger(__name__) + + +class SCIPIndexFactory(ABC): + """ + Abstract factory ensuring complete SCIP Index generation. + + This factory ensures all generated SCIP indexes contain the 6 essential content categories: + 1. Index Metadata + 2. Document Collection + 3. Symbol Definitions + 4. Symbol Occurrences + 5. Symbol Relationships + 6. External Symbols + """ + + def __init__(self, project_root: str): + """ + Initialize SCIP index factory. + + Args: + project_root: Absolute path to project root + """ + self.project_root = Path(project_root).resolve() + self.project_name = self.project_root.name + self._validator = SCIPComplianceValidator() + + logger.debug(f"Initialized SCIP Index Factory for project: {self.project_name}") + + @abstractmethod + def create_metadata(self, project_root: str) -> scip_pb2.Metadata: + """ + Create standard-compliant metadata (Category 1). + + Args: + project_root: Project root directory + + Returns: + SCIP Metadata object with all required fields + """ + pass + + @abstractmethod + def create_document(self, file_path: str, content: str) -> scip_pb2.Document: + """ + Create complete document with all occurrences and symbols (Category 2). + + Args: + file_path: Path to source file + content: File content + + Returns: + SCIP Document with complete symbol information + """ + pass + + @abstractmethod + def create_symbol_definition(self, + name: str, + kind: str, + scope: List[str], + file_path: str, + position: Optional[SCIPPositionInfo] = None, + documentation: Optional[List[str]] = None) -> scip_pb2.SymbolInformation: + """ + Create SCIP-compliant symbol definition (Category 3). + + Args: + name: Symbol name + kind: Symbol kind (function, class, variable, etc.) + scope: Scope path + file_path: File where symbol is defined + position: Optional position information + documentation: Optional documentation + + Returns: + SCIP SymbolInformation object + """ + pass + + @abstractmethod + def create_symbol_occurrence(self, + symbol_id: str, + position: SCIPPositionInfo, + role: str, + syntax: str) -> scip_pb2.Occurrence: + """ + Create SCIP-compliant symbol occurrence (Category 4). + + Args: + symbol_id: SCIP symbol identifier + position: Position information + role: Symbol role (definition, reference, etc.) + syntax: Syntax kind + + Returns: + SCIP Occurrence object + """ + pass + + @abstractmethod + def create_symbol_relationship(self, + source: str, + target: str, + rel_type: str) -> scip_pb2.Relationship: + """ + Create SCIP-compliant symbol relationship (Category 5). + + Args: + source: Source symbol ID + target: Target symbol ID + rel_type: Relationship type (inheritance, call, import, etc.) + + Returns: + SCIP Relationship object + """ + pass + + @abstractmethod + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """ + Extract external symbols from imports and dependencies (Category 6). + + Args: + documents: List of processed documents + + Returns: + List of external symbol information + """ + pass + + def _extract_symbol_relationships(self, files: List[str], symbol_definitions: Dict[str, str], + documents: List[scip_pb2.Document]) -> None: + """ + Extract symbol relationships (Category 5). + + Default implementation does nothing. Subclasses can override to provide + language-specific relationship extraction. + + Args: + files: List of file paths + symbol_definitions: Mapping of symbol names to symbol IDs + documents: List of processed documents to update with relationships + """ + # Default implementation - no relationship extraction + pass + + def build_complete_index(self, files: List[str]) -> scip_pb2.Index: + """ + Build complete SCIP Index with all 6 content categories. + + Args: + files: List of file paths to index + + Returns: + Complete SCIP Index + + Raises: + RuntimeError: If index validation fails + """ + logger.info(f"Building complete SCIP index for {len(files)} files") + + index = scip_pb2.Index() + + # 1. Create metadata (Category 1) + logger.debug("Creating index metadata...") + index.metadata.CopyFrom(self.create_metadata(str(self.project_root))) + + # 2. Process all documents (Category 2) + logger.debug(f"Processing {len(files)} documents...") + documents = [] + symbol_definitions = {} # Track all symbol definitions for relationship extraction + + for file_path in files: + try: + content = self._read_file(file_path) + if content is not None: + doc = self.create_document(file_path, content) + documents.append(doc) + + # Collect symbol definitions for relationship extraction + for symbol_info in doc.symbols: + symbol_definitions[symbol_info.display_name] = symbol_info.symbol + + logger.debug(f"Processed document: {doc.relative_path}") + else: + logger.warning(f"Skipped unreadable file: {file_path}") + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + continue + + index.documents.extend(documents) + logger.info(f"Successfully processed {len(documents)} documents") + + # 2.5. Extract relationships (Category 5) - if supported by factory + logger.debug("Extracting symbol relationships...") + try: + self._extract_symbol_relationships(files, symbol_definitions, documents) + logger.info("Completed relationship extraction") + except Exception as e: + logger.warning(f"Relationship extraction failed: {e}") + + # 3. Extract external symbols (Category 6) + logger.debug("Extracting external symbols...") + try: + external_symbols = self.extract_external_symbols(documents) + index.external_symbols.extend(external_symbols) + logger.info(f"Extracted {len(external_symbols)} external symbols") + except Exception as e: + logger.warning(f"Failed to extract external symbols: {e}") + + # 4. Validate complete index + logger.debug("Validating complete index...") + if not self._validator.validate_index(index): + validation_summary = self._validator.get_validation_summary() + error_msg = f"Index validation failed: {validation_summary['error_messages']}" + logger.error(error_msg) + raise RuntimeError(error_msg) + + # Log final statistics + total_occurrences = sum(len(doc.occurrences) for doc in documents) + total_symbols = sum(len(doc.symbols) for doc in documents) + + logger.info(f"Created complete SCIP index:") + logger.info(f" - Documents: {len(documents)}") + logger.info(f" - Occurrences: {total_occurrences}") + logger.info(f" - Symbol Definitions: {total_symbols}") + logger.info(f" - External Symbols: {len(external_symbols)}") + + return index + + def validate_generated_content(self, content: Any) -> bool: + """ + Validate any generated SCIP content for compliance. + + Args: + content: SCIP content to validate (Index, Document, etc.) + + Returns: + True if content is compliant + """ + try: + if isinstance(content, scip_pb2.Index): + return self._validator.validate_index(content) + elif isinstance(content, scip_pb2.Document): + return self._validator.validate_document(content) + else: + logger.warning(f"Unknown content type for validation: {type(content)}") + return False + except Exception as e: + logger.error(f"Validation failed: {e}") + return False + + def get_validation_summary(self) -> dict: + """Get detailed validation summary from last validation operation.""" + return self._validator.get_validation_summary() + + def _read_file(self, file_path: str) -> Optional[str]: + """ + Read file content with encoding detection. + + Args: + file_path: Path to file + + Returns: + File content or None if reading fails + """ + encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] + + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + except (OSError, PermissionError, FileNotFoundError) as e: + logger.warning(f"Could not read {file_path}: {e}") + return None + + logger.warning(f"Could not decode {file_path} with any supported encoding") + return None + + def _get_relative_path(self, file_path: str) -> str: + """ + Get relative path from project root. + + Args: + file_path: Absolute or relative file path + + Returns: + Relative path from project root + """ + try: + path = Path(file_path) + if path.is_absolute(): + return str(path.relative_to(self.project_root)).replace('\\', '/') + return file_path.replace('\\', '/') + except ValueError: + # If path is not under project_root, return as-is + return str(Path(file_path)).replace('\\', '/') + + def _validate_symbol_id(self, symbol_id: str) -> bool: + """Validate symbol ID format.""" + return self._validator.validate_symbol_id(symbol_id) + + def _validate_position(self, position: SCIPPositionInfo, content: str) -> bool: + """Validate position information.""" + return self._validator.validate_position(position, content) + + def get_factory_info(self) -> dict: + """Get information about this factory instance.""" + return { + 'project_root': str(self.project_root), + 'project_name': self.project_name, + 'factory_type': self.__class__.__name__, + 'supported_categories': [ + 'Index Metadata', + 'Document Collection', + 'Symbol Definitions', + 'Symbol Occurrences', + 'Symbol Relationships', + 'External Symbols' + ] + } diff --git a/src/code_index_mcp/scip/framework/java/__init__.py b/src/code_index_mcp/scip/framework/java/__init__.py new file mode 100644 index 0000000..f9bd800 --- /dev/null +++ b/src/code_index_mcp/scip/framework/java/__init__.py @@ -0,0 +1,14 @@ +"""Java SCIP framework module.""" + +from .factory import JavaSCIPIndexFactory, create_java_scip_factory +from .enum_mapper import JavaEnumMapper +from .relationship_extractor import JavaRelationshipExtractor +from .tree_sitter_analyzer import JavaTreeSitterAnalyzer + +__all__ = [ + 'JavaSCIPIndexFactory', + 'create_java_scip_factory', + 'JavaEnumMapper', + 'JavaRelationshipExtractor', + 'JavaTreeSitterAnalyzer' +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/enum_mapper.py b/src/code_index_mcp/scip/framework/java/enum_mapper.py new file mode 100644 index 0000000..13d5f77 --- /dev/null +++ b/src/code_index_mcp/scip/framework/java/enum_mapper.py @@ -0,0 +1,200 @@ +"""Java enum mapper implementation.""" + +from ..base.enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 + + +class JavaEnumMapper(BaseEnumMapper): + """Java-specific enum mapper for SCIP compliance.""" + + # Java symbol kind mappings + SYMBOL_KIND_MAP = { + 'method': scip_pb2.Method, + 'class': scip_pb2.Class, + 'interface': scip_pb2.Interface, + 'enum': scip_pb2.Enum, + 'field': scip_pb2.Field, + 'variable': scip_pb2.Variable, + 'parameter': scip_pb2.Parameter, + 'constructor': scip_pb2.Constructor, + 'package': scip_pb2.Package, + 'annotation': scip_pb2.Interface, + 'constant': scip_pb2.Constant, + 'local_variable': scip_pb2.Variable, + 'type_parameter': scip_pb2.TypeParameter, + } + + # Java syntax kind mappings + SYNTAX_KIND_MAP = { + 'method_declaration': scip_pb2.IdentifierFunctionDefinition, + 'class_declaration': scip_pb2.IdentifierType, + 'interface_declaration': scip_pb2.IdentifierType, + 'enum_declaration': scip_pb2.IdentifierType, + 'field_declaration': scip_pb2.IdentifierAttribute, + 'variable_declaration': scip_pb2.IdentifierLocal, + 'parameter_declaration': scip_pb2.IdentifierParameter, + 'constructor_declaration': scip_pb2.IdentifierFunctionDefinition, + 'annotation_declaration': scip_pb2.IdentifierType, + 'identifier': scip_pb2.Identifier, + 'keyword': scip_pb2.IdentifierKeyword, + 'string_literal': scip_pb2.StringLiteral, + 'numeric_literal': scip_pb2.NumericLiteral, + 'boolean_literal': scip_pb2.BooleanLiteral, + 'comment': scip_pb2.Comment, + 'punctuation': scip_pb2.PunctuationDelimiter, + } + + # Java symbol role mappings (official SCIP naming) + SYMBOL_ROLE_MAP = { + 'definition': scip_pb2.Definition, + 'import': scip_pb2.Import, + 'write': scip_pb2.Write, # Official SCIP naming + 'read': scip_pb2.Read, # Official SCIP naming + 'generated': scip_pb2.Generated, + 'test': scip_pb2.Test, + 'type': scip_pb2.Type, # Add missing Type role + 'reference': scip_pb2.Read, # Default reference is read access + } + + def map_symbol_kind(self, language_kind: str) -> int: + """Map Java symbol type to SCIP SymbolKind.""" + kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SymbolKind'): + raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") + + return kind + + def map_syntax_kind(self, language_syntax: str) -> int: + """Map Java syntax element to SCIP SyntaxKind.""" + kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SyntaxKind'): + raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") + + return kind + + def map_symbol_role(self, language_role: str) -> int: + """Map Java symbol role to SCIP SymbolRole.""" + role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) + + # Validate enum value + if not self.validate_enum_value(role, 'SymbolRole'): + raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") + + return role + + def get_java_node_symbol_kind(self, node_type: str) -> str: + """ + Map Java tree-sitter node type to internal symbol kind string. + + Args: + node_type: Java tree-sitter node type (e.g., 'method_declaration', 'class_declaration') + + Returns: + Internal symbol kind string for use with map_symbol_kind() + """ + node_kind_map = { + 'method_declaration': 'method', + 'constructor_declaration': 'constructor', + 'class_declaration': 'class', + 'interface_declaration': 'interface', + 'enum_declaration': 'enum', + 'field_declaration': 'field', + 'local_variable_declaration': 'local_variable', + 'formal_parameter': 'parameter', + 'annotation_type_declaration': 'annotation', + 'type_parameter': 'type_parameter', + } + + return node_kind_map.get(node_type, 'variable') + + def get_java_node_syntax_kind(self, node_type: str, context: str = None) -> str: + """ + Map Java tree-sitter node type to internal syntax kind string. + + Args: + node_type: Java tree-sitter node type + context: Additional context for disambiguation + + Returns: + Internal syntax kind string for use with map_syntax_kind() + """ + node_syntax_map = { + 'method_declaration': 'method_declaration', + 'constructor_declaration': 'constructor_declaration', + 'class_declaration': 'class_declaration', + 'interface_declaration': 'interface_declaration', + 'enum_declaration': 'enum_declaration', + 'field_declaration': 'field_declaration', + 'local_variable_declaration': 'variable_declaration', + 'formal_parameter': 'parameter_declaration', + 'annotation_type_declaration': 'annotation_declaration', + 'identifier': 'identifier', + 'string_literal': 'string_literal', + 'decimal_integer_literal': 'numeric_literal', + 'hex_integer_literal': 'numeric_literal', + 'octal_integer_literal': 'numeric_literal', + 'binary_integer_literal': 'numeric_literal', + 'decimal_floating_point_literal': 'numeric_literal', + 'hex_floating_point_literal': 'numeric_literal', + 'true': 'boolean_literal', + 'false': 'boolean_literal', + 'null_literal': 'boolean_literal', + } + + return node_syntax_map.get(node_type, 'identifier') + + def get_java_node_symbol_role(self, node_type: str, context: str = None) -> str: + """ + Map Java tree-sitter node type to internal symbol role string. + + Args: + node_type: Java tree-sitter node type + context: Additional context (e.g., 'in_assignment', 'in_call') + + Returns: + Internal symbol role string for use with map_symbol_role() + """ + if context == 'definition': + return 'definition' + elif context == 'assignment': + return 'write' + elif context == 'import': + return 'import' + elif node_type in ['method_declaration', 'constructor_declaration', 'class_declaration', + 'interface_declaration', 'enum_declaration', 'field_declaration', + 'annotation_type_declaration']: + return 'definition' + else: + return 'reference' + + def is_valid_java_symbol_kind(self, symbol_kind: str) -> bool: + """Check if symbol kind is valid for Java.""" + return symbol_kind in self.SYMBOL_KIND_MAP + + def is_valid_java_syntax_kind(self, syntax_kind: str) -> bool: + """Check if syntax kind is valid for Java.""" + return syntax_kind in self.SYNTAX_KIND_MAP + + def is_valid_java_symbol_role(self, symbol_role: str) -> bool: + """Check if symbol role is valid for Java.""" + return symbol_role in self.SYMBOL_ROLE_MAP + + def get_all_java_symbol_kinds(self) -> list: + """Get all available Java symbol kinds.""" + return list(self.SYMBOL_KIND_MAP.keys()) + + def get_all_java_syntax_kinds(self) -> list: + """Get all available Java syntax kinds.""" + return list(self.SYNTAX_KIND_MAP.keys()) + + def get_all_java_symbol_roles(self) -> list: + """Get all available Java symbol roles.""" + return list(self.SYMBOL_ROLE_MAP.keys()) + + def get_java_type_reference_role(self) -> str: + """Get symbol role for type references (e.g., in generic parameters).""" + return 'type' \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/factory.py b/src/code_index_mcp/scip/framework/java/factory.py new file mode 100644 index 0000000..8883a2d --- /dev/null +++ b/src/code_index_mcp/scip/framework/java/factory.py @@ -0,0 +1,399 @@ +"""Java SCIP Index Factory implementation.""" + +import os +from pathlib import Path +from typing import Set, List, Iterator, Optional +from ..base.index_factory import SCIPIndexFactory +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..base.enum_mapper import BaseEnumMapper +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from ..types import SCIPContext, SCIPSymbolDescriptor +from .relationship_extractor import JavaRelationshipExtractor +from .enum_mapper import JavaEnumMapper +from .tree_sitter_analyzer import JavaTreeSitterAnalyzer +from ...proto import scip_pb2 + +try: + import tree_sitter + from tree_sitter_java import language as java_language + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + + +class JavaSCIPIndexFactory(SCIPIndexFactory): + """Java-specific SCIP Index factory implementation with constructor injection.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """Initialize Java factory with required components via constructor injection.""" + if not TREE_SITTER_AVAILABLE: + raise ImportError("Tree-sitter Java library not available") + + super().__init__(project_root, symbol_generator, relationship_extractor, + enum_mapper, position_calculator) + self.tree_analyzer = JavaTreeSitterAnalyzer() + + def get_language(self) -> str: + """Return language identifier.""" + return "java" + + def get_supported_extensions(self) -> Set[str]: + """Return supported file extensions.""" + return {'.java'} + + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract Java symbol definitions using tree-sitter analysis.""" + try: + tree = self.tree_analyzer.parse(context.content) + + for node in self.tree_analyzer.walk(tree): + if self.tree_analyzer.is_symbol_definition(node): + symbol_info = self._create_symbol_from_tree_node(node, context) + if symbol_info: + yield symbol_info + + except SyntaxError as e: + # Handle syntax errors gracefully + pass + + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract Java symbol occurrences.""" + try: + tree = self.tree_analyzer.parse(context.content) + + for node in self.tree_analyzer.walk(tree): + if (self.tree_analyzer.is_symbol_definition(node) or + self.tree_analyzer.is_symbol_reference(node)): + occurrence = self._create_occurrence_from_tree_node(node, context) + if occurrence: + yield occurrence + + except SyntaxError as e: + # Handle syntax errors gracefully + pass + + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract Java external symbols from imports.""" + external_symbols = [] + + for doc in documents: + try: + content = self._read_file(os.path.join(self.project_root, doc.relative_path)) + tree = self.tree_analyzer.parse(content) + + # Extract import statements + import_statements = self.tree_analyzer.extract_import_statements(tree) + for import_path in import_statements: + external_symbol = self._create_external_symbol_from_import(import_path) + if external_symbol: + external_symbols.append(external_symbol) + + except Exception as e: + # Skip problematic files + continue + + return external_symbols + + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build Java-specific cross-document relationships. + + This implementation provides basic cross-document relationship support + for Java. A more sophisticated implementation would analyze package imports + and class dependencies. + """ + # For now, use a simplified approach + # TODO: Implement proper Java package import analysis + return 0 # Placeholder - no relationships added yet + + def _create_symbol_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: + """Create SCIP symbol information from tree-sitter node.""" + symbol_info = scip_pb2.SymbolInformation() + + symbol_name = self.tree_analyzer.get_symbol_name(node) + if not symbol_name: + return None + + if node.type == 'class_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="class", + scope_path=context.scope_stack, + descriptor_suffix="#" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('class') + + elif node.type == 'interface_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('interface') + + elif node.type == 'enum_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('enum') + + elif node.type == 'method_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('method') + + elif node.type == 'constructor_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('constructor') + + elif node.type == 'field_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('field') + + elif node.type == 'local_variable_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('local_variable') + + elif node.type == 'formal_parameter': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('parameter') + + else: + return None + + return symbol_info + + def _create_occurrence_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence from tree-sitter node.""" + occurrence = scip_pb2.Occurrence() + + # Calculate position using position calculator + try: + position_info = self.position_calculator.calculate_positions_from_tree_node( + context.content, node + ) + + # Set range + occurrence.range.start.extend([position_info.start_line, position_info.start_column]) + occurrence.range.end.extend([position_info.end_line, position_info.end_column]) + + except Exception as e: + # Skip if position calculation fails + return None + + symbol_name = self.tree_analyzer.get_symbol_name(node) + if not symbol_name: + return None + + # Set symbol and roles based on node type + if node.type == 'class_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_declaration') + + elif node.type == 'interface_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('interface_declaration') + + elif node.type == 'method_declaration': + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('method_declaration') + + elif node.type in ['identifier', 'type_identifier']: + # Handle variable references + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') + + else: + return None + + return occurrence + + def _create_external_symbol_from_import(self, import_path: str) -> Optional[scip_pb2.SymbolInformation]: + """Create external symbol from import statement.""" + symbol_info = scip_pb2.SymbolInformation() + + # Determine if it's a standard library or external dependency + if import_path.startswith('java.') or import_path.startswith('javax.'): + symbol_info.symbol = f"java-stdlib {import_path}" + symbol_info.display_name = import_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('package') + symbol_info.documentation.append(f"Java standard library: {import_path}") + else: + symbol_info.symbol = f"java-external {import_path}" + symbol_info.display_name = import_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('package') + symbol_info.documentation.append(f"External Java package: {import_path}") + + return symbol_info + + +def create_java_scip_factory(project_root: str) -> JavaSCIPIndexFactory: + """ + Factory creator for Java SCIP factory. + Ensures all required components are properly assembled via constructor injection. + """ + if not TREE_SITTER_AVAILABLE: + raise ImportError("Tree-sitter Java library not available") + + symbol_generator = SCIPSymbolGenerator( + scheme="scip-java", + package_manager="maven", + package_name=Path(project_root).name, + version="HEAD" + ) + + relationship_extractor = JavaRelationshipExtractor() + enum_mapper = JavaEnumMapper() + position_calculator = SCIPPositionCalculator() + + return JavaSCIPIndexFactory( + project_root=project_root, + symbol_generator=symbol_generator, + relationship_extractor=relationship_extractor, # Guaranteed to be provided + enum_mapper=enum_mapper, + position_calculator=position_calculator + ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/relationship_extractor.py b/src/code_index_mcp/scip/framework/java/relationship_extractor.py new file mode 100644 index 0000000..092b3ea --- /dev/null +++ b/src/code_index_mcp/scip/framework/java/relationship_extractor.py @@ -0,0 +1,295 @@ +"""Java relationship extractor implementation.""" + +from typing import Iterator, Optional, List +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..types import SCIPContext, Relationship +from ...core.relationship_types import InternalRelationshipType + +try: + import tree_sitter + from tree_sitter_java import language as java_language + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + + +class JavaRelationshipExtractor(BaseRelationshipExtractor): + """Java-specific relationship extractor using tree-sitter analysis.""" + + def __init__(self): + """Initialize the Java relationship extractor.""" + if not TREE_SITTER_AVAILABLE: + raise ImportError("Tree-sitter Java library not available") + + java_lang = tree_sitter.Language(java_language()) + self.parser = tree_sitter.Parser(java_lang) + + def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract inheritance relationships from Java classes.""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type == 'class_declaration': + class_name = self._get_class_name(node) + if not class_name: + continue + + class_symbol_id = self._create_class_symbol_id(class_name, context) + + # Look for extends clause + extends_node = self._find_child_by_type(node, 'superclass') + if extends_node: + parent_type = self._find_child_by_type(extends_node, 'type_identifier') + if parent_type: + parent_name = self._get_node_text(parent_type, context.content) + parent_symbol_id = self._create_class_symbol_id(parent_name, context) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract method call relationships.""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type == 'method_declaration': + method_name = self._get_method_name(node) + if not method_name: + continue + + method_symbol_id = self._create_method_symbol_id(method_name, context) + + # Find method invocations within this method + for call_node in self._walk_tree(node): + if call_node.type == 'method_invocation': + target_method = self._get_invocation_target(call_node, context.content) + if target_method and target_method != method_name: + target_symbol_id = self._create_method_symbol_id(target_method, context) + yield Relationship( + source_symbol=method_symbol_id, + target_symbol=target_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract import/dependency relationships.""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + file_symbol_id = self._create_file_symbol_id(context.file_path) + + for node in self._walk_tree(tree.root_node): + if node.type == 'import_declaration': + import_path = self._get_import_path(node, context.content) + if import_path: + # Determine if it's a standard library or external dependency + if import_path.startswith('java.') or import_path.startswith('javax.'): + module_symbol_id = f"java-stdlib {import_path}" + else: + module_symbol_id = f"java-external {import_path}" + + yield Relationship( + source_symbol=file_symbol_id, + target_symbol=module_symbol_id, + relationship_type=InternalRelationshipType.IMPORTS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract composition relationships (class fields).""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type == 'class_declaration': + class_name = self._get_class_name(node) + if not class_name: + continue + + class_symbol_id = self._create_class_symbol_id(class_name, context) + + # Find field declarations in this class + for field_node in self._walk_tree(node): + if field_node.type == 'field_declaration': + field_name = self._get_field_name(field_node, context.content) + if field_name: + field_symbol_id = self._create_field_symbol_id(field_name, class_symbol_id) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=field_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract interface implementation relationships.""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type == 'class_declaration': + class_name = self._get_class_name(node) + if not class_name: + continue + + class_symbol_id = self._create_class_symbol_id(class_name, context) + + # Look for implements clause + implements_node = self._find_child_by_type(node, 'super_interfaces') + if implements_node: + for interface_node in self._find_children_by_type(implements_node, 'type_identifier'): + interface_name = self._get_node_text(interface_node, context.content) + interface_symbol_id = self._create_interface_symbol_id(interface_name, context) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=interface_symbol_id, + relationship_type=InternalRelationshipType.IMPLEMENTS + ) + + elif node.type == 'interface_declaration': + interface_name = self._get_interface_name(node, context.content) + if not interface_name: + continue + + interface_symbol_id = self._create_interface_symbol_id(interface_name, context) + + # Look for extends clause in interface + extends_node = self._find_child_by_type(node, 'extends_interfaces') + if extends_node: + for parent_interface_node in self._find_children_by_type(extends_node, 'type_identifier'): + parent_interface_name = self._get_node_text(parent_interface_node, context.content) + parent_symbol_id = self._create_interface_symbol_id(parent_interface_name, context) + yield Relationship( + source_symbol=interface_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + except Exception: + # Skip files with parsing errors + return + + def _walk_tree(self, node) -> Iterator: + """Walk tree-sitter tree nodes.""" + yield node + for child in node.children: + yield from self._walk_tree(child) + + def _find_child_by_type(self, node, node_type: str): + """Find first child node of specified type.""" + for child in node.children: + if child.type == node_type: + return child + return None + + def _find_children_by_type(self, node, node_type: str) -> List: + """Find all child nodes of specified type.""" + children = [] + for child in node.children: + if child.type == node_type: + children.append(child) + return children + + def _get_node_text(self, node, content: str) -> str: + """Get text content of a tree-sitter node.""" + return content[node.start_byte:node.end_byte] + + def _get_class_name(self, class_node) -> Optional[str]: + """Extract class name from class declaration node.""" + identifier_node = self._find_child_by_type(class_node, 'identifier') + if identifier_node: + return identifier_node.text.decode('utf8') + return None + + def _get_method_name(self, method_node) -> Optional[str]: + """Extract method name from method declaration node.""" + identifier_node = self._find_child_by_type(method_node, 'identifier') + if identifier_node: + return identifier_node.text.decode('utf8') + return None + + def _get_interface_name(self, interface_node, content: str) -> Optional[str]: + """Extract interface name from interface declaration node.""" + identifier_node = self._find_child_by_type(interface_node, 'identifier') + if identifier_node: + return self._get_node_text(identifier_node, content) + return None + + def _get_field_name(self, field_node, content: str) -> Optional[str]: + """Extract field name from field declaration node.""" + # Field declarations can have multiple declarators + declarator = self._find_child_by_type(field_node, 'variable_declarator') + if declarator: + identifier = self._find_child_by_type(declarator, 'identifier') + if identifier: + return self._get_node_text(identifier, content) + return None + + def _get_import_path(self, import_node, content: str) -> Optional[str]: + """Extract import path from import declaration.""" + # Look for scoped_identifier or identifier in import + for child in import_node.children: + if child.type in ['scoped_identifier', 'identifier']: + return self._get_node_text(child, content) + return None + + def _get_invocation_target(self, invocation_node, content: str) -> Optional[str]: + """Extract target method name from method invocation.""" + identifier_node = self._find_child_by_type(invocation_node, 'identifier') + if identifier_node: + return self._get_node_text(identifier_node, content) + + # Handle method calls like object.method() + field_access = self._find_child_by_type(invocation_node, 'field_access') + if field_access: + identifier = self._find_child_by_type(field_access, 'identifier') + if identifier: + return self._get_node_text(identifier, content) + + return None + + def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: + """Create symbol ID for class.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{class_name}" if scope_path else class_name + return f"local {local_id}#" + + def _create_method_symbol_id(self, method_name: str, context: SCIPContext) -> str: + """Create symbol ID for method.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{method_name}" if scope_path else method_name + return f"local {local_id}()." + + def _create_interface_symbol_id(self, interface_name: str, context: SCIPContext) -> str: + """Create symbol ID for interface.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{interface_name}" if scope_path else interface_name + return f"local {local_id}#" + + def _create_field_symbol_id(self, field_name: str, class_symbol_id: str) -> str: + """Create symbol ID for field.""" + # Extract class name from class symbol ID + class_name = class_symbol_id.replace("local ", "").replace("#", "") + return f"local {class_name}.{field_name}" + + def _create_file_symbol_id(self, file_path: str) -> str: + """Create symbol ID for file.""" + return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py b/src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py new file mode 100644 index 0000000..0f0c841 --- /dev/null +++ b/src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py @@ -0,0 +1,327 @@ +"""Java tree-sitter analyzer implementation.""" + +from typing import Iterator, Optional, Set, List, Dict, Any +from ..types import SCIPContext +from ..base.language_analyzer import BaseLanguageAnalyzer + +try: + import tree_sitter + from tree_sitter_java import language as java_language + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + + +class JavaTreeSitterAnalyzer(BaseLanguageAnalyzer): + """Java analyzer using tree-sitter for AST parsing.""" + + def __init__(self): + """Initialize the Java tree-sitter analyzer.""" + if not TREE_SITTER_AVAILABLE: + raise ImportError("Tree-sitter Java library not available") + + java_lang = tree_sitter.Language(java_language()) + self.parser = tree_sitter.Parser(java_lang) + self._processed_nodes: Set[int] = set() + + def parse(self, content: str, filename: str = ""): + """Parse Java source code into tree-sitter AST.""" + try: + return self.parser.parse(bytes(content, 'utf8')) + except Exception as e: + raise SyntaxError(f"Java syntax error in {filename}: {e}") + + def walk(self, tree) -> Iterator: + """Walk tree-sitter tree nodes, avoiding duplicates.""" + for node in self._walk_node(tree.root_node): + node_id = id(node) + if node_id not in self._processed_nodes: + self._processed_nodes.add(node_id) + yield node + + def _walk_node(self, node) -> Iterator: + """Recursively walk tree nodes.""" + yield node + for child in node.children: + yield from self._walk_node(child) + + def is_symbol_definition(self, node) -> bool: + """Check if tree-sitter node represents a symbol definition.""" + return node.type in { + 'class_declaration', + 'interface_declaration', + 'enum_declaration', + 'method_declaration', + 'constructor_declaration', + 'field_declaration', + 'local_variable_declaration', + 'formal_parameter', + 'annotation_type_declaration', + } + + def is_symbol_reference(self, node) -> bool: + """Check if tree-sitter node represents a symbol reference.""" + return node.type in { + 'identifier', + 'type_identifier', + 'method_invocation', + 'field_access', + } + + def get_symbol_name(self, node) -> Optional[str]: + """Extract symbol name from tree-sitter node.""" + if node.type in ['class_declaration', 'interface_declaration', 'enum_declaration', + 'method_declaration', 'constructor_declaration', 'annotation_type_declaration']: + identifier_node = self._find_child_by_type(node, 'identifier') + if identifier_node: + return identifier_node.text.decode('utf8') + + elif node.type == 'field_declaration': + # Field declarations can have multiple declarators + declarator = self._find_child_by_type(node, 'variable_declarator') + if declarator: + identifier = self._find_child_by_type(declarator, 'identifier') + if identifier: + return identifier.text.decode('utf8') + + elif node.type == 'local_variable_declaration': + declarator = self._find_child_by_type(node, 'variable_declarator') + if declarator: + identifier = self._find_child_by_type(declarator, 'identifier') + if identifier: + return identifier.text.decode('utf8') + + elif node.type == 'formal_parameter': + identifier = self._find_child_by_type(node, 'identifier') + if identifier: + return identifier.text.decode('utf8') + + elif node.type in ['identifier', 'type_identifier']: + return node.text.decode('utf8') + + return None + + def get_node_position(self, node) -> tuple: + """Get position information from tree-sitter node.""" + start_line = node.start_point[0] + start_col = node.start_point[1] + end_line = node.end_point[0] + end_col = node.end_point[1] + + return (start_line, start_col, end_line, end_col) + + def extract_class_info(self, tree) -> List[Dict[str, Any]]: + """Extract class information from the AST.""" + classes = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'class_declaration': + class_info = { + 'name': self.get_symbol_name(node), + 'type': 'class', + 'position': self.get_node_position(node), + 'modifiers': self._extract_modifiers(node), + 'superclass': self._extract_superclass(node), + 'interfaces': self._extract_implemented_interfaces(node), + 'methods': self._extract_class_methods(node), + 'fields': self._extract_class_fields(node), + } + classes.append(class_info) + + return classes + + def extract_interface_info(self, tree) -> List[Dict[str, Any]]: + """Extract interface information from the AST.""" + interfaces = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'interface_declaration': + interface_info = { + 'name': self.get_symbol_name(node), + 'type': 'interface', + 'position': self.get_node_position(node), + 'modifiers': self._extract_modifiers(node), + 'extends': self._extract_extended_interfaces(node), + 'methods': self._extract_interface_methods(node), + } + interfaces.append(interface_info) + + return interfaces + + def extract_method_info(self, tree) -> List[Dict[str, Any]]: + """Extract method information from the AST.""" + methods = [] + + for node in self._walk_node(tree.root_node): + if node.type in ['method_declaration', 'constructor_declaration']: + method_info = { + 'name': self.get_symbol_name(node), + 'type': 'constructor' if node.type == 'constructor_declaration' else 'method', + 'position': self.get_node_position(node), + 'modifiers': self._extract_modifiers(node), + 'return_type': self._extract_return_type(node), + 'parameters': self._extract_method_parameters(node), + 'throws': self._extract_throws_clause(node), + } + methods.append(method_info) + + return methods + + def extract_import_statements(self, tree) -> List[str]: + """Extract import statements from the AST.""" + imports = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'import_declaration': + import_path = self._extract_import_path(node) + if import_path: + imports.append(import_path) + + return imports + + def extract_package_declaration(self, tree) -> Optional[str]: + """Extract package declaration from the AST.""" + for node in self._walk_node(tree.root_node): + if node.type == 'package_declaration': + return self._extract_package_name(node) + return None + + def _find_child_by_type(self, node, node_type: str): + """Find first child node of specified type.""" + for child in node.children: + if child.type == node_type: + return child + return None + + def _find_children_by_type(self, node, node_type: str) -> List: + """Find all child nodes of specified type.""" + children = [] + for child in node.children: + if child.type == node_type: + children.append(child) + return children + + def _extract_modifiers(self, node) -> List[str]: + """Extract modifiers from a declaration node.""" + modifiers = [] + modifiers_node = self._find_child_by_type(node, 'modifiers') + if modifiers_node: + for child in modifiers_node.children: + if child.type in ['public', 'private', 'protected', 'static', 'final', + 'abstract', 'synchronized', 'volatile', 'transient', 'native']: + modifiers.append(child.type) + return modifiers + + def _extract_superclass(self, class_node) -> Optional[str]: + """Extract superclass name from class declaration.""" + superclass_node = self._find_child_by_type(class_node, 'superclass') + if superclass_node: + type_node = self._find_child_by_type(superclass_node, 'type_identifier') + if type_node: + return type_node.text.decode('utf8') + return None + + def _extract_implemented_interfaces(self, class_node) -> List[str]: + """Extract implemented interface names from class declaration.""" + interfaces = [] + interfaces_node = self._find_child_by_type(class_node, 'super_interfaces') + if interfaces_node: + for interface_node in self._find_children_by_type(interfaces_node, 'type_identifier'): + interfaces.append(interface_node.text.decode('utf8')) + return interfaces + + def _extract_extended_interfaces(self, interface_node) -> List[str]: + """Extract extended interface names from interface declaration.""" + interfaces = [] + extends_node = self._find_child_by_type(interface_node, 'extends_interfaces') + if extends_node: + for interface_node in self._find_children_by_type(extends_node, 'type_identifier'): + interfaces.append(interface_node.text.decode('utf8')) + return interfaces + + def _extract_class_methods(self, class_node) -> List[str]: + """Extract method names from class declaration.""" + methods = [] + for child in class_node.children: + if child.type in ['method_declaration', 'constructor_declaration']: + method_name = self.get_symbol_name(child) + if method_name: + methods.append(method_name) + return methods + + def _extract_class_fields(self, class_node) -> List[str]: + """Extract field names from class declaration.""" + fields = [] + for child in class_node.children: + if child.type == 'field_declaration': + field_name = self.get_symbol_name(child) + if field_name: + fields.append(field_name) + return fields + + def _extract_interface_methods(self, interface_node) -> List[str]: + """Extract method names from interface declaration.""" + methods = [] + for child in interface_node.children: + if child.type == 'method_declaration': + method_name = self.get_symbol_name(child) + if method_name: + methods.append(method_name) + return methods + + def _extract_return_type(self, method_node) -> Optional[str]: + """Extract return type from method declaration.""" + # Constructor declarations don't have return types + if method_node.type == 'constructor_declaration': + return None + + # Look for various return type patterns + for child in method_node.children: + if child.type in ['type_identifier', 'primitive_type', 'array_type', 'generic_type']: + return child.text.decode('utf8') + return None + + def _extract_method_parameters(self, method_node) -> List[Dict[str, str]]: + """Extract parameter information from method declaration.""" + parameters = [] + formal_params_node = self._find_child_by_type(method_node, 'formal_parameters') + if formal_params_node: + for param_node in self._find_children_by_type(formal_params_node, 'formal_parameter'): + param_name = self.get_symbol_name(param_node) + param_type = self._extract_parameter_type(param_node) + if param_name: + parameters.append({ + 'name': param_name, + 'type': param_type or 'unknown' + }) + return parameters + + def _extract_parameter_type(self, param_node) -> Optional[str]: + """Extract parameter type from formal parameter node.""" + for child in param_node.children: + if child.type in ['type_identifier', 'primitive_type', 'array_type', 'generic_type']: + return child.text.decode('utf8') + return None + + def _extract_throws_clause(self, method_node) -> List[str]: + """Extract throws clause from method declaration.""" + throws = [] + throws_node = self._find_child_by_type(method_node, 'throws') + if throws_node: + for exception_node in self._find_children_by_type(throws_node, 'type_identifier'): + throws.append(exception_node.text.decode('utf8')) + return throws + + def _extract_import_path(self, import_node) -> Optional[str]: + """Extract import path from import declaration.""" + for child in import_node.children: + if child.type in ['scoped_identifier', 'identifier']: + return child.text.decode('utf8') + return None + + def _extract_package_name(self, package_node) -> Optional[str]: + """Extract package name from package declaration.""" + for child in package_node.children: + if child.type in ['scoped_identifier', 'identifier']: + return child.text.decode('utf8') + return None \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/__init__.py b/src/code_index_mcp/scip/framework/javascript/__init__.py new file mode 100644 index 0000000..f15ddd6 --- /dev/null +++ b/src/code_index_mcp/scip/framework/javascript/__init__.py @@ -0,0 +1,14 @@ +"""JavaScript/TypeScript-specific SCIP framework components.""" + +from .factory import JavaScriptSCIPIndexFactory, create_javascript_scip_factory +from .relationship_extractor import JavaScriptRelationshipExtractor +from .enum_mapper import JavaScriptEnumMapper +from .syntax_analyzer import JavaScriptSyntaxAnalyzer + +__all__ = [ + 'JavaScriptSCIPIndexFactory', + 'create_javascript_scip_factory', + 'JavaScriptRelationshipExtractor', + 'JavaScriptEnumMapper', + 'JavaScriptSyntaxAnalyzer', +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/enum_mapper.py b/src/code_index_mcp/scip/framework/javascript/enum_mapper.py new file mode 100644 index 0000000..e5f03ab --- /dev/null +++ b/src/code_index_mcp/scip/framework/javascript/enum_mapper.py @@ -0,0 +1,237 @@ +"""JavaScript enum mapper implementation.""" + +from typing import Dict +from ..base.enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 + + +class JavaScriptEnumMapper(BaseEnumMapper): + """JavaScript/TypeScript-specific enum mapper for SCIP compliance.""" + + # JavaScript symbol kind mappings + SYMBOL_KIND_MAP = { + 'function': scip_pb2.Function, + 'arrow_function': scip_pb2.Function, + 'method': scip_pb2.Method, + 'class': scip_pb2.Class, + 'variable': scip_pb2.Variable, + 'constant': scip_pb2.Constant, + 'module': scip_pb2.Module, + 'parameter': scip_pb2.Parameter, + 'property': scip_pb2.Property, + 'constructor': scip_pb2.Constructor, + 'field': scip_pb2.Field, + 'namespace': scip_pb2.Namespace, + 'interface': scip_pb2.Interface, + 'type': scip_pb2.Type, + 'object': scip_pb2.Object, + 'enum': scip_pb2.Enum, + } + + # JavaScript syntax kind mappings + SYNTAX_KIND_MAP = { + 'function_definition': scip_pb2.IdentifierFunctionDefinition, + 'class_definition': scip_pb2.IdentifierType, + 'variable_definition': scip_pb2.IdentifierLocal, + 'parameter_definition': scip_pb2.IdentifierParameter, + 'property_definition': scip_pb2.IdentifierAttribute, + 'method_definition': scip_pb2.IdentifierFunctionDefinition, + 'interface_definition': scip_pb2.IdentifierType, + 'type_definition': scip_pb2.IdentifierType, + 'identifier': scip_pb2.Identifier, + 'keyword': scip_pb2.IdentifierKeyword, + 'string_literal': scip_pb2.StringLiteral, + 'numeric_literal': scip_pb2.NumericLiteral, + 'boolean_literal': scip_pb2.BooleanLiteral, + 'regex_literal': scip_pb2.RegexEscape, + 'comment': scip_pb2.Comment, + 'punctuation': scip_pb2.PunctuationDelimiter, + 'operator': scip_pb2.PunctuationDelimiter, + } + + # JavaScript symbol role mappings (official SCIP naming) + SYMBOL_ROLE_MAP = { + 'definition': scip_pb2.Definition, + 'import': scip_pb2.Import, + 'write': scip_pb2.Write, # Official SCIP naming + 'read': scip_pb2.Read, # Official SCIP naming + 'generated': scip_pb2.Generated, + 'test': scip_pb2.Test, + 'type': scip_pb2.Type, # Add missing Type role + 'reference': scip_pb2.Read, # Default reference is read access + 'export': scip_pb2.Definition, # Exports are definitions + } + + def map_symbol_kind(self, language_kind: str) -> int: + """Map JavaScript symbol type to SCIP SymbolKind.""" + kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SymbolKind'): + raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") + + return kind + + def map_syntax_kind(self, language_syntax: str) -> int: + """Map JavaScript syntax element to SCIP SyntaxKind.""" + kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SyntaxKind'): + raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") + + return kind + + def map_symbol_role(self, language_role: str) -> int: + """Map JavaScript symbol role to SCIP SymbolRole.""" + role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) + + # Validate enum value + if not self.validate_enum_value(role, 'SymbolRole'): + raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") + + return role + + def get_javascript_pattern_symbol_kind(self, pattern_type: str) -> str: + """ + Map JavaScript pattern type to internal symbol kind string. + + Args: + pattern_type: Pattern type from regex matches (e.g., 'function', 'class') + + Returns: + Internal symbol kind string for use with map_symbol_kind() + """ + pattern_kind_map = { + 'function': 'function', + 'arrow_function': 'arrow_function', + 'class': 'class', + 'const': 'constant', + 'let': 'variable', + 'var': 'variable', + 'method': 'method', + 'object_method': 'function', + 'constructor': 'constructor', + 'interface': 'interface', + 'type': 'type', + 'enum': 'enum', + 'namespace': 'namespace', + } + + return pattern_kind_map.get(pattern_type, 'variable') + + def get_javascript_pattern_syntax_kind(self, pattern_type: str, context: str = None) -> str: + """ + Map JavaScript pattern type to internal syntax kind string. + + Args: + pattern_type: Pattern type from regex matches + context: Additional context for disambiguation + + Returns: + Internal syntax kind string for use with map_syntax_kind() + """ + pattern_syntax_map = { + 'function': 'function_definition', + 'arrow_function': 'function_definition', + 'class': 'class_definition', + 'const': 'variable_definition', + 'let': 'variable_definition', + 'var': 'variable_definition', + 'method': 'method_definition', + 'object_method': 'function_definition', + 'interface': 'interface_definition', + 'type': 'type_definition', + 'identifier': 'identifier', + 'string': 'string_literal', + 'number': 'numeric_literal', + 'boolean': 'boolean_literal', + 'regex': 'regex_literal', + } + + return pattern_syntax_map.get(pattern_type, 'identifier') + + def get_javascript_pattern_symbol_role(self, pattern_type: str, context: str = None) -> str: + """ + Map JavaScript pattern type to internal symbol role string. + + Args: + pattern_type: Pattern type from regex matches + context: Additional context (e.g., 'in_assignment', 'in_call') + + Returns: + Internal symbol role string for use with map_symbol_role() + """ + if context == 'definition': + return 'definition' + elif context == 'assignment': + return 'write' + elif context == 'import': + return 'import' + elif context == 'export': + return 'export' + elif pattern_type in ['function', 'arrow_function', 'class', 'method', 'object_method', + 'const', 'let', 'var', 'interface', 'type']: + return 'definition' + else: + return 'reference' + + def get_typescript_specific_kinds(self) -> Dict[str, str]: + """Get TypeScript-specific symbol kinds.""" + return { + 'interface': 'interface', + 'type_alias': 'type', + 'enum': 'enum', + 'namespace': 'namespace', + 'generic_type': 'type', + 'union_type': 'type', + 'intersection_type': 'type', + } + + def get_javascript_type_reference_role(self) -> str: + """Get symbol role for type references (e.g., in TypeScript annotations).""" + return 'type' + + def is_valid_javascript_symbol_kind(self, symbol_kind: str) -> bool: + """Check if symbol kind is valid for JavaScript.""" + return symbol_kind in self.SYMBOL_KIND_MAP + + def is_valid_javascript_syntax_kind(self, syntax_kind: str) -> bool: + """Check if syntax kind is valid for JavaScript.""" + return syntax_kind in self.SYNTAX_KIND_MAP + + def is_valid_javascript_symbol_role(self, symbol_role: str) -> bool: + """Check if symbol role is valid for JavaScript.""" + return symbol_role in self.SYMBOL_ROLE_MAP + + def get_all_javascript_symbol_kinds(self) -> list: + """Get all available JavaScript symbol kinds.""" + return list(self.SYMBOL_KIND_MAP.keys()) + + def get_all_javascript_syntax_kinds(self) -> list: + """Get all available JavaScript syntax kinds.""" + return list(self.SYNTAX_KIND_MAP.keys()) + + def get_all_javascript_symbol_roles(self) -> list: + """Get all available JavaScript symbol roles.""" + return list(self.SYMBOL_ROLE_MAP.keys()) + + def supports_typescript(self) -> bool: + """Check if TypeScript features are supported.""" + return True + + def get_es6_feature_kinds(self) -> Dict[str, str]: + """Get ES6+ specific feature mappings.""" + return { + 'arrow_function': 'function', + 'class': 'class', + 'const': 'constant', + 'let': 'variable', + 'destructuring': 'variable', + 'spread_operator': 'operator', + 'template_literal': 'string_literal', + 'async_function': 'function', + 'generator_function': 'function', + 'module_export': 'module', + 'module_import': 'module', + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/factory.py b/src/code_index_mcp/scip/framework/javascript/factory.py new file mode 100644 index 0000000..a08d8d9 --- /dev/null +++ b/src/code_index_mcp/scip/framework/javascript/factory.py @@ -0,0 +1,376 @@ +"""JavaScript/TypeScript SCIP Index Factory implementation.""" + +import re +import os +from pathlib import Path +from typing import Set, List, Iterator, Optional, Dict, Any +from ..base.index_factory import SCIPIndexFactory +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..base.enum_mapper import BaseEnumMapper +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from ..types import SCIPContext, SCIPSymbolDescriptor +from .relationship_extractor import JavaScriptRelationshipExtractor +from .enum_mapper import JavaScriptEnumMapper +from .syntax_analyzer import JavaScriptSyntaxAnalyzer +from ...proto import scip_pb2 + + +class JavaScriptSCIPIndexFactory(SCIPIndexFactory): + """JavaScript/TypeScript-specific SCIP Index factory implementation with constructor injection.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """Initialize JavaScript factory with required components via constructor injection.""" + super().__init__(project_root, symbol_generator, relationship_extractor, + enum_mapper, position_calculator) + self.syntax_analyzer = JavaScriptSyntaxAnalyzer() + + def get_language(self) -> str: + """Return language identifier.""" + return "javascript" + + def get_supported_extensions(self) -> Set[str]: + """Return supported file extensions.""" + return {'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'} + + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract JavaScript symbol definitions using regex-based analysis.""" + try: + patterns = self.syntax_analyzer.get_symbol_patterns() + + for pattern_type, pattern in patterns.items(): + for match in re.finditer(pattern, context.content, re.MULTILINE): + symbol_info = self._create_symbol_from_match(match, pattern_type, context) + if symbol_info: + yield symbol_info + + except Exception as e: + # Handle parsing errors gracefully + pass + + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract JavaScript symbol occurrences.""" + try: + patterns = self.syntax_analyzer.get_occurrence_patterns() + + for pattern_type, pattern in patterns.items(): + for match in re.finditer(pattern, context.content, re.MULTILINE): + occurrence = self._create_occurrence_from_match(match, pattern_type, context) + if occurrence: + yield occurrence + + except Exception as e: + # Handle parsing errors gracefully + pass + + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract JavaScript external symbols from imports.""" + external_symbols = [] + + for doc in documents: + try: + content = self._read_file(os.path.join(self.project_root, doc.relative_path)) + import_patterns = self.syntax_analyzer.get_import_patterns() + + for pattern_type, pattern in import_patterns.items(): + for match in re.finditer(pattern, content, re.MULTILINE): + external_symbol = self._create_external_symbol_from_import_match(match, pattern_type) + if external_symbol: + external_symbols.append(external_symbol) + + except Exception as e: + # Skip problematic files + continue + + return external_symbols + + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build JavaScript-specific cross-document relationships. + + This implementation provides basic cross-document relationship support + for JavaScript/TypeScript. A more sophisticated implementation would + analyze ES6 imports and require statements. + """ + # For now, use a simplified approach + # TODO: Implement proper JavaScript import/export analysis + return 0 # Placeholder - no relationships added yet + + def _create_symbol_from_match(self, match: re.Match, pattern_type: str, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: + """Create SCIP symbol information from regex match.""" + symbol_info = scip_pb2.SymbolInformation() + + if pattern_type == 'function': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = name + symbol_info.kind = self.enum_mapper.map_symbol_kind('function') + + elif pattern_type == 'arrow_function': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = name + symbol_info.kind = self.enum_mapper.map_symbol_kind('function') + + elif pattern_type == 'class': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = name + symbol_info.kind = self.enum_mapper.map_symbol_kind('class') + + elif pattern_type == 'const': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = name + symbol_info.kind = self.enum_mapper.map_symbol_kind('constant') + + elif pattern_type == 'method': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = name + symbol_info.kind = self.enum_mapper.map_symbol_kind('method') + + elif pattern_type == 'object_method': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = name + symbol_info.kind = self.enum_mapper.map_symbol_kind('function') + + else: + return None + + return symbol_info + + def _create_occurrence_from_match(self, match: re.Match, pattern_type: str, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence from regex match.""" + occurrence = scip_pb2.Occurrence() + + # Calculate position using position calculator + try: + start_pos = match.start() + end_pos = match.end() + + position_info = self.position_calculator.calculate_positions_from_offset( + context.content, start_pos, end_pos + ) + + # Set range + occurrence.range.start.extend([position_info.start_line, position_info.start_column]) + occurrence.range.end.extend([position_info.end_line, position_info.end_column]) + + except Exception as e: + # Skip if position calculation fails + return None + + # Set symbol and roles based on pattern type + if pattern_type in ['function', 'arrow_function', 'method', 'object_method']: + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function_definition') + + elif pattern_type == 'class': + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_definition') + + elif pattern_type in ['const', 'let', 'var']: + name = match.group(1) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('variable_definition') + + elif pattern_type == 'identifier': + name = match.group(0) + descriptor = SCIPSymbolDescriptor( + + name=name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') + + else: + return None + + return occurrence + + def _create_external_symbol_from_import_match(self, match: re.Match, pattern_type: str) -> Optional[scip_pb2.SymbolInformation]: + """Create external symbol from import statement match.""" + symbol_info = scip_pb2.SymbolInformation() + + if pattern_type == 'es6_import': + # import { name } from 'module' + module_name = match.group(2) if match.lastindex >= 2 else match.group(1) + symbol_info.symbol = f"npm {module_name}" + symbol_info.display_name = module_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"ES6 imported module: {module_name}") + return symbol_info + + elif pattern_type == 'require': + # const name = require('module') + module_name = match.group(2) if match.lastindex >= 2 else match.group(1) + symbol_info.symbol = f"npm {module_name}" + symbol_info.display_name = module_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"CommonJS required module: {module_name}") + return symbol_info + + elif pattern_type == 'dynamic_import': + # import('module') + module_name = match.group(1) + symbol_info.symbol = f"npm {module_name}" + symbol_info.display_name = module_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"Dynamic imported module: {module_name}") + return symbol_info + + return None + + +def create_javascript_scip_factory(project_root: str) -> JavaScriptSCIPIndexFactory: + """ + Factory creator for JavaScript SCIP factory. + Ensures all required components are properly assembled via constructor injection. + """ + symbol_generator = SCIPSymbolGenerator( + scheme="scip-javascript", + package_manager="npm", + package_name=Path(project_root).name, + version="HEAD" + ) + + relationship_extractor = JavaScriptRelationshipExtractor() + enum_mapper = JavaScriptEnumMapper() + position_calculator = SCIPPositionCalculator() + + return JavaScriptSCIPIndexFactory( + project_root=project_root, + symbol_generator=symbol_generator, + relationship_extractor=relationship_extractor, # Guaranteed to be provided + enum_mapper=enum_mapper, + position_calculator=position_calculator + ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/relationship_extractor.py b/src/code_index_mcp/scip/framework/javascript/relationship_extractor.py new file mode 100644 index 0000000..7b25afe --- /dev/null +++ b/src/code_index_mcp/scip/framework/javascript/relationship_extractor.py @@ -0,0 +1,281 @@ +"""JavaScript relationship extractor implementation.""" + +import re +from typing import Iterator, Dict, List +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..types import SCIPContext, Relationship +from ...core.relationship_types import InternalRelationshipType + + +class JavaScriptRelationshipExtractor(BaseRelationshipExtractor): + """JavaScript-specific relationship extractor using regex-based analysis.""" + + def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract inheritance relationships from JavaScript classes.""" + try: + # ES6 class inheritance: class Child extends Parent + class_extends_pattern = r'class\s+(\w+)\s+extends\s+(\w+)' + + for match in re.finditer(class_extends_pattern, context.content, re.MULTILINE): + child_class = match.group(1) + parent_class = match.group(2) + + child_symbol_id = self._create_class_symbol_id(child_class, context) + parent_symbol_id = self._create_class_symbol_id(parent_class, context) + + yield Relationship( + source_symbol=child_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + # Prototype inheritance: Object.setPrototypeOf or Object.create + prototype_pattern = r'Object\.setPrototypeOf\s*\(\s*(\w+)\.prototype\s*,\s*(\w+)\.prototype\s*\)' + + for match in re.finditer(prototype_pattern, context.content, re.MULTILINE): + child_obj = match.group(1) + parent_obj = match.group(2) + + child_symbol_id = self._create_function_symbol_id(child_obj, context) + parent_symbol_id = self._create_function_symbol_id(parent_obj, context) + + yield Relationship( + source_symbol=child_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract function/method call relationships.""" + try: + # Function calls: functionName() or object.method() + function_call_patterns = [ + r'(\w+)\s*\(', # Direct function calls + r'(\w+)\.(\w+)\s*\(', # Method calls + r'this\.(\w+)\s*\(', # Method calls on this + r'super\.(\w+)\s*\(', # Super method calls + ] + + # Find all function definitions first + function_defs = self._extract_function_definitions(context.content) + + for func_name in function_defs: + func_symbol_id = self._create_function_symbol_id(func_name, context) + + # Look for calls within this function + func_body = self._extract_function_body(context.content, func_name) + if func_body: + for pattern in function_call_patterns: + for match in re.finditer(pattern, func_body, re.MULTILINE): + if pattern == r'(\w+)\.(\w+)\s*\(': + # Method call + target_function = match.group(2) + elif pattern == r'this\.(\w+)\s*\(' or pattern == r'super\.(\w+)\s*\(': + target_function = match.group(1) + else: + # Direct function call + target_function = match.group(1) + + if target_function and target_function != func_name: + target_symbol_id = self._create_function_symbol_id(target_function, context) + yield Relationship( + source_symbol=func_symbol_id, + target_symbol=target_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract import/dependency relationships.""" + try: + import_patterns = { + 'es6_import': r'import\s+(?:\{[^}]+\}\s+from\s+)?[\'"]([^\'"]+)[\'"]', + 'require': r'require\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', + 'dynamic_import': r'import\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', + 'export_from': r'export\s+(?:\{[^}]+\}\s+)?from\s+[\'"]([^\'"]+)[\'"]' + } + + file_symbol_id = self._create_file_symbol_id(context.file_path) + + for pattern_type, pattern in import_patterns.items(): + for match in re.finditer(pattern, context.content, re.MULTILINE): + module_name = match.group(1) + + # Determine if it's a local or external module + if module_name.startswith('.'): + # Local module + module_symbol_id = f"local {module_name}" + else: + # External module (npm package) + module_symbol_id = f"npm {module_name}" + + yield Relationship( + source_symbol=file_symbol_id, + target_symbol=module_symbol_id, + relationship_type=InternalRelationshipType.IMPORTS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract composition relationships (object properties).""" + try: + # Class property definitions + class_property_pattern = r'class\s+(\w+)\s*\{[^}]*?(\w+)\s*=' + + for match in re.finditer(class_property_pattern, context.content, re.MULTILINE | re.DOTALL): + class_name = match.group(1) + property_name = match.group(2) + + class_symbol_id = self._create_class_symbol_id(class_name, context) + property_symbol_id = self._create_property_symbol_id(property_name, class_symbol_id) + + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=property_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + # Object literal properties + object_literal_pattern = r'const\s+(\w+)\s*=\s*\{[^}]*?(\w+)\s*:' + + for match in re.finditer(object_literal_pattern, context.content, re.MULTILINE | re.DOTALL): + object_name = match.group(1) + property_name = match.group(2) + + object_symbol_id = self._create_variable_symbol_id(object_name, context) + property_symbol_id = self._create_property_symbol_id(property_name, object_symbol_id) + + yield Relationship( + source_symbol=object_symbol_id, + target_symbol=property_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract interface relationships (TypeScript interfaces).""" + try: + # TypeScript interface implementation + interface_impl_pattern = r'class\s+(\w+)\s+implements\s+([^{]+)' + + for match in re.finditer(interface_impl_pattern, context.content, re.MULTILINE): + class_name = match.group(1) + interfaces = match.group(2).strip() + + class_symbol_id = self._create_class_symbol_id(class_name, context) + + # Parse multiple interfaces + for interface_name in re.findall(r'\w+', interfaces): + interface_symbol_id = self._create_interface_symbol_id(interface_name, context) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=interface_symbol_id, + relationship_type=InternalRelationshipType.IMPLEMENTS + ) + + # TypeScript interface extension + interface_extends_pattern = r'interface\s+(\w+)\s+extends\s+([^{]+)' + + for match in re.finditer(interface_extends_pattern, context.content, re.MULTILINE): + child_interface = match.group(1) + parent_interfaces = match.group(2).strip() + + child_symbol_id = self._create_interface_symbol_id(child_interface, context) + + for parent_interface in re.findall(r'\w+', parent_interfaces): + parent_symbol_id = self._create_interface_symbol_id(parent_interface, context) + yield Relationship( + source_symbol=child_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + except Exception: + # Skip files with parsing errors + return + + def _extract_function_definitions(self, content: str) -> List[str]: + """Extract function definition names from content.""" + function_patterns = [ + r'function\s+(\w+)\s*\(', + r'(?:const|let|var)\s+(\w+)\s*=\s*function', + r'(?:const|let|var)\s+(\w+)\s*=\s*\([^)]*\)\s*=>', + r'(\w+)\s*\([^)]*\)\s*\{', # Method definitions + ] + + functions = [] + for pattern in function_patterns: + for match in re.finditer(pattern, content, re.MULTILINE): + functions.append(match.group(1)) + + return list(set(functions)) # Remove duplicates + + def _extract_function_body(self, content: str, func_name: str) -> str: + """Extract the body of a specific function.""" + # Simple heuristic - find function and extract until matching brace + func_pattern = rf'(?:function\s+{func_name}\s*\(|{func_name}\s*\([^)]*\)\s*=>|\b{func_name}\s*\([^)]*\)\s*{{)' + + match = re.search(func_pattern, content, re.MULTILINE) + if match: + start_pos = match.end() + brace_count = 1 + i = start_pos + + while i < len(content) and brace_count > 0: + if content[i] == '{': + brace_count += 1 + elif content[i] == '}': + brace_count -= 1 + i += 1 + + if brace_count == 0: + return content[start_pos:i-1] + + return "" + + def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: + """Create symbol ID for class.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{class_name}" if scope_path else class_name + return f"local {local_id}#" + + def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: + """Create symbol ID for function.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{function_name}" if scope_path else function_name + return f"local {local_id}()." + + def _create_variable_symbol_id(self, variable_name: str, context: SCIPContext) -> str: + """Create symbol ID for variable.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{variable_name}" if scope_path else variable_name + return f"local {local_id}" + + def _create_property_symbol_id(self, property_name: str, parent_symbol_id: str) -> str: + """Create symbol ID for property.""" + # Extract parent name from parent symbol ID + parent_name = parent_symbol_id.replace("local ", "").rstrip("#().") + return f"local {parent_name}.{property_name}" + + def _create_interface_symbol_id(self, interface_name: str, context: SCIPContext) -> str: + """Create symbol ID for TypeScript interface.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{interface_name}" if scope_path else interface_name + return f"local {local_id}#" + + def _create_file_symbol_id(self, file_path: str) -> str: + """Create symbol ID for file.""" + return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py b/src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py new file mode 100644 index 0000000..573cd17 --- /dev/null +++ b/src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py @@ -0,0 +1,418 @@ +"""JavaScript syntax analyzer implementation.""" + +import re +from typing import Dict, List, Optional, Set, Tuple, Any + + +class JavaScriptSyntaxAnalyzer: + """JavaScript/TypeScript syntax analyzer using regex patterns.""" + + def __init__(self): + """Initialize the syntax analyzer.""" + self._symbol_patterns = self._build_symbol_patterns() + self._occurrence_patterns = self._build_occurrence_patterns() + self._import_patterns = self._build_import_patterns() + self._comment_patterns = self._build_comment_patterns() + + def get_symbol_patterns(self) -> Dict[str, str]: + """Get regex patterns for symbol definitions.""" + return self._symbol_patterns + + def get_occurrence_patterns(self) -> Dict[str, str]: + """Get regex patterns for symbol occurrences.""" + return self._occurrence_patterns + + def get_import_patterns(self) -> Dict[str, str]: + """Get regex patterns for import statements.""" + return self._import_patterns + + def _build_symbol_patterns(self) -> Dict[str, str]: + """Build regex patterns for JavaScript symbol definitions.""" + return { + # Function declarations + 'function': r'function\s+(\w+)\s*\(', + + # Arrow functions + 'arrow_function': r'(?:const|let|var)\s+(\w+)\s*=\s*(?:\([^)]*\)|\w+)\s*=>\s*', + + # Class declarations + 'class': r'class\s+(\w+)(?:\s+extends\s+\w+)?\s*\{', + + # Method definitions (inside classes or objects) + 'method': r'(?:async\s+)?(\w+)\s*\([^)]*\)\s*\{', + + # Object method assignment + 'object_method': r'(\w+)\s*:\s*(?:async\s+)?function\s*\([^)]*\)\s*\{', + + # Variable declarations + 'const': r'const\s+(\w+)(?:\s*:\s*[^=]+)?\s*=', + 'let': r'let\s+(\w+)(?:\s*:\s*[^=]+)?(?:\s*=|;)', + 'var': r'var\s+(\w+)(?:\s*:\s*[^=]+)?(?:\s*=|;)', + + # TypeScript interfaces + 'interface': r'interface\s+(\w+)(?:\s+extends\s+[^{]+)?\s*\{', + + # TypeScript type aliases + 'type': r'type\s+(\w+)(?:<[^>]*>)?\s*=', + + # TypeScript enums + 'enum': r'enum\s+(\w+)\s*\{', + + # TypeScript namespaces + 'namespace': r'namespace\s+(\w+)\s*\{', + + # Constructor functions (legacy pattern) + 'constructor': r'function\s+(\w+)\s*\([^)]*\)\s*\{[^}]*this\.', + + # Module exports + 'export_function': r'export\s+(?:default\s+)?function\s+(\w+)\s*\(', + 'export_class': r'export\s+(?:default\s+)?class\s+(\w+)', + 'export_const': r'export\s+const\s+(\w+)\s*=', + + # Destructuring assignments + 'destructure': r'(?:const|let|var)\s*\{\s*(\w+)(?:\s*,\s*\w+)*\s*\}\s*=', + } + + def _build_occurrence_patterns(self) -> Dict[str, str]: + """Build regex patterns for symbol occurrences/references.""" + return { + # Function calls + 'function_call': r'(\w+)\s*\(', + + # Method calls + 'method_call': r'(\w+)\.(\w+)\s*\(', + + # Property access + 'property_access': r'(\w+)\.(\w+)(?!\s*\()', + + # Variable references + 'identifier': r'\b(\w+)\b', + + # this references + 'this_reference': r'this\.(\w+)', + + # super references + 'super_reference': r'super\.(\w+)', + + # Template literal expressions + 'template_expression': r'\$\{([^}]+)\}', + + # Assignment targets + 'assignment': r'(\w+)\s*[+\-*/%&|^]?=', + + # Function parameters + 'parameter': r'function\s+\w+\s*\(([^)]*)\)', + + # Object literal properties + 'object_property': r'(\w+)\s*:', + } + + def _build_import_patterns(self) -> Dict[str, str]: + """Build regex patterns for import statements.""" + return { + # ES6 imports + 'es6_import': r'import\s+(?:\{([^}]+)\}|(\w+)|\*\s+as\s+(\w+))\s+from\s+[\'"]([^\'"]+)[\'"]', + + # Default imports + 'default_import': r'import\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]', + + # Named imports + 'named_import': r'import\s+\{([^}]+)\}\s+from\s+[\'"]([^\'"]+)[\'"]', + + # Namespace imports + 'namespace_import': r'import\s+\*\s+as\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]', + + # Side effect imports + 'side_effect_import': r'import\s+[\'"]([^\'"]+)[\'"]', + + # CommonJS require + 'require': r'(?:const|let|var)\s+(?:\{([^}]+)\}|(\w+))\s*=\s*require\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', + + # Dynamic imports + 'dynamic_import': r'import\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', + + # Re-exports + 'export_from': r'export\s+(?:\{([^}]+)\}|\*(?:\s+as\s+(\w+))?)\s+from\s+[\'"]([^\'"]+)[\'"]', + } + + def _build_comment_patterns(self) -> Dict[str, str]: + """Build regex patterns for comments.""" + return { + 'single_line': r'//.*$', + 'multi_line': r'/\*[\s\S]*?\*/', + 'jsdoc': r'/\*\*[\s\S]*?\*/', + } + + def extract_functions(self, content: str) -> List[Dict[str, Any]]: + """Extract function information from JavaScript content.""" + functions = [] + + # Function declarations + for match in re.finditer(self._symbol_patterns['function'], content, re.MULTILINE): + functions.append({ + 'name': match.group(1), + 'type': 'function', + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + # Arrow functions + for match in re.finditer(self._symbol_patterns['arrow_function'], content, re.MULTILINE): + functions.append({ + 'name': match.group(1), + 'type': 'arrow_function', + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + # Methods + for match in re.finditer(self._symbol_patterns['method'], content, re.MULTILINE): + functions.append({ + 'name': match.group(1), + 'type': 'method', + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + return functions + + def extract_classes(self, content: str) -> List[Dict[str, Any]]: + """Extract class information from JavaScript content.""" + classes = [] + + for match in re.finditer(self._symbol_patterns['class'], content, re.MULTILINE): + class_info = { + 'name': match.group(1), + 'type': 'class', + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n'), + 'methods': [], + 'properties': [] + } + + # Extract class body + class_body = self._extract_class_body(content, match.end()) + if class_body: + class_info['methods'] = self._extract_class_methods(class_body) + class_info['properties'] = self._extract_class_properties(class_body) + + classes.append(class_info) + + return classes + + def extract_variables(self, content: str) -> List[Dict[str, Any]]: + """Extract variable declarations from JavaScript content.""" + variables = [] + + for var_type in ['const', 'let', 'var']: + pattern = self._symbol_patterns[var_type] + for match in re.finditer(pattern, content, re.MULTILINE): + variables.append({ + 'name': match.group(1), + 'type': var_type, + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + return variables + + def extract_imports(self, content: str) -> List[Dict[str, Any]]: + """Extract import statements from JavaScript content.""" + imports = [] + + for import_type, pattern in self._import_patterns.items(): + for match in re.finditer(pattern, content, re.MULTILINE): + import_info = { + 'type': import_type, + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n'), + 'raw': match.group(0) + } + + # Extract specific information based on import type + if import_type == 'es6_import': + import_info['module'] = match.group(4) if match.lastindex >= 4 else match.group(3) + import_info['imports'] = match.group(1) if match.group(1) else match.group(2) + elif import_type in ['default_import', 'namespace_import']: + import_info['name'] = match.group(1) + import_info['module'] = match.group(2) + elif import_type == 'require': + import_info['module'] = match.group(3) if match.lastindex >= 3 else match.group(2) + import_info['name'] = match.group(2) if match.lastindex >= 2 else match.group(1) + elif import_type == 'dynamic_import': + import_info['module'] = match.group(1) + + imports.append(import_info) + + return imports + + def extract_exports(self, content: str) -> List[Dict[str, Any]]: + """Extract export statements from JavaScript content.""" + exports = [] + + export_patterns = { + 'export_default': r'export\s+default\s+(?:function\s+(\w+)|class\s+(\w+)|(\w+))', + 'export_named': r'export\s+\{([^}]+)\}', + 'export_function': r'export\s+function\s+(\w+)', + 'export_class': r'export\s+class\s+(\w+)', + 'export_const': r'export\s+const\s+(\w+)', + } + + for export_type, pattern in export_patterns.items(): + for match in re.finditer(pattern, content, re.MULTILINE): + exports.append({ + 'type': export_type, + 'name': match.group(1) if match.group(1) else match.group(0), + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + return exports + + def remove_comments(self, content: str) -> str: + """Remove comments from JavaScript content.""" + # Remove single-line comments + content = re.sub(self._comment_patterns['single_line'], '', content, flags=re.MULTILINE) + + # Remove multi-line comments + content = re.sub(self._comment_patterns['multi_line'], '', content, flags=re.DOTALL) + + return content + + def extract_string_literals(self, content: str) -> List[Dict[str, Any]]: + """Extract string literals from JavaScript content.""" + string_patterns = { + 'single_quote': r"'([^'\\]|\\.)*'", + 'double_quote': r'"([^"\\\\]|\\\\.)*"', + 'template_literal': r'`([^`\\\\]|\\\\.)*`', + } + + strings = [] + for string_type, pattern in string_patterns.items(): + for match in re.finditer(pattern, content, re.MULTILINE): + strings.append({ + 'type': string_type, + 'value': match.group(0), + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + return strings + + def _extract_class_body(self, content: str, start_pos: int) -> str: + """Extract the body of a class from start position.""" + brace_count = 0 + i = start_pos + + # Find the opening brace + while i < len(content) and content[i] != '{': + i += 1 + + if i >= len(content): + return "" + + start_body = i + 1 + brace_count = 1 + i += 1 + + # Find the matching closing brace + while i < len(content) and brace_count > 0: + if content[i] == '{': + brace_count += 1 + elif content[i] == '}': + brace_count -= 1 + i += 1 + + if brace_count == 0: + return content[start_body:i-1] + + return "" + + def _extract_class_methods(self, class_body: str) -> List[str]: + """Extract method names from class body.""" + methods = [] + + method_pattern = r'(?:async\s+)?(\w+)\s*\([^)]*\)\s*\{' + for match in re.finditer(method_pattern, class_body, re.MULTILINE): + methods.append(match.group(1)) + + return methods + + def _extract_class_properties(self, class_body: str) -> List[str]: + """Extract property names from class body.""" + properties = [] + + property_patterns = [ + r'(\w+)\s*=', # Property assignment + r'(\w+)\s*;', # Property declaration (TypeScript) + ] + + for pattern in property_patterns: + for match in re.finditer(pattern, class_body, re.MULTILINE): + prop_name = match.group(1) + if prop_name not in ['constructor'] and not prop_name.startswith('_'): + properties.append(prop_name) + + return properties + + def is_typescript_file(self, file_path: str) -> bool: + """Check if file is TypeScript based on extension.""" + return file_path.endswith(('.ts', '.tsx')) + + def extract_typescript_features(self, content: str) -> Dict[str, List[Dict[str, Any]]]: + """Extract TypeScript-specific features.""" + if not self.is_typescript_file: + return {} + + features = { + 'interfaces': [], + 'types': [], + 'enums': [], + 'namespaces': [] + } + + # Extract interfaces + for match in re.finditer(self._symbol_patterns['interface'], content, re.MULTILINE): + features['interfaces'].append({ + 'name': match.group(1), + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + # Extract type aliases + for match in re.finditer(self._symbol_patterns['type'], content, re.MULTILINE): + features['types'].append({ + 'name': match.group(1), + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + # Extract enums + for match in re.finditer(self._symbol_patterns['enum'], content, re.MULTILINE): + features['enums'].append({ + 'name': match.group(1), + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + # Extract namespaces + for match in re.finditer(self._symbol_patterns['namespace'], content, re.MULTILINE): + features['namespaces'].append({ + 'name': match.group(1), + 'start': match.start(), + 'end': match.end(), + 'line': content[:match.start()].count('\n') + }) + + return features \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/__init__.py b/src/code_index_mcp/scip/framework/objective_c/__init__.py new file mode 100644 index 0000000..ae824b0 --- /dev/null +++ b/src/code_index_mcp/scip/framework/objective_c/__init__.py @@ -0,0 +1,14 @@ +"""Objective-C SCIP framework module.""" + +from .factory import ObjectiveCSCIPIndexFactory, create_objective_c_scip_factory +from .enum_mapper import ObjectiveCEnumMapper +from .relationship_extractor import ObjectiveCRelationshipExtractor +from .clang_analyzer import ObjectiveCClangAnalyzer + +__all__ = [ + 'ObjectiveCSCIPIndexFactory', + 'create_objective_c_scip_factory', + 'ObjectiveCEnumMapper', + 'ObjectiveCRelationshipExtractor', + 'ObjectiveCClangAnalyzer' +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py b/src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py new file mode 100644 index 0000000..8ddc861 --- /dev/null +++ b/src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py @@ -0,0 +1,338 @@ +"""Objective-C libclang analyzer implementation.""" + +from typing import Iterator, Optional, Set, List, Dict, Any +from ..types import SCIPContext +from ..base.language_analyzer import BaseLanguageAnalyzer + +try: + import clang.cindex as clang + from clang.cindex import CursorKind, TypeKind + LIBCLANG_AVAILABLE = True +except ImportError: + LIBCLANG_AVAILABLE = False + clang = None + CursorKind = None + TypeKind = None + + +class ObjectiveCClangAnalyzer(BaseLanguageAnalyzer): + """Objective-C analyzer using libclang for AST parsing.""" + + def __init__(self): + """Initialize the Objective-C libclang analyzer.""" + if not LIBCLANG_AVAILABLE: + raise ImportError("libclang library not available") + + self.index = clang.Index.create() + self._processed_cursors: Set[int] = set() + + def parse(self, content: str, filename: str = ""): + """Parse Objective-C source code into libclang AST.""" + try: + # Create a temporary file for parsing + args = ['-x', 'objective-c', '-I/usr/include', '-I/usr/local/include'] + return self.index.parse(filename, args=args, unsaved_files=[(filename, content)]) + except Exception as e: + raise SyntaxError(f"Objective-C syntax error in {filename}: {e}") + + def walk(self, translation_unit) -> Iterator: + """Walk libclang cursor nodes, avoiding duplicates.""" + for cursor in self._walk_cursor(translation_unit.cursor): + cursor_id = hash((cursor.spelling, cursor.location.line, cursor.location.column)) + if cursor_id not in self._processed_cursors: + self._processed_cursors.add(cursor_id) + yield cursor + + def _walk_cursor(self, cursor) -> Iterator: + """Recursively walk cursor nodes.""" + yield cursor + for child in cursor.get_children(): + yield from self._walk_cursor(child) + + def is_symbol_definition(self, cursor) -> bool: + """Check if libclang cursor represents a symbol definition.""" + return cursor.kind in { + CursorKind.OBJC_INTERFACE_DECL, + CursorKind.OBJC_IMPLEMENTATION_DECL, + CursorKind.OBJC_PROTOCOL_DECL, + CursorKind.OBJC_CATEGORY_DECL, + CursorKind.OBJC_CATEGORY_IMPL_DECL, + CursorKind.OBJC_INSTANCE_METHOD_DECL, + CursorKind.OBJC_CLASS_METHOD_DECL, + CursorKind.OBJC_PROPERTY_DECL, + CursorKind.OBJC_IVAR_DECL, + CursorKind.CLASS_DECL, + CursorKind.STRUCT_DECL, + CursorKind.UNION_DECL, + CursorKind.ENUM_DECL, + CursorKind.FUNCTION_DECL, + CursorKind.VAR_DECL, + CursorKind.FIELD_DECL, + CursorKind.TYPEDEF_DECL, + CursorKind.MACRO_DEFINITION, + CursorKind.ENUM_CONSTANT_DECL, + } + + def is_symbol_reference(self, cursor) -> bool: + """Check if libclang cursor represents a symbol reference.""" + return cursor.kind in { + CursorKind.DECL_REF_EXPR, + CursorKind.MEMBER_REF_EXPR, + CursorKind.OBJC_MESSAGE_EXPR, + CursorKind.OBJC_SELECTOR_REF, + CursorKind.OBJC_PROTOCOL_REF, + CursorKind.OBJC_CLASS_REF, + CursorKind.OBJC_SUPER_CLASS_REF, + CursorKind.TYPE_REF, + CursorKind.CALL_EXPR, + } + + def get_symbol_name(self, cursor) -> Optional[str]: + """Extract symbol name from libclang cursor.""" + return cursor.spelling if cursor.spelling else None + + def get_node_position(self, cursor) -> tuple: + """Get position information from libclang cursor.""" + start_line = cursor.location.line - 1 # Convert to 0-based + start_col = cursor.location.column - 1 + + # Estimate end position based on symbol name length + if cursor.spelling: + end_line = start_line + end_col = start_col + len(cursor.spelling) + else: + end_line = start_line + end_col = start_col + 1 + + return (start_line, start_col, end_line, end_col) + + def extract_interface_info(self, translation_unit) -> List[Dict[str, Any]]: + """Extract Objective-C interface information from the AST.""" + interfaces = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: + interface_info = { + 'name': cursor.spelling, + 'type': 'interface', + 'position': self.get_node_position(cursor), + 'superclass': self._extract_superclass(cursor), + 'protocols': self._extract_protocols(cursor), + 'methods': self._extract_interface_methods(cursor), + 'properties': self._extract_interface_properties(cursor), + } + interfaces.append(interface_info) + + return interfaces + + def extract_implementation_info(self, translation_unit) -> List[Dict[str, Any]]: + """Extract Objective-C implementation information from the AST.""" + implementations = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.OBJC_IMPLEMENTATION_DECL: + impl_info = { + 'name': cursor.spelling, + 'type': 'implementation', + 'position': self.get_node_position(cursor), + 'methods': self._extract_implementation_methods(cursor), + 'ivars': self._extract_implementation_ivars(cursor), + } + implementations.append(impl_info) + + return implementations + + def extract_protocol_info(self, translation_unit) -> List[Dict[str, Any]]: + """Extract Objective-C protocol information from the AST.""" + protocols = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.OBJC_PROTOCOL_DECL: + protocol_info = { + 'name': cursor.spelling, + 'type': 'protocol', + 'position': self.get_node_position(cursor), + 'parent_protocols': self._extract_parent_protocols(cursor), + 'methods': self._extract_protocol_methods(cursor), + 'properties': self._extract_protocol_properties(cursor), + } + protocols.append(protocol_info) + + return protocols + + def extract_method_info(self, translation_unit) -> List[Dict[str, Any]]: + """Extract method information from the AST.""" + methods = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + method_info = { + 'name': cursor.spelling, + 'type': 'instance_method' if cursor.objc_method_kind == 1 else 'class_method', + 'position': self.get_node_position(cursor), + 'return_type': self._extract_return_type(cursor), + 'parameters': self._extract_method_parameters(cursor), + 'is_definition': cursor.is_definition(), + } + methods.append(method_info) + + return methods + + def extract_property_info(self, translation_unit) -> List[Dict[str, Any]]: + """Extract property information from the AST.""" + properties = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.OBJC_PROPERTY_DECL: + property_info = { + 'name': cursor.spelling, + 'type': 'property', + 'position': self.get_node_position(cursor), + 'property_type': self._extract_property_type(cursor), + 'attributes': self._extract_property_attributes(cursor), + } + properties.append(property_info) + + return properties + + def extract_include_statements(self, translation_unit) -> List[str]: + """Extract include statements from the AST.""" + includes = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.INCLUSION_DIRECTIVE: + included_file = cursor.get_included_file() + if included_file: + includes.append(included_file.name) + + return includes + + def extract_category_info(self, translation_unit) -> List[Dict[str, Any]]: + """Extract Objective-C category information from the AST.""" + categories = [] + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind in [CursorKind.OBJC_CATEGORY_DECL, CursorKind.OBJC_CATEGORY_IMPL_DECL]: + category_info = { + 'name': cursor.spelling, + 'type': 'category_interface' if cursor.kind == CursorKind.OBJC_CATEGORY_DECL else 'category_implementation', + 'position': self.get_node_position(cursor), + 'extended_class': self._extract_extended_class(cursor), + 'methods': self._extract_category_methods(cursor), + } + categories.append(category_info) + + return categories + + def _extract_superclass(self, interface_cursor) -> Optional[str]: + """Extract superclass name from interface declaration.""" + for child in interface_cursor.get_children(): + if child.kind == CursorKind.OBJC_SUPER_CLASS_REF: + return child.spelling + return None + + def _extract_protocols(self, interface_cursor) -> List[str]: + """Extract protocol names from interface declaration.""" + protocols = [] + for child in interface_cursor.get_children(): + if child.kind == CursorKind.OBJC_PROTOCOL_REF: + protocols.append(child.spelling) + return protocols + + def _extract_parent_protocols(self, protocol_cursor) -> List[str]: + """Extract parent protocol names from protocol declaration.""" + protocols = [] + for child in protocol_cursor.get_children(): + if child.kind == CursorKind.OBJC_PROTOCOL_REF: + protocols.append(child.spelling) + return protocols + + def _extract_interface_methods(self, interface_cursor) -> List[str]: + """Extract method names from interface declaration.""" + methods = [] + for child in interface_cursor.get_children(): + if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + methods.append(child.spelling) + return methods + + def _extract_implementation_methods(self, impl_cursor) -> List[str]: + """Extract method names from implementation.""" + methods = [] + for child in impl_cursor.get_children(): + if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + methods.append(child.spelling) + return methods + + def _extract_protocol_methods(self, protocol_cursor) -> List[str]: + """Extract method names from protocol declaration.""" + methods = [] + for child in protocol_cursor.get_children(): + if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + methods.append(child.spelling) + return methods + + def _extract_category_methods(self, category_cursor) -> List[str]: + """Extract method names from category.""" + methods = [] + for child in category_cursor.get_children(): + if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + methods.append(child.spelling) + return methods + + def _extract_interface_properties(self, interface_cursor) -> List[str]: + """Extract property names from interface declaration.""" + properties = [] + for child in interface_cursor.get_children(): + if child.kind == CursorKind.OBJC_PROPERTY_DECL: + properties.append(child.spelling) + return properties + + def _extract_protocol_properties(self, protocol_cursor) -> List[str]: + """Extract property names from protocol declaration.""" + properties = [] + for child in protocol_cursor.get_children(): + if child.kind == CursorKind.OBJC_PROPERTY_DECL: + properties.append(child.spelling) + return properties + + def _extract_implementation_ivars(self, impl_cursor) -> List[str]: + """Extract instance variable names from implementation.""" + ivars = [] + for child in impl_cursor.get_children(): + if child.kind == CursorKind.OBJC_IVAR_DECL: + ivars.append(child.spelling) + return ivars + + def _extract_extended_class(self, category_cursor) -> Optional[str]: + """Extract the class name that a category extends.""" + # The extended class is typically the first child that's a class reference + for child in category_cursor.get_children(): + if child.kind == CursorKind.OBJC_CLASS_REF: + return child.spelling + return None + + def _extract_return_type(self, method_cursor) -> Optional[str]: + """Extract return type from method declaration.""" + return method_cursor.result_type.spelling if method_cursor.result_type else None + + def _extract_method_parameters(self, method_cursor) -> List[Dict[str, str]]: + """Extract parameter information from method declaration.""" + parameters = [] + for child in method_cursor.get_children(): + if child.kind == CursorKind.PARM_DECL: + param_info = { + 'name': child.spelling, + 'type': child.type.spelling if child.type else 'unknown' + } + parameters.append(param_info) + return parameters + + def _extract_property_type(self, property_cursor) -> Optional[str]: + """Extract property type from property declaration.""" + return property_cursor.type.spelling if property_cursor.type else None + + def _extract_property_attributes(self, property_cursor) -> List[str]: + """Extract property attributes (readonly, strong, etc.).""" + # This is a simplified implementation - libclang doesn't easily expose + # property attributes, so we'd need to parse the source text for full accuracy + return [] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/enum_mapper.py b/src/code_index_mcp/scip/framework/objective_c/enum_mapper.py new file mode 100644 index 0000000..5d2f62b --- /dev/null +++ b/src/code_index_mcp/scip/framework/objective_c/enum_mapper.py @@ -0,0 +1,228 @@ +"""Objective-C enum mapper implementation.""" + +from typing import Dict +from ..base.enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 + + +class ObjectiveCEnumMapper(BaseEnumMapper): + """Objective-C-specific enum mapper for SCIP compliance.""" + + # Objective-C symbol kind mappings + SYMBOL_KIND_MAP = { + 'method': scip_pb2.Method, + 'class': scip_pb2.Class, + 'interface': scip_pb2.Interface, + 'protocol': scip_pb2.Interface, # Protocols are similar to interfaces + 'category': scip_pb2.Class, # Categories extend classes + 'enum': scip_pb2.Enum, + 'field': scip_pb2.Field, + 'property': scip_pb2.Property, + 'variable': scip_pb2.Variable, + 'parameter': scip_pb2.Parameter, + 'function': scip_pb2.Function, + 'macro': scip_pb2.Macro, + 'constant': scip_pb2.Constant, + 'typedef': scip_pb2.Type, + 'struct': scip_pb2.Struct, + 'union': scip_pb2.Struct, + 'ivar': scip_pb2.Field, # Instance variables + } + + # Objective-C syntax kind mappings + SYNTAX_KIND_MAP = { + 'method_declaration': scip_pb2.IdentifierFunctionDefinition, + 'class_declaration': scip_pb2.IdentifierType, + 'interface_declaration': scip_pb2.IdentifierType, + 'protocol_declaration': scip_pb2.IdentifierType, + 'category_declaration': scip_pb2.IdentifierType, + 'enum_declaration': scip_pb2.IdentifierType, + 'field_declaration': scip_pb2.IdentifierAttribute, + 'property_declaration': scip_pb2.IdentifierAttribute, + 'variable_declaration': scip_pb2.IdentifierLocal, + 'parameter_declaration': scip_pb2.IdentifierParameter, + 'function_declaration': scip_pb2.IdentifierFunctionDefinition, + 'macro_declaration': scip_pb2.IdentifierKeyword, + 'typedef_declaration': scip_pb2.IdentifierType, + 'struct_declaration': scip_pb2.IdentifierType, + 'union_declaration': scip_pb2.IdentifierType, + 'identifier': scip_pb2.Identifier, + 'keyword': scip_pb2.IdentifierKeyword, + 'string_literal': scip_pb2.StringLiteral, + 'numeric_literal': scip_pb2.NumericLiteral, + 'boolean_literal': scip_pb2.BooleanLiteral, + 'comment': scip_pb2.Comment, + 'punctuation': scip_pb2.PunctuationDelimiter, + } + + # Objective-C symbol role mappings (official SCIP naming) + SYMBOL_ROLE_MAP = { + 'definition': scip_pb2.Definition, + 'import': scip_pb2.Import, + 'write': scip_pb2.Write, # Official SCIP naming + 'read': scip_pb2.Read, # Official SCIP naming + 'generated': scip_pb2.Generated, + 'test': scip_pb2.Test, + 'type': scip_pb2.Type, # Add missing Type role + 'reference': scip_pb2.Read, # Default reference is read access + } + + def map_symbol_kind(self, language_kind: str) -> int: + """Map Objective-C symbol type to SCIP SymbolKind.""" + kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SymbolKind'): + raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") + + return kind + + def map_syntax_kind(self, language_syntax: str) -> int: + """Map Objective-C syntax element to SCIP SyntaxKind.""" + kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SyntaxKind'): + raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") + + return kind + + def map_symbol_role(self, language_role: str) -> int: + """Map Objective-C symbol role to SCIP SymbolRole.""" + role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) + + # Validate enum value + if not self.validate_enum_value(role, 'SymbolRole'): + raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") + + return role + + def get_objc_cursor_symbol_kind(self, cursor_kind: str) -> str: + """ + Map libclang cursor kind to internal symbol kind string. + + Args: + cursor_kind: libclang cursor kind (e.g., 'OBJC_INTERFACE_DECL', 'OBJC_INSTANCE_METHOD_DECL') + + Returns: + Internal symbol kind string for use with map_symbol_kind() + """ + cursor_kind_map = { + 'OBJC_INTERFACE_DECL': 'interface', + 'OBJC_IMPLEMENTATION_DECL': 'class', + 'OBJC_PROTOCOL_DECL': 'protocol', + 'OBJC_CATEGORY_DECL': 'category', + 'OBJC_CATEGORY_IMPL_DECL': 'category', + 'OBJC_INSTANCE_METHOD_DECL': 'method', + 'OBJC_CLASS_METHOD_DECL': 'method', + 'OBJC_PROPERTY_DECL': 'property', + 'OBJC_IVAR_DECL': 'ivar', + 'CLASS_DECL': 'class', + 'STRUCT_DECL': 'struct', + 'UNION_DECL': 'union', + 'ENUM_DECL': 'enum', + 'FUNCTION_DECL': 'function', + 'VAR_DECL': 'variable', + 'PARM_DECL': 'parameter', + 'FIELD_DECL': 'field', + 'TYPEDEF_DECL': 'typedef', + 'MACRO_DEFINITION': 'macro', + 'ENUM_CONSTANT_DECL': 'constant', + } + + return cursor_kind_map.get(cursor_kind, 'variable') + + def get_objc_cursor_syntax_kind(self, cursor_kind: str, context: str = None) -> str: + """ + Map libclang cursor kind to internal syntax kind string. + + Args: + cursor_kind: libclang cursor kind + context: Additional context for disambiguation + + Returns: + Internal syntax kind string for use with map_syntax_kind() + """ + cursor_syntax_map = { + 'OBJC_INTERFACE_DECL': 'interface_declaration', + 'OBJC_IMPLEMENTATION_DECL': 'class_declaration', + 'OBJC_PROTOCOL_DECL': 'protocol_declaration', + 'OBJC_CATEGORY_DECL': 'category_declaration', + 'OBJC_CATEGORY_IMPL_DECL': 'category_declaration', + 'OBJC_INSTANCE_METHOD_DECL': 'method_declaration', + 'OBJC_CLASS_METHOD_DECL': 'method_declaration', + 'OBJC_PROPERTY_DECL': 'property_declaration', + 'OBJC_IVAR_DECL': 'field_declaration', + 'CLASS_DECL': 'class_declaration', + 'STRUCT_DECL': 'struct_declaration', + 'UNION_DECL': 'union_declaration', + 'ENUM_DECL': 'enum_declaration', + 'FUNCTION_DECL': 'function_declaration', + 'VAR_DECL': 'variable_declaration', + 'PARM_DECL': 'parameter_declaration', + 'FIELD_DECL': 'field_declaration', + 'TYPEDEF_DECL': 'typedef_declaration', + 'MACRO_DEFINITION': 'macro_declaration', + } + + return cursor_syntax_map.get(cursor_kind, 'identifier') + + def get_objc_cursor_symbol_role(self, cursor_kind: str, context: str = None) -> str: + """ + Map libclang cursor kind to internal symbol role string. + + Args: + cursor_kind: libclang cursor kind + context: Additional context (e.g., 'in_assignment', 'in_call') + + Returns: + Internal symbol role string for use with map_symbol_role() + """ + if context == 'definition': + return 'definition' + elif context == 'assignment': + return 'write' + elif context == 'import': + return 'import' + elif cursor_kind in ['OBJC_INTERFACE_DECL', 'OBJC_IMPLEMENTATION_DECL', 'OBJC_PROTOCOL_DECL', + 'OBJC_CATEGORY_DECL', 'OBJC_INSTANCE_METHOD_DECL', 'OBJC_CLASS_METHOD_DECL', 'OBJC_PROPERTY_DECL', + 'CLASS_DECL', 'STRUCT_DECL', 'FUNCTION_DECL', 'VAR_DECL', 'TYPEDEF_DECL']: + return 'definition' + else: + return 'reference' + + def is_valid_objc_symbol_kind(self, symbol_kind: str) -> bool: + """Check if symbol kind is valid for Objective-C.""" + return symbol_kind in self.SYMBOL_KIND_MAP + + def is_valid_objc_syntax_kind(self, syntax_kind: str) -> bool: + """Check if syntax kind is valid for Objective-C.""" + return syntax_kind in self.SYNTAX_KIND_MAP + + def is_valid_objc_symbol_role(self, symbol_role: str) -> bool: + """Check if symbol role is valid for Objective-C.""" + return symbol_role in self.SYMBOL_ROLE_MAP + + def get_all_objc_symbol_kinds(self) -> list: + """Get all available Objective-C symbol kinds.""" + return list(self.SYMBOL_KIND_MAP.keys()) + + def get_all_objc_syntax_kinds(self) -> list: + """Get all available Objective-C syntax kinds.""" + return list(self.SYNTAX_KIND_MAP.keys()) + + def get_all_objc_symbol_roles(self) -> list: + """Get all available Objective-C symbol roles.""" + return list(self.SYMBOL_ROLE_MAP.keys()) + + def get_objective_c_specific_kinds(self) -> Dict[str, str]: + """Get Objective-C-specific symbol kinds.""" + return { + 'interface': 'interface', + 'protocol': 'protocol', + 'category': 'category', + 'property': 'property', + 'ivar': 'ivar', + 'class_method': 'method', + 'instance_method': 'method', + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/factory.py b/src/code_index_mcp/scip/framework/objective_c/factory.py new file mode 100644 index 0000000..cfdfd9a --- /dev/null +++ b/src/code_index_mcp/scip/framework/objective_c/factory.py @@ -0,0 +1,500 @@ +"""Objective-C SCIP Index Factory implementation.""" + +import os +from pathlib import Path +from typing import Set, List, Iterator, Optional +from ..base.index_factory import SCIPIndexFactory +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..base.enum_mapper import BaseEnumMapper +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from ..types import SCIPContext, SCIPSymbolDescriptor +from .relationship_extractor import ObjectiveCRelationshipExtractor +from .enum_mapper import ObjectiveCEnumMapper +from .clang_analyzer import ObjectiveCClangAnalyzer +from ...proto import scip_pb2 + +try: + import clang.cindex as clang + from clang.cindex import CursorKind + LIBCLANG_AVAILABLE = True +except ImportError: + LIBCLANG_AVAILABLE = False + clang = None + CursorKind = None + + +class ObjectiveCSCIPIndexFactory(SCIPIndexFactory): + """Objective-C-specific SCIP Index factory implementation with constructor injection.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """Initialize Objective-C factory with required components via constructor injection.""" + if not LIBCLANG_AVAILABLE: + raise ImportError("libclang library not available") + + super().__init__(project_root, symbol_generator, relationship_extractor, + enum_mapper, position_calculator) + self.clang_analyzer = ObjectiveCClangAnalyzer() + + def get_language(self) -> str: + """Return language identifier.""" + return "objective-c" + + def get_supported_extensions(self) -> Set[str]: + """Return supported file extensions.""" + return {'.m', '.mm', '.h'} + + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract Objective-C symbol definitions using libclang analysis.""" + try: + translation_unit = self.clang_analyzer.parse(context.content, context.file_path) + + for cursor in self.clang_analyzer.walk(translation_unit): + if self.clang_analyzer.is_symbol_definition(cursor): + symbol_info = self._create_symbol_from_clang_cursor(cursor, context) + if symbol_info: + yield symbol_info + + except SyntaxError as e: + # Handle syntax errors gracefully + pass + + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract Objective-C symbol occurrences.""" + try: + translation_unit = self.clang_analyzer.parse(context.content, context.file_path) + + for cursor in self.clang_analyzer.walk(translation_unit): + if (self.clang_analyzer.is_symbol_definition(cursor) or + self.clang_analyzer.is_symbol_reference(cursor)): + occurrence = self._create_occurrence_from_clang_cursor(cursor, context) + if occurrence: + yield occurrence + + except SyntaxError as e: + # Handle syntax errors gracefully + pass + + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract Objective-C external symbols from imports.""" + external_symbols = [] + + for doc in documents: + try: + content = self._read_file(os.path.join(self.project_root, doc.relative_path)) + translation_unit = self.clang_analyzer.parse(content, doc.relative_path) + + # Extract include statements + include_statements = self.clang_analyzer.extract_include_statements(translation_unit) + for include_path in include_statements: + external_symbol = self._create_external_symbol_from_include(include_path) + if external_symbol: + external_symbols.append(external_symbol) + + except Exception as e: + # Skip problematic files + continue + + return external_symbols + + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build Objective-C-specific cross-document relationships. + + This implementation provides basic cross-document relationship support + for Objective-C. A more sophisticated implementation would analyze + #import/#include statements and framework dependencies. + """ + # For now, use a simplified approach + # TODO: Implement proper Objective-C import analysis + return 0 # Placeholder - no relationships added yet + + def _create_symbol_from_clang_cursor(self, cursor, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: + """Create SCIP symbol information from libclang cursor.""" + symbol_info = scip_pb2.SymbolInformation() + + symbol_name = self.clang_analyzer.get_symbol_name(cursor) + if not symbol_name: + return None + + if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('interface') + + elif cursor.kind == CursorKind.OBJC_IMPLEMENTATION_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('class') + + elif cursor.kind == CursorKind.OBJC_PROTOCOL_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('protocol') + + elif cursor.kind in [CursorKind.OBJC_CATEGORY_DECL, CursorKind.OBJC_CATEGORY_IMPL_DECL]: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('category') + + elif cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('method') + + elif cursor.kind == CursorKind.OBJC_PROPERTY_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('property') + + elif cursor.kind == CursorKind.OBJC_IVAR_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('ivar') + + elif cursor.kind == CursorKind.FUNCTION_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('function') + + elif cursor.kind == CursorKind.VAR_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('variable') + + elif cursor.kind == CursorKind.ENUM_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('enum') + + elif cursor.kind == CursorKind.STRUCT_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('struct') + + elif cursor.kind == CursorKind.TYPEDEF_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('typedef') + + elif cursor.kind == CursorKind.MACRO_DEFINITION: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('macro') + + else: + return None + + return symbol_info + + def _create_occurrence_from_clang_cursor(self, cursor, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence from libclang cursor.""" + occurrence = scip_pb2.Occurrence() + + # Calculate position using position calculator + try: + position_info = self.position_calculator.calculate_positions_from_clang_cursor( + context.content, cursor + ) + + # Set range + occurrence.range.start.extend([position_info.start_line, position_info.start_column]) + occurrence.range.end.extend([position_info.end_line, position_info.end_column]) + + except Exception as e: + # Skip if position calculation fails + return None + + symbol_name = self.clang_analyzer.get_symbol_name(cursor) + if not symbol_name: + return None + + # Set symbol and roles based on cursor type + if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('interface_declaration') + + elif cursor.kind == CursorKind.OBJC_IMPLEMENTATION_DECL: + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_declaration') + + elif cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('method_declaration') + + elif cursor.kind in [CursorKind.DECL_REF_EXPR, CursorKind.MEMBER_REF_EXPR]: + # Handle variable references + descriptor = SCIPSymbolDescriptor( + + name=symbol_name, + + kind="variable", + + scope_path=context.scope_stack, + + descriptor_suffix="" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') + + else: + return None + + return occurrence + + def _create_external_symbol_from_include(self, include_path: str) -> Optional[scip_pb2.SymbolInformation]: + """Create external symbol from include statement.""" + symbol_info = scip_pb2.SymbolInformation() + + # Determine if it's a system header or local header + if include_path.startswith('/System/') or include_path.startswith('/usr/'): + # System framework or library + symbol_info.symbol = f"objc-system {include_path}" + symbol_info.display_name = include_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"System header: {include_path}") + elif 'Frameworks' in include_path: + # Framework + symbol_info.symbol = f"objc-framework {include_path}" + symbol_info.display_name = include_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"Framework header: {include_path}") + else: + # Local or external header + symbol_info.symbol = f"objc-external {include_path}" + symbol_info.display_name = include_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"External header: {include_path}") + + return symbol_info + + +def create_objective_c_scip_factory(project_root: str) -> ObjectiveCSCIPIndexFactory: + """ + Factory creator for Objective-C SCIP factory. + Ensures all required components are properly assembled via constructor injection. + """ + if not LIBCLANG_AVAILABLE: + raise ImportError("libclang library not available") + + symbol_generator = SCIPSymbolGenerator( + scheme="scip-objc", + package_manager="xcode", + package_name=Path(project_root).name, + version="HEAD" + ) + + relationship_extractor = ObjectiveCRelationshipExtractor() + enum_mapper = ObjectiveCEnumMapper() + position_calculator = SCIPPositionCalculator() + + return ObjectiveCSCIPIndexFactory( + project_root=project_root, + symbol_generator=symbol_generator, + relationship_extractor=relationship_extractor, # Guaranteed to be provided + enum_mapper=enum_mapper, + position_calculator=position_calculator + ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py b/src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py new file mode 100644 index 0000000..bf884af --- /dev/null +++ b/src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py @@ -0,0 +1,276 @@ +"""Objective-C relationship extractor implementation.""" + +from typing import Iterator, Optional, List +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..types import SCIPContext, Relationship +from ...core.relationship_types import InternalRelationshipType + +try: + import clang.cindex as clang + from clang.cindex import CursorKind + LIBCLANG_AVAILABLE = True +except ImportError: + LIBCLANG_AVAILABLE = False + clang = None + CursorKind = None + + +class ObjectiveCRelationshipExtractor(BaseRelationshipExtractor): + """Objective-C-specific relationship extractor using libclang analysis.""" + + def __init__(self): + """Initialize the Objective-C relationship extractor.""" + if not LIBCLANG_AVAILABLE: + raise ImportError("libclang library not available") + + self.index = clang.Index.create() + + def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract inheritance relationships from Objective-C classes and protocols.""" + try: + translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: + interface_name = cursor.spelling + if not interface_name: + continue + + interface_symbol_id = self._create_interface_symbol_id(interface_name, context) + + # Look for superclass + for child in cursor.get_children(): + if child.kind == CursorKind.OBJC_SUPER_CLASS_REF: + parent_name = child.spelling + parent_symbol_id = self._create_interface_symbol_id(parent_name, context) + yield Relationship( + source_symbol=interface_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + elif cursor.kind == CursorKind.OBJC_PROTOCOL_DECL: + protocol_name = cursor.spelling + if not protocol_name: + continue + + protocol_symbol_id = self._create_protocol_symbol_id(protocol_name, context) + + # Look for protocol inheritance + for child in cursor.get_children(): + if child.kind == CursorKind.OBJC_PROTOCOL_REF: + parent_protocol_name = child.spelling + parent_symbol_id = self._create_protocol_symbol_id(parent_protocol_name, context) + yield Relationship( + source_symbol=protocol_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract method call relationships.""" + try: + translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): + method_name = cursor.spelling + if not method_name: + continue + + method_symbol_id = self._create_method_symbol_id(method_name, context) + + # Find method calls within this method + for child in self._walk_cursor(cursor): + if child.kind == CursorKind.OBJC_MESSAGE_EXPR: + target_method = self._get_message_target(child) + if target_method and target_method != method_name: + target_symbol_id = self._create_method_symbol_id(target_method, context) + yield Relationship( + source_symbol=method_symbol_id, + target_symbol=target_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + elif child.kind == CursorKind.CALL_EXPR: + # C function calls + target_function = child.spelling + if target_function and target_function != method_name: + target_symbol_id = self._create_function_symbol_id(target_function, context) + yield Relationship( + source_symbol=method_symbol_id, + target_symbol=target_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract import/dependency relationships.""" + try: + translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) + + file_symbol_id = self._create_file_symbol_id(context.file_path) + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.INCLUSION_DIRECTIVE: + include_path = self._get_include_path(cursor) + if include_path: + # Determine if it's a system header or local header + if include_path.startswith('<') and include_path.endswith('>'): + # System header + module_symbol_id = f"objc-system {include_path[1:-1]}" + elif include_path.startswith('"') and include_path.endswith('"'): + # Local header + module_symbol_id = f"local {include_path[1:-1]}" + else: + module_symbol_id = f"objc-external {include_path}" + + yield Relationship( + source_symbol=file_symbol_id, + target_symbol=module_symbol_id, + relationship_type=InternalRelationshipType.IMPORTS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract composition relationships (properties, ivars).""" + try: + translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind in [CursorKind.OBJC_INTERFACE_DECL, CursorKind.OBJC_IMPLEMENTATION_DECL]: + class_name = cursor.spelling + if not class_name: + continue + + class_symbol_id = self._create_class_symbol_id(class_name, context) + + # Find properties and ivars in this class + for child in cursor.get_children(): + if child.kind == CursorKind.OBJC_PROPERTY_DECL: + property_name = child.spelling + if property_name: + property_symbol_id = self._create_property_symbol_id(property_name, class_symbol_id) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=property_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + elif child.kind == CursorKind.OBJC_IVAR_DECL: + ivar_name = child.spelling + if ivar_name: + ivar_symbol_id = self._create_ivar_symbol_id(ivar_name, class_symbol_id) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=ivar_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract protocol implementation relationships.""" + try: + translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) + + for cursor in self._walk_cursor(translation_unit.cursor): + if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: + interface_name = cursor.spelling + if not interface_name: + continue + + interface_symbol_id = self._create_interface_symbol_id(interface_name, context) + + # Look for protocol conformance + for child in cursor.get_children(): + if child.kind == CursorKind.OBJC_PROTOCOL_REF: + protocol_name = child.spelling + protocol_symbol_id = self._create_protocol_symbol_id(protocol_name, context) + yield Relationship( + source_symbol=interface_symbol_id, + target_symbol=protocol_symbol_id, + relationship_type=InternalRelationshipType.IMPLEMENTS + ) + + except Exception: + # Skip files with parsing errors + return + + def _walk_cursor(self, cursor) -> Iterator: + """Walk libclang cursor tree.""" + yield cursor + for child in cursor.get_children(): + yield from self._walk_cursor(child) + + def _get_message_target(self, message_expr_cursor) -> Optional[str]: + """Extract target method name from Objective-C message expression.""" + # Get the selector name from the message expression + for child in message_expr_cursor.get_children(): + if child.kind == CursorKind.OBJC_SELECTOR_REF: + return child.spelling + return None + + def _get_include_path(self, inclusion_cursor) -> Optional[str]: + """Extract include path from inclusion directive.""" + # Get the included file path + included_file = inclusion_cursor.get_included_file() + if included_file: + return included_file.name + return None + + def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: + """Create symbol ID for class.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{class_name}" if scope_path else class_name + return f"local {local_id}#" + + def _create_interface_symbol_id(self, interface_name: str, context: SCIPContext) -> str: + """Create symbol ID for interface.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{interface_name}" if scope_path else interface_name + return f"local {local_id}#" + + def _create_protocol_symbol_id(self, protocol_name: str, context: SCIPContext) -> str: + """Create symbol ID for protocol.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{protocol_name}" if scope_path else protocol_name + return f"local {local_id}#" + + def _create_method_symbol_id(self, method_name: str, context: SCIPContext) -> str: + """Create symbol ID for method.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{method_name}" if scope_path else method_name + return f"local {local_id}()." + + def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: + """Create symbol ID for C function.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{function_name}" if scope_path else function_name + return f"local {local_id}()." + + def _create_property_symbol_id(self, property_name: str, class_symbol_id: str) -> str: + """Create symbol ID for property.""" + # Extract class name from class symbol ID + class_name = class_symbol_id.replace("local ", "").replace("#", "") + return f"local {class_name}.{property_name}" + + def _create_ivar_symbol_id(self, ivar_name: str, class_symbol_id: str) -> str: + """Create symbol ID for instance variable.""" + # Extract class name from class symbol ID + class_name = class_symbol_id.replace("local ", "").replace("#", "") + return f"local {class_name}.{ivar_name}" + + def _create_file_symbol_id(self, file_path: str) -> str: + """Create symbol ID for file.""" + return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/position_calculator.py b/src/code_index_mcp/scip/framework/position_calculator.py new file mode 100644 index 0000000..6d4364d --- /dev/null +++ b/src/code_index_mcp/scip/framework/position_calculator.py @@ -0,0 +1,225 @@ +"""SCIP Position Calculator - UTF-8/UTF-16 compliant position calculation.""" + +import logging +from typing import Tuple, Optional, Any +from .types import SCIPPositionInfo + + +logger = logging.getLogger(__name__) + + +class SCIPPositionCalculator: + """SCIP position calculator - UTF-8/UTF-16 compliant with mandatory validation.""" + + def __init__(self, encoding: str = "utf-8"): + """Initialize position calculator with specified encoding.""" + self.encoding = encoding + self._line_cache = {} # Cache for line information + + def calculate_positions(self, content: str, node_info: Any) -> SCIPPositionInfo: + """Calculate precise positions with mandatory validation.""" + + # Language-specific node position extraction logic + start_line, start_col, end_line, end_col = self._extract_node_positions(content, node_info) + + # Create position information + position = SCIPPositionInfo(start_line, start_col, end_line, end_col) + + # Mandatory validation + if not position.validate(): + raise ValueError(f"Invalid position: {position}") + + # Validate within document bounds + if not self._is_within_bounds(position, content): + raise ValueError(f"Position out of document bounds: {position}") + + return position + + def calculate_positions_from_range(self, content: str, start_byte: int, end_byte: int) -> SCIPPositionInfo: + """Calculate positions from byte ranges (useful for tree-sitter nodes).""" + lines = content.split('\n') + + # Convert byte offsets to line/column positions + start_line, start_col = self._byte_offset_to_line_col(content, start_byte, lines) + end_line, end_col = self._byte_offset_to_line_col(content, end_byte, lines) + + position = SCIPPositionInfo(start_line, start_col, end_line, end_col) + + # Mandatory validation + if not position.validate(): + raise ValueError(f"Invalid position calculated from bytes [{start_byte}:{end_byte}]: {position}") + + if not self._is_within_bounds(position, content): + raise ValueError(f"Position out of document bounds: {position}") + + return position + + def calculate_positions_from_line_col(self, content: str, start_line: int, start_col: int, + end_line: int, end_col: int) -> SCIPPositionInfo: + """Calculate positions from explicit line/column coordinates.""" + position = SCIPPositionInfo(start_line, start_col, end_line, end_col) + + # Mandatory validation + if not position.validate(): + raise ValueError(f"Invalid position: {position}") + + # Validate within document bounds + if not self._is_within_bounds(position, content): + raise ValueError(f"Position out of document bounds: {position}") + + return position + + def _extract_node_positions(self, content: str, node_info: Any) -> Tuple[int, int, int, int]: + """Extract node positions - subclass implementation required.""" + # Default implementation for objects with line/column attributes + if hasattr(node_info, 'lineno') and hasattr(node_info, 'col_offset'): + # AST node (Python) + start_line = node_info.lineno - 1 # Convert to 0-indexed + start_col = node_info.col_offset + + # Estimate end position if not available + if hasattr(node_info, 'end_lineno') and hasattr(node_info, 'end_col_offset'): + end_line = node_info.end_lineno - 1 + end_col = node_info.end_col_offset + else: + # Fallback: assume single token + end_line = start_line + end_col = start_col + len(getattr(node_info, 'name', 'unknown')) + + return start_line, start_col, end_line, end_col + + elif hasattr(node_info, 'start_point') and hasattr(node_info, 'end_point'): + # Tree-sitter node + start_line = node_info.start_point[0] + start_col = node_info.start_point[1] + end_line = node_info.end_point[0] + end_col = node_info.end_point[1] + + return start_line, start_col, end_line, end_col + + elif isinstance(node_info, dict): + # Dictionary format + return ( + node_info.get('start_line', 0), + node_info.get('start_col', 0), + node_info.get('end_line', 0), + node_info.get('end_col', 0) + ) + + else: + raise NotImplementedError(f"Position extraction not implemented for node type: {type(node_info)}") + + def _byte_offset_to_line_col(self, content: str, byte_offset: int, lines: list) -> Tuple[int, int]: + """Convert byte offset to line/column position with UTF-8 awareness.""" + if byte_offset == 0: + return 0, 0 + + # Convert content to bytes for accurate offset calculation + content_bytes = content.encode(self.encoding) + + if byte_offset >= len(content_bytes): + # End of file + return len(lines) - 1, len(lines[-1]) if lines else 0 + + # Find the line containing this byte offset + current_byte = 0 + for line_num, line in enumerate(lines): + line_bytes = (line + '\n').encode(self.encoding) if line_num < len(lines) - 1 else line.encode(self.encoding) + + if current_byte + len(line_bytes) > byte_offset: + # Byte offset is within this line + offset_in_line = byte_offset - current_byte + # Convert byte offset within line to character position + line_text = line_bytes[:offset_in_line].decode(self.encoding, errors='ignore') + return line_num, len(line_text) + + current_byte += len(line_bytes) + + # Fallback + return len(lines) - 1, len(lines[-1]) if lines else 0 + + def _is_within_bounds(self, position: SCIPPositionInfo, content: str) -> bool: + """Validate position is within document bounds.""" + lines = content.split('\n') + max_line = len(lines) - 1 + + # Check line bounds + if position.start_line < 0 or position.end_line > max_line: + return False + + # Check column bounds for start position + if position.start_line <= max_line: + max_start_col = len(lines[position.start_line]) + if position.start_column < 0 or position.start_column > max_start_col: + return False + + # Check column bounds for end position + if position.end_line <= max_line: + max_end_col = len(lines[position.end_line]) + if position.end_column < 0 or position.end_column > max_end_col: + return False + + return True + + def _is_utf8_compliant(self, position: SCIPPositionInfo, content: str) -> bool: + """Validate UTF-8 character position accuracy.""" + try: + lines = content.split('\n') + + # Check if positions fall on character boundaries + if position.start_line < len(lines): + start_line_text = lines[position.start_line] + if position.start_column <= len(start_line_text): + # Check UTF-8 character boundary + char_at_pos = start_line_text[:position.start_column].encode('utf-8') + # If we can encode/decode without errors, position is valid + char_at_pos.decode('utf-8') + + if position.end_line < len(lines): + end_line_text = lines[position.end_line] + if position.end_column <= len(end_line_text): + char_at_pos = end_line_text[:position.end_column].encode('utf-8') + char_at_pos.decode('utf-8') + + return True + + except (UnicodeEncodeError, UnicodeDecodeError, IndexError): + logger.warning(f"UTF-8 compliance check failed for position: {position}") + return False + + def validate_position_full(self, position: SCIPPositionInfo, content: str) -> bool: + """Perform full position validation including UTF-8 compliance.""" + return ( + position.validate() and + self._is_within_bounds(position, content) and + self._is_utf8_compliant(position, content) + ) + + def get_position_text(self, content: str, position: SCIPPositionInfo) -> str: + """Extract text at the given position for verification.""" + try: + lines = content.split('\n') + + if position.start_line == position.end_line: + # Single line + line = lines[position.start_line] + return line[position.start_column:position.end_column] + else: + # Multi-line + result_lines = [] + + # First line + result_lines.append(lines[position.start_line][position.start_column:]) + + # Middle lines + for line_num in range(position.start_line + 1, position.end_line): + result_lines.append(lines[line_num]) + + # Last line + result_lines.append(lines[position.end_line][:position.end_column]) + + return '\n'.join(result_lines) + + except IndexError as e: + logger.error(f"Failed to extract text at position {position}: {e}") + return "" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/__init__.py b/src/code_index_mcp/scip/framework/python/__init__.py new file mode 100644 index 0000000..231b4bd --- /dev/null +++ b/src/code_index_mcp/scip/framework/python/__init__.py @@ -0,0 +1,14 @@ +"""Python-specific SCIP framework components.""" + +from .factory import PythonSCIPIndexFactory, create_python_scip_factory +from .relationship_extractor import PythonRelationshipExtractor +from .enum_mapper import PythonEnumMapper +from .ast_analyzer import PythonASTAnalyzer + +__all__ = [ + 'PythonSCIPIndexFactory', + 'create_python_scip_factory', + 'PythonRelationshipExtractor', + 'PythonEnumMapper', + 'PythonASTAnalyzer', +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/ast_analyzer.py b/src/code_index_mcp/scip/framework/python/ast_analyzer.py new file mode 100644 index 0000000..a21a427 --- /dev/null +++ b/src/code_index_mcp/scip/framework/python/ast_analyzer.py @@ -0,0 +1,312 @@ +"""Python AST analyzer implementation.""" + +import ast +from typing import Iterator, Optional, Set, List, Dict, Any +from ..types import SCIPContext +from ..base.language_analyzer import BaseLanguageAnalyzer + + +class PythonASTAnalyzer(BaseLanguageAnalyzer): + """Python AST analyzer for deep code analysis.""" + + def __init__(self): + """Initialize the AST analyzer.""" + self._processed_nodes: Set[int] = set() + self._scope_stack: List[str] = [] + self._imports: Dict[str, str] = {} # alias -> module mapping + + def parse(self, content: str, filename: str = "") -> ast.AST: + """Parse Python source code into AST.""" + try: + return ast.parse(content, filename=filename) + except SyntaxError as e: + raise SyntaxError(f"Python syntax error in {filename}: {e}") + + def walk(self, tree: ast.AST) -> Iterator[ast.AST]: + """Walk AST nodes, avoiding duplicates.""" + for node in ast.walk(tree): + node_id = id(node) + if node_id not in self._processed_nodes: + self._processed_nodes.add(node_id) + yield node + + def is_symbol_definition(self, node: ast.AST) -> bool: + """Check if AST node represents a symbol definition.""" + return isinstance(node, ( + ast.FunctionDef, + ast.AsyncFunctionDef, + ast.ClassDef, + ast.Assign, + ast.AnnAssign, + ast.AugAssign + )) + + def is_symbol_reference(self, node: ast.AST) -> bool: + """Check if AST node represents a symbol reference.""" + return isinstance(node, ( + ast.Name, + ast.Attribute, + ast.Call + )) + + def get_symbol_name(self, node: ast.AST) -> Optional[str]: + """Extract symbol name from AST node.""" + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + return node.name + elif isinstance(node, ast.Name): + return node.id + elif isinstance(node, ast.Assign): + # Handle simple assignments + if len(node.targets) == 1: + target = node.targets[0] + if isinstance(target, ast.Name): + return target.id + elif isinstance(node, ast.AnnAssign): + if isinstance(node.target, ast.Name): + return node.target.id + elif isinstance(node, ast.AugAssign): + if isinstance(node.target, ast.Name): + return node.target.id + + return None + + def get_node_position(self, node: ast.AST) -> tuple: + """Get position information from AST node.""" + if hasattr(node, 'lineno') and hasattr(node, 'col_offset'): + start_line = node.lineno - 1 # Convert to 0-based + start_col = node.col_offset + + # Try to get end position + if hasattr(node, 'end_lineno') and hasattr(node, 'end_col_offset'): + end_line = node.end_lineno - 1 + end_col = node.end_col_offset + else: + # Estimate end position + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + name_len = len(node.name) + end_line = start_line + end_col = start_col + name_len + else: + end_line = start_line + end_col = start_col + 1 + + return (start_line, start_col, end_line, end_col) + + return (0, 0, 0, 1) # Default fallback + + def extract_decorators(self, node: ast.AST) -> List[str]: + """Extract decorator names from function or class.""" + decorators = [] + if hasattr(node, 'decorator_list'): + for decorator in node.decorator_list: + if isinstance(decorator, ast.Name): + decorators.append(decorator.id) + elif isinstance(decorator, ast.Attribute): + decorators.append(self._get_attribute_name(decorator)) + elif isinstance(decorator, ast.Call): + if isinstance(decorator.func, ast.Name): + decorators.append(decorator.func.id) + elif isinstance(decorator.func, ast.Attribute): + decorators.append(self._get_attribute_name(decorator.func)) + + return decorators + + def extract_function_arguments(self, node: ast.FunctionDef) -> List[Dict[str, Any]]: + """Extract function argument information.""" + arguments = [] + + # Regular arguments + for arg in node.args.args: + arg_info = { + 'name': arg.arg, + 'type': 'regular', + 'annotation': self._get_annotation_string(arg.annotation) if arg.annotation else None + } + arguments.append(arg_info) + + # *args + if node.args.vararg: + arg_info = { + 'name': node.args.vararg.arg, + 'type': 'vararg', + 'annotation': self._get_annotation_string(node.args.vararg.annotation) if node.args.vararg.annotation else None + } + arguments.append(arg_info) + + # **kwargs + if node.args.kwarg: + arg_info = { + 'name': node.args.kwarg.arg, + 'type': 'kwarg', + 'annotation': self._get_annotation_string(node.args.kwarg.annotation) if node.args.kwarg.annotation else None + } + arguments.append(arg_info) + + # Keyword-only arguments + for arg in node.args.kwonlyargs: + arg_info = { + 'name': arg.arg, + 'type': 'keyword_only', + 'annotation': self._get_annotation_string(arg.annotation) if arg.annotation else None + } + arguments.append(arg_info) + + return arguments + + def extract_class_bases(self, node: ast.ClassDef) -> List[str]: + """Extract base class names.""" + bases = [] + for base in node.bases: + if isinstance(base, ast.Name): + bases.append(base.id) + elif isinstance(base, ast.Attribute): + bases.append(self._get_attribute_name(base)) + + return bases + + def extract_class_methods(self, node: ast.ClassDef) -> List[Dict[str, Any]]: + """Extract class method information.""" + methods = [] + + for child in node.body: + if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)): + method_info = { + 'name': child.name, + 'type': 'async_method' if isinstance(child, ast.AsyncFunctionDef) else 'method', + 'decorators': self.extract_decorators(child), + 'arguments': self.extract_function_arguments(child), + 'is_property': 'property' in self.extract_decorators(child), + 'is_static': 'staticmethod' in self.extract_decorators(child), + 'is_class': 'classmethod' in self.extract_decorators(child), + } + methods.append(method_info) + + return methods + + def extract_imports(self, tree: ast.AST) -> Dict[str, str]: + """Extract import statements and build alias mapping.""" + imports = {} + + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + name = alias.asname if alias.asname else alias.name + imports[name] = alias.name + elif isinstance(node, ast.ImportFrom): + if node.module: + for alias in node.names: + name = alias.asname if alias.asname else alias.name + imports[name] = f"{node.module}.{alias.name}" + + return imports + + def analyze_scope_context(self, node: ast.AST, parent_scopes: List[str] = None) -> List[str]: + """Analyze scope context for a node.""" + if parent_scopes is None: + parent_scopes = [] + + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + return parent_scopes + [node.name] + + return parent_scopes + + def find_variable_assignments(self, tree: ast.AST) -> List[Dict[str, Any]]: + """Find all variable assignments in the AST.""" + assignments = [] + + for node in ast.walk(tree): + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name): + assignment_info = { + 'name': target.id, + 'type': 'assignment', + 'position': self.get_node_position(node), + 'value_type': self._get_value_type(node.value) + } + assignments.append(assignment_info) + elif isinstance(node, ast.AnnAssign): + if isinstance(node.target, ast.Name): + assignment_info = { + 'name': node.target.id, + 'type': 'annotated_assignment', + 'position': self.get_node_position(node), + 'annotation': self._get_annotation_string(node.annotation), + 'value_type': self._get_value_type(node.value) if node.value else None + } + assignments.append(assignment_info) + + return assignments + + def find_function_calls(self, tree: ast.AST) -> List[Dict[str, Any]]: + """Find all function calls in the AST.""" + calls = [] + + for node in ast.walk(tree): + if isinstance(node, ast.Call): + call_info = { + 'function': self._get_call_name(node), + 'position': self.get_node_position(node), + 'args_count': len(node.args), + 'kwargs_count': len(node.keywords) + } + calls.append(call_info) + + return calls + + def _get_attribute_name(self, attr_node: ast.Attribute) -> str: + """Get full attribute name (e.g., module.Class).""" + parts = [] + current = attr_node + + while isinstance(current, ast.Attribute): + parts.append(current.attr) + current = current.value + + if isinstance(current, ast.Name): + parts.append(current.id) + + return ".".join(reversed(parts)) if parts else "" + + def _get_annotation_string(self, annotation: ast.AST) -> str: + """Convert annotation AST to string.""" + if isinstance(annotation, ast.Name): + return annotation.id + elif isinstance(annotation, ast.Attribute): + return self._get_attribute_name(annotation) + elif isinstance(annotation, ast.Constant): + return str(annotation.value) + elif isinstance(annotation, ast.Str): # Python < 3.8 + return annotation.s + else: + return str(type(annotation).__name__) + + def _get_value_type(self, value: ast.AST) -> str: + """Get the type of a value expression.""" + if isinstance(value, ast.Constant): + return type(value.value).__name__ + elif isinstance(value, (ast.Str, ast.Bytes)): # Python < 3.8 + return type(value.s).__name__ + elif isinstance(value, ast.Num): # Python < 3.8 + return type(value.n).__name__ + elif isinstance(value, ast.List): + return "list" + elif isinstance(value, ast.Dict): + return "dict" + elif isinstance(value, ast.Set): + return "set" + elif isinstance(value, ast.Tuple): + return "tuple" + elif isinstance(value, ast.Call): + return self._get_call_name(value) + else: + return "unknown" + + def _get_call_name(self, call_node: ast.Call) -> str: + """Get the name of a function call.""" + if isinstance(call_node.func, ast.Name): + return call_node.func.id + elif isinstance(call_node.func, ast.Attribute): + return self._get_attribute_name(call_node.func) + else: + return "unknown" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/enum_mapper.py b/src/code_index_mcp/scip/framework/python/enum_mapper.py new file mode 100644 index 0000000..8d679a3 --- /dev/null +++ b/src/code_index_mcp/scip/framework/python/enum_mapper.py @@ -0,0 +1,181 @@ +"""Python enum mapper implementation.""" + +from ..base.enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 + + +class PythonEnumMapper(BaseEnumMapper): + """Python-specific enum mapper for SCIP compliance.""" + + # Python symbol kind mappings + SYMBOL_KIND_MAP = { + 'function': scip_pb2.Function, + 'async_function': scip_pb2.Function, + 'method': scip_pb2.Method, + 'class': scip_pb2.Class, + 'variable': scip_pb2.Variable, + 'constant': scip_pb2.Constant, + 'module': scip_pb2.Module, + 'parameter': scip_pb2.Parameter, + 'property': scip_pb2.Property, + 'constructor': scip_pb2.Constructor, + 'field': scip_pb2.Field, + 'namespace': scip_pb2.Namespace, + } + + # Python syntax kind mappings (using actual SCIP protobuf attributes) + SYNTAX_KIND_MAP = { + 'function_definition': scip_pb2.IdentifierFunctionDefinition, + 'class_definition': scip_pb2.IdentifierType, + 'variable_definition': scip_pb2.IdentifierLocal, # Use IdentifierLocal instead of IdentifierVariable + 'parameter_definition': scip_pb2.IdentifierParameter, + 'identifier': scip_pb2.Identifier, + 'keyword': scip_pb2.IdentifierKeyword, + 'string_literal': scip_pb2.StringLiteral, + 'numeric_literal': scip_pb2.NumericLiteral, + 'boolean_literal': scip_pb2.BooleanLiteral, + 'comment': scip_pb2.Comment, + 'punctuation': scip_pb2.PunctuationDelimiter, + } + + # Python symbol role mappings (using official SCIP protobuf attributes) + SYMBOL_ROLE_MAP = { + 'definition': scip_pb2.Definition, + 'import': scip_pb2.Import, + 'write': scip_pb2.Write, # Official SCIP naming + 'read': scip_pb2.Read, # Official SCIP naming + 'generated': scip_pb2.Generated, + 'test': scip_pb2.Test, + 'type': scip_pb2.Type, # Add missing Type role + 'reference': scip_pb2.Read, # Default reference is read access + } + + def map_symbol_kind(self, language_kind: str) -> int: + """Map Python symbol type to SCIP SymbolKind.""" + kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SymbolKind'): + raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") + + return kind + + def map_syntax_kind(self, language_syntax: str) -> int: + """Map Python syntax element to SCIP SyntaxKind.""" + kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SyntaxKind'): + raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") + + return kind + + def map_symbol_role(self, language_role: str) -> int: + """Map Python symbol role to SCIP SymbolRole.""" + role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) + + # Validate enum value + if not self.validate_enum_value(role, 'SymbolRole'): + raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") + + return role + + def get_python_node_symbol_kind(self, node_type: str) -> str: + """ + Map Python AST node type to internal symbol kind string. + + Args: + node_type: Python AST node type (e.g., 'FunctionDef', 'ClassDef') + + Returns: + Internal symbol kind string for use with map_symbol_kind() + """ + node_kind_map = { + 'FunctionDef': 'function', + 'AsyncFunctionDef': 'async_function', + 'ClassDef': 'class', + 'Assign': 'variable', + 'AnnAssign': 'variable', + 'AugAssign': 'variable', + 'arg': 'parameter', + 'Import': 'module', + 'ImportFrom': 'module', + } + + return node_kind_map.get(node_type, 'variable') + + def get_python_node_syntax_kind(self, node_type: str, context: str = None) -> str: + """ + Map Python AST node type to internal syntax kind string. + + Args: + node_type: Python AST node type + context: Additional context for disambiguation + + Returns: + Internal syntax kind string for use with map_syntax_kind() + """ + node_syntax_map = { + 'FunctionDef': 'function_definition', + 'AsyncFunctionDef': 'function_definition', + 'ClassDef': 'class_definition', + 'Assign': 'variable_definition', + 'AnnAssign': 'variable_definition', + 'Name': 'identifier', + 'Str': 'string_literal', + 'Num': 'numeric_literal', + 'Constant': 'numeric_literal', # Python 3.8+ + 'NameConstant': 'boolean_literal', # True, False, None + } + + return node_syntax_map.get(node_type, 'identifier') + + def get_python_node_symbol_role(self, node_type: str, context: str = None) -> str: + """ + Map Python AST node type to internal symbol role string. + + Args: + node_type: Python AST node type + context: Additional context (e.g., 'in_assignment', 'in_call') + + Returns: + Internal symbol role string for use with map_symbol_role() + """ + if context == 'definition': + return 'definition' + elif context == 'assignment': + return 'write' + elif context == 'import': + return 'import' + elif node_type in ['FunctionDef', 'AsyncFunctionDef', 'ClassDef']: + return 'definition' + else: + return 'reference' + + def is_valid_python_symbol_kind(self, symbol_kind: str) -> bool: + """Check if symbol kind is valid for Python.""" + return symbol_kind in self.SYMBOL_KIND_MAP + + def is_valid_python_syntax_kind(self, syntax_kind: str) -> bool: + """Check if syntax kind is valid for Python.""" + return syntax_kind in self.SYNTAX_KIND_MAP + + def is_valid_python_symbol_role(self, symbol_role: str) -> bool: + """Check if symbol role is valid for Python.""" + return symbol_role in self.SYMBOL_ROLE_MAP + + def get_python_type_reference_role(self) -> str: + """Get symbol role for type references (e.g., in annotations).""" + return 'type' + + def get_all_python_symbol_kinds(self) -> list: + """Get all available Python symbol kinds.""" + return list(self.SYMBOL_KIND_MAP.keys()) + + def get_all_python_syntax_kinds(self) -> list: + """Get all available Python syntax kinds.""" + return list(self.SYNTAX_KIND_MAP.keys()) + + def get_all_python_symbol_roles(self) -> list: + """Get all available Python symbol roles.""" + return list(self.SYMBOL_ROLE_MAP.keys()) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/factory.py b/src/code_index_mcp/scip/framework/python/factory.py new file mode 100644 index 0000000..820dbef --- /dev/null +++ b/src/code_index_mcp/scip/framework/python/factory.py @@ -0,0 +1,583 @@ +"""Python SCIP Index Factory implementation.""" + +import ast +import os +import logging +from pathlib import Path +from typing import Set, List, Iterator, Optional, Dict +from ..base.index_factory import SCIPIndexFactory +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..base.enum_mapper import BaseEnumMapper +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from ..types import SCIPSymbolContext as SCIPContext, SCIPSymbolDescriptor +from .relationship_extractor import PythonRelationshipExtractor +from .enum_mapper import PythonEnumMapper +from .ast_analyzer import PythonASTAnalyzer +from ...proto import scip_pb2 + +logger = logging.getLogger(__name__) + + +class PythonSCIPIndexFactory(SCIPIndexFactory): + """Python-specific SCIP Index factory implementation with constructor injection.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """Initialize Python factory with required components via constructor injection.""" + super().__init__(project_root, symbol_generator, relationship_extractor, + enum_mapper, position_calculator) + self.ast_analyzer = PythonASTAnalyzer() + self._parsed_trees = {} # Cache parsed AST trees + self._current_file_symbols = set() # Track symbols defined in current file + + def get_language(self) -> str: + """Return language identifier.""" + return "python" + + def get_supported_extensions(self) -> Set[str]: + """Return supported file extensions.""" + return {'.py', '.pyw', '.pyx'} + + def _get_or_parse_tree(self, context: SCIPContext): + """Get cached AST tree or parse if not cached.""" + cache_key = context.file_path + if cache_key not in self._parsed_trees: + try: + self._parsed_trees[cache_key] = self.ast_analyzer.parse(context.content) + except SyntaxError: + self._parsed_trees[cache_key] = None + return self._parsed_trees[cache_key] + + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract Python symbol definitions using AST analysis.""" + tree = self._get_or_parse_tree(context) + if tree is None: + return + + # First pass: collect all defined symbols in this file + self._current_file_symbols.clear() + for node in self.ast_analyzer.walk(tree): + if self.ast_analyzer.is_symbol_definition(node): + symbol_name = self.ast_analyzer.get_symbol_name(node) + if symbol_name: + self._current_file_symbols.add(symbol_name) + + # Clear processed nodes cache for fresh traversal + self.ast_analyzer._processed_nodes.clear() + + for node in self.ast_analyzer.walk(tree): + if self.ast_analyzer.is_symbol_definition(node): + symbol_info = self._create_symbol_from_ast_node(node, context) + if symbol_info: + yield symbol_info + + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract Python symbol occurrences.""" + tree = self._get_or_parse_tree(context) + if tree is None: + return + + # First pass: collect all defined symbols in this file + self._current_file_symbols.clear() + for node in self.ast_analyzer.walk(tree): + if self.ast_analyzer.is_symbol_definition(node): + symbol_name = self.ast_analyzer.get_symbol_name(node) + if symbol_name: + self._current_file_symbols.add(symbol_name) + + # Need to clear processed nodes for occurrence extraction + # Since symbols were already extracted, the cache needs reset + self.ast_analyzer._processed_nodes.clear() + + for node in self.ast_analyzer.walk(tree): + if self.ast_analyzer.is_symbol_definition(node) or self.ast_analyzer.is_symbol_reference(node): + occurrence = self._create_occurrence_from_ast_node(node, context) + if occurrence: + yield occurrence + + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract Python external symbols from imports.""" + external_symbols = [] + + for doc in documents: + # Use cached AST tree if available - need full path for cache key + full_path = os.path.join(self.project_root, doc.relative_path) + cache_key = full_path + tree = self._parsed_trees.get(cache_key) + + if tree is None: + # Only parse if not already cached + try: + content = self._read_file(full_path) + tree = self.ast_analyzer.parse(content) + self._parsed_trees[cache_key] = tree + except (FileNotFoundError, SyntaxError): + continue + + if tree is not None: + for node in self.ast_analyzer.walk(tree): + if isinstance(node, (ast.Import, ast.ImportFrom)): + external_symbol = self._create_external_symbol_from_import(node) + if external_symbol: + external_symbols.append(external_symbol) + continue + + return external_symbols + + def clear_cache(self): + """Clear AST parsing cache.""" + self._parsed_trees.clear() + + def _create_symbol_from_ast_node(self, node: ast.AST, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: + """Create SCIP symbol information from AST node.""" + symbol_info = scip_pb2.SymbolInformation() + + if isinstance(node, ast.FunctionDef): + descriptor = SCIPSymbolDescriptor( + + name=node.name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = node.name + symbol_info.kind = self.enum_mapper.map_symbol_kind('function') + + # Add docstring if available + docstring = ast.get_docstring(node) + if docstring: + symbol_info.documentation.append(docstring) + + elif isinstance(node, ast.AsyncFunctionDef): + descriptor = SCIPSymbolDescriptor( + + name=node.name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = node.name + symbol_info.kind = self.enum_mapper.map_symbol_kind('async_function') + + # Add docstring if available + docstring = ast.get_docstring(node) + if docstring: + symbol_info.documentation.append(docstring) + + elif isinstance(node, ast.ClassDef): + descriptor = SCIPSymbolDescriptor( + + name=node.name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = node.name + symbol_info.kind = self.enum_mapper.map_symbol_kind('class') + + # Add docstring if available + docstring = ast.get_docstring(node) + if docstring: + symbol_info.documentation.append(docstring) + + else: + return None + + return symbol_info + + def _create_occurrence_from_ast_node(self, node: ast.AST, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence from AST node.""" + occurrence = scip_pb2.Occurrence() + + # Calculate position using position calculator + try: + position_info = self.position_calculator.calculate_positions( + context.content, node + ) + + # Set range + occurrence.range.start.extend([position_info.start_line, position_info.start_column]) + occurrence.range.end.extend([position_info.end_line, position_info.end_column]) + + except Exception as e: + # Skip if position calculation fails + return None + + # Set symbol and roles based on node type + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + descriptor = SCIPSymbolDescriptor( + + name=node.name, + + kind="function", + + scope_path=context.scope_stack, + + descriptor_suffix="()." + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function_definition') + + elif isinstance(node, ast.ClassDef): + descriptor = SCIPSymbolDescriptor( + + name=node.name, + + kind="class", + + scope_path=context.scope_stack, + + descriptor_suffix="#" + + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_definition') + + elif isinstance(node, ast.Name): + # Handle variable references + # Check if this is an internal or external symbol + is_internal = node.id in self._current_file_symbols + + if is_internal: + descriptor = SCIPSymbolDescriptor( + name=node.id, + kind="variable", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + else: + # External symbol - use appropriate namespace + # Common Python builtins + if node.id in {'str', 'int', 'float', 'bool', 'list', 'dict', 'set', 'tuple', + 'None', 'True', 'False', 'print', 'len', 'range', 'open'}: + occurrence.symbol = f"python-builtin {node.id}" + else: + # Assume it's from an import or global scope + occurrence.symbol = f"python {node.id}" + + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') + + elif isinstance(node, ast.Call): + # Handle function calls + func_name = self._extract_call_name(node.func) + if func_name: + # Check if this is an internal or external function + is_internal = func_name in self._current_file_symbols + + if is_internal: + # Internal function/method + if isinstance(node.func, ast.Attribute): + # Method call - use method descriptor + descriptor = SCIPSymbolDescriptor( + name=func_name, + kind="method", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + else: + # Function call + descriptor = SCIPSymbolDescriptor( + name=func_name, + kind="function", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + else: + # External function + if func_name in {'print', 'len', 'range', 'open', 'input', 'int', 'str', 'float'}: + occurrence.symbol = f"python-builtin {func_name}()." + else: + occurrence.symbol = f"python {func_name}()." + + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function') + else: + return None + + elif isinstance(node, ast.Attribute): + # Handle attribute access (including method references) + attr_name = node.attr + descriptor = SCIPSymbolDescriptor( + name=attr_name, + kind="variable", # Could be method, property, or field + scope_path=context.scope_stack, + descriptor_suffix="" + ) + + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') + + else: + return None + + return occurrence + + def _create_external_symbol_from_import(self, node: ast.AST) -> Optional[scip_pb2.SymbolInformation]: + """Create external symbol from import statement.""" + symbol_info = scip_pb2.SymbolInformation() + + if isinstance(node, ast.Import): + for alias in node.names: + symbol_info.symbol = f"python-stdlib {alias.name}" + symbol_info.display_name = alias.name + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"Imported module: {alias.name}") + return symbol_info + + elif isinstance(node, ast.ImportFrom): + if node.module: + symbol_info.symbol = f"python-stdlib {node.module}" + symbol_info.display_name = node.module + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"Imported from module: {node.module}") + return symbol_info + + return None + + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build Python-specific cross-document relationships. + + This implementation analyzes Python import statements and creates proper + cross-document relationships using package-qualified symbol names. + """ + logger.info(f"Building Python cross-document relationships for {len(documents)} files") + + # Step 1: Analyze Python imports across all documents + import_mapping = self._analyze_python_imports(documents) + + # Step 2: Build Python-specific symbol registry + symbol_registry = self._build_python_symbol_registry(documents, import_mapping) + + # Step 3: Process cross-document references + relationships_added = self._create_python_cross_document_relationships( + documents, symbol_registry, import_mapping + ) + + logger.info(f"Added {relationships_added} Python cross-document relationships") + return relationships_added + + def _analyze_python_imports(self, documents: List[scip_pb2.Document]) -> Dict[str, Dict[str, str]]: + """ + Analyze Python import statements across all documents. + + Returns: + Dict mapping file_path -> {symbol_name -> full_module_path} + """ + import_mapping = {} + + for doc in documents: + file_imports = {} + + # Get full file path for AST parsing + full_path = os.path.join(self.project_root, doc.relative_path) + cache_key = full_path + + # Use cached AST tree if available + tree = self._parsed_trees.get(cache_key) + if tree is None: + try: + content = self._read_file(full_path) + if content: + tree = self.ast_analyzer.parse(content) + self._parsed_trees[cache_key] = tree + except (FileNotFoundError, SyntaxError): + continue + + if tree is not None: + # Extract import information + for node in self.ast_analyzer.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + imported_name = alias.asname if alias.asname else alias.name.split('.')[-1] + file_imports[imported_name] = alias.name + + elif isinstance(node, ast.ImportFrom): + if node.module: + for alias in node.names: + imported_name = alias.asname if alias.asname else alias.name + full_name = f"{node.module}.{alias.name}" + file_imports[imported_name] = full_name + + import_mapping[doc.relative_path] = file_imports + + logger.debug(f"Analyzed imports for {len(import_mapping)} Python files") + return import_mapping + + def _build_python_symbol_registry(self, documents: List[scip_pb2.Document], + import_mapping: Dict[str, Dict[str, str]]) -> Dict[str, tuple]: + """ + Build symbol registry with proper Python package-qualified names. + + Returns: + Dict mapping full_symbol_id -> (document, symbol_info) + """ + symbol_registry = {} + + for doc in documents: + module_path = self._file_path_to_module_path(doc.relative_path) + + for symbol_info in doc.symbols: + local_symbol = symbol_info.symbol + + # Convert local symbol to package-qualified symbol + if local_symbol.startswith('local '): + symbol_name = local_symbol[6:] # Remove 'local ' prefix + + # Create package-qualified symbol + package_symbol = f"python pypi {Path(self.project_root).name} HEAD {module_path}.{symbol_name}" + symbol_registry[package_symbol] = (doc, symbol_info) + + # Also register the local version for backward compatibility + symbol_registry[local_symbol] = (doc, symbol_info) + + logger.debug(f"Built Python symbol registry with {len(symbol_registry)} entries") + return symbol_registry + + def _create_python_cross_document_relationships(self, documents: List[scip_pb2.Document], + symbol_registry: Dict[str, tuple], + import_mapping: Dict[str, Dict[str, str]]) -> int: + """ + Create cross-document relationships based on Python import analysis. + """ + relationships_added = 0 + + for source_doc in documents: + file_imports = import_mapping.get(source_doc.relative_path, {}) + + for occurrence in source_doc.occurrences: + # Skip if not a reference + if not (occurrence.symbol_roles & 8): # ReadAccess + continue + + # Skip if it's also a definition + if occurrence.symbol_roles & 1: # Definition + continue + + # Check if this is a cross-file reference based on imports + symbol_name = self._extract_symbol_name_from_occurrence(occurrence) + if symbol_name in file_imports: + # This is a reference to an imported symbol + target_module = file_imports[symbol_name] + target_symbol_id = f"python pypi {Path(self.project_root).name} HEAD {target_module}" + + target_entry = symbol_registry.get(target_symbol_id) + if target_entry: + target_doc, target_symbol_info = target_entry + + # Find the containing symbol in source document + source_symbol_id = self._find_containing_symbol_in_python(occurrence, source_doc) + + if source_symbol_id and source_symbol_id != target_symbol_id: + # Create relationship + relationship = scip_pb2.Relationship() + relationship.symbol = source_symbol_id + relationship.is_reference = True + + # Check for duplicates + if not any(rel.symbol == source_symbol_id for rel in target_symbol_info.relationships): + target_symbol_info.relationships.append(relationship) + relationships_added += 1 + + return relationships_added + + def _extract_call_name(self, func_node: ast.AST) -> Optional[str]: + """Extract the function name from a Call node's func attribute.""" + if isinstance(func_node, ast.Name): + return func_node.id + elif isinstance(func_node, ast.Attribute): + return func_node.attr + return None + + def _file_path_to_module_path(self, file_path: str) -> str: + """Convert file path to Python module path.""" + # Remove .py extension and convert path separators to dots + module_path = file_path.replace('\\', '/').replace('.py', '').replace('/', '.') + + # Remove common prefixes + if module_path.startswith('src.'): + module_path = module_path[4:] + + return module_path + + def _extract_symbol_name_from_occurrence(self, occurrence: scip_pb2.Occurrence) -> str: + """Extract simple symbol name from SCIP occurrence.""" + symbol = occurrence.symbol + if symbol.startswith('local '): + return symbol[6:].split('.')[0] # Get first part after 'local ' + return symbol.split('.')[-1] # Get last part of qualified name + + def _find_containing_symbol_in_python(self, occurrence: scip_pb2.Occurrence, + document: scip_pb2.Document) -> Optional[str]: + """Find which Python symbol contains this occurrence.""" + if not occurrence.range or not occurrence.range.start: + return None + + occurrence_line = occurrence.range.start[0] if len(occurrence.range.start) > 0 else 0 + + # Find the most specific containing symbol + containing_symbol = None + for symbol_info in document.symbols: + # Simple heuristic: assume we're in the first function/class found + if symbol_info.kind in [11, 3]: # Function or Class + containing_symbol = symbol_info.symbol + break + + return containing_symbol + + +def create_python_scip_factory(project_root: str) -> PythonSCIPIndexFactory: + """ + Factory creator for Python SCIP factory. + Ensures all required components are properly assembled via constructor injection. + """ + symbol_generator = SCIPSymbolGenerator( + scheme="scip-python", + package_manager="local", + package_name=Path(project_root).name, + version="HEAD" + ) + + relationship_extractor = PythonRelationshipExtractor() + enum_mapper = PythonEnumMapper() + position_calculator = SCIPPositionCalculator() + + return PythonSCIPIndexFactory( + project_root=project_root, + symbol_generator=symbol_generator, + relationship_extractor=relationship_extractor, # Guaranteed to be provided + enum_mapper=enum_mapper, + position_calculator=position_calculator + ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/relationship_extractor.py b/src/code_index_mcp/scip/framework/python/relationship_extractor.py new file mode 100644 index 0000000..bc6778e --- /dev/null +++ b/src/code_index_mcp/scip/framework/python/relationship_extractor.py @@ -0,0 +1,205 @@ +"""Python relationship extractor implementation.""" + +import ast +from typing import Iterator +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..types import SCIPContext, Relationship +from ...core.relationship_types import InternalRelationshipType + + +class PythonRelationshipExtractor(BaseRelationshipExtractor): + """Python-specific relationship extractor using AST analysis.""" + + def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract inheritance relationships from Python classes.""" + try: + tree = ast.parse(context.content) + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_symbol_id = self._create_class_symbol_id(node.name, context) + + # Extract base classes + for base in node.bases: + if isinstance(base, ast.Name): + parent_symbol_id = self._create_class_symbol_id(base.id, context) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + elif isinstance(base, ast.Attribute): + # Handle module.ClassName inheritance + parent_name = self._get_attribute_name(base) + if parent_name: + parent_symbol_id = self._create_class_symbol_id(parent_name, context) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=parent_symbol_id, + relationship_type=InternalRelationshipType.INHERITS + ) + + except SyntaxError: + # Skip files with syntax errors + return + + def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract function/method call relationships.""" + try: + tree = ast.parse(context.content) + + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + function_symbol_id = self._create_function_symbol_id(node.name, context) + + # Find function calls within this function + for child in ast.walk(node): + if isinstance(child, ast.Call): + target_function = self._extract_call_target(child) + if target_function: + target_symbol_id = self._create_function_symbol_id(target_function, context) + yield Relationship( + source_symbol=function_symbol_id, + target_symbol=target_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + + except SyntaxError: + # Skip files with syntax errors + return + + def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract import/dependency relationships.""" + try: + tree = ast.parse(context.content) + + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + module_symbol_id = f"python-stdlib {alias.name}" + file_symbol_id = self._create_file_symbol_id(context.file_path) + + yield Relationship( + source_symbol=file_symbol_id, + target_symbol=module_symbol_id, + relationship_type=InternalRelationshipType.IMPORTS + ) + + elif isinstance(node, ast.ImportFrom): + if node.module: + module_symbol_id = f"python-stdlib {node.module}" + file_symbol_id = self._create_file_symbol_id(context.file_path) + + yield Relationship( + source_symbol=file_symbol_id, + target_symbol=module_symbol_id, + relationship_type=InternalRelationshipType.IMPORTS + ) + + except SyntaxError: + # Skip files with syntax errors + return + + def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract composition relationships (class attributes).""" + try: + tree = ast.parse(context.content) + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_symbol_id = self._create_class_symbol_id(node.name, context) + + # Find attribute assignments in __init__ method + for child in ast.walk(node): + if isinstance(child, ast.FunctionDef) and child.name == "__init__": + for assign_node in ast.walk(child): + if isinstance(assign_node, ast.Assign): + for target in assign_node.targets: + if isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name) and target.value.id == "self": + # This is a self.attribute assignment + attribute_symbol_id = self._create_attribute_symbol_id(target.attr, class_symbol_id) + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=attribute_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except SyntaxError: + # Skip files with syntax errors + return + + def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract interface relationships (protocols, abstract base classes).""" + try: + tree = ast.parse(context.content) + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_symbol_id = self._create_class_symbol_id(node.name, context) + + # Check for abstract methods (indicating interface-like behavior) + has_abstract_methods = False + for child in ast.walk(node): + if isinstance(child, ast.FunctionDef): + # Check for @abstractmethod decorator + for decorator in child.decorator_list: + if isinstance(decorator, ast.Name) and decorator.id == "abstractmethod": + has_abstract_methods = True + break + + if has_abstract_methods: + # This class implements an interface pattern + interface_symbol_id = f"{class_symbol_id}_interface" + yield Relationship( + source_symbol=class_symbol_id, + target_symbol=interface_symbol_id, + relationship_type=InternalRelationshipType.IMPLEMENTS + ) + + except SyntaxError: + # Skip files with syntax errors + return + + def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: + """Create symbol ID for class.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{class_name}" if scope_path else class_name + return f"local {local_id}#" + + def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: + """Create symbol ID for function.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{function_name}" if scope_path else function_name + return f"local {local_id}()." + + def _create_attribute_symbol_id(self, attribute_name: str, class_symbol_id: str) -> str: + """Create symbol ID for class attribute.""" + # Extract class name from class symbol ID + class_name = class_symbol_id.replace("local ", "").replace("#", "") + return f"local {class_name}.{attribute_name}" + + def _create_file_symbol_id(self, file_path: str) -> str: + """Create symbol ID for file.""" + return f"local {file_path}" + + def _extract_call_target(self, call_node: ast.Call) -> str: + """Extract the target function name from a call node.""" + if isinstance(call_node.func, ast.Name): + return call_node.func.id + elif isinstance(call_node.func, ast.Attribute): + return call_node.func.attr + return None + + def _get_attribute_name(self, attr_node: ast.Attribute) -> str: + """Get the full attribute name (e.g., module.Class).""" + parts = [] + current = attr_node + + while isinstance(current, ast.Attribute): + parts.append(current.attr) + current = current.value + + if isinstance(current, ast.Name): + parts.append(current.id) + + return ".".join(reversed(parts)) if parts else None \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/relationship_manager.py b/src/code_index_mcp/scip/framework/relationship_manager.py new file mode 100644 index 0000000..5aef27f --- /dev/null +++ b/src/code_index_mcp/scip/framework/relationship_manager.py @@ -0,0 +1,406 @@ +"""SCIP Relationship Manager - Comprehensive symbol relationship extraction and management.""" + +import logging +from typing import Dict, List, Set, Tuple, Optional, Any +from enum import Enum +from dataclasses import dataclass + +from ..proto import scip_pb2 + + +logger = logging.getLogger(__name__) + + +class RelationshipType(Enum): + """Standard relationship types for symbol analysis.""" + INHERITANCE = "inheritance" + IMPLEMENTATION = "implementation" + COMPOSITION = "composition" + DEPENDENCY = "dependency" + CALL = "call" + IMPORT = "import" + REFERENCE = "reference" + TYPE_DEFINITION = "type_definition" + OVERRIDE = "override" + INSTANTIATION = "instantiation" + + +@dataclass(frozen=True) +class SymbolRelationship: + """Immutable symbol relationship representation.""" + source_symbol: str + target_symbol: str + relationship_type: RelationshipType + confidence: float = 1.0 + source_location: Optional[str] = None + additional_info: Optional[Dict[str, Any]] = None + + +class SCIPRelationshipManager: + """ + Comprehensive relationship manager for SCIP symbol relationships. + + This manager handles the extraction, validation, and conversion of symbol + relationships to SCIP format, ensuring complete relationship networks. + """ + + def __init__(self): + """Initialize the relationship manager.""" + self._relationships: Dict[str, List[SymbolRelationship]] = {} + self._reverse_relationships: Dict[str, List[SymbolRelationship]] = {} + self._relationship_count_by_type: Dict[RelationshipType, int] = {} + + # Initialize counters + for rel_type in RelationshipType: + self._relationship_count_by_type[rel_type] = 0 + + logger.debug("Initialized SCIP Relationship Manager") + + def add_relationship(self, + source_symbol: str, + target_symbol: str, + relationship_type: RelationshipType, + confidence: float = 1.0, + source_location: Optional[str] = None, + additional_info: Optional[Dict[str, Any]] = None) -> None: + """ + Add a symbol relationship to the manager. + + Args: + source_symbol: Source symbol ID + target_symbol: Target symbol ID + relationship_type: Type of relationship + confidence: Confidence level (0.0-1.0) + source_location: Location where relationship was detected + additional_info: Additional metadata about the relationship + """ + if not self._validate_symbol_id(source_symbol): + logger.warning(f"Invalid source symbol ID: {source_symbol}") + return + + if not self._validate_symbol_id(target_symbol): + logger.warning(f"Invalid target symbol ID: {target_symbol}") + return + + if not 0.0 <= confidence <= 1.0: + logger.warning(f"Invalid confidence value: {confidence}, setting to 1.0") + confidence = 1.0 + + relationship = SymbolRelationship( + source_symbol=source_symbol, + target_symbol=target_symbol, + relationship_type=relationship_type, + confidence=confidence, + source_location=source_location, + additional_info=additional_info or {} + ) + + # Add to forward relationships + if source_symbol not in self._relationships: + self._relationships[source_symbol] = [] + + # Check for duplicates + existing = [r for r in self._relationships[source_symbol] + if r.target_symbol == target_symbol and r.relationship_type == relationship_type] + if existing: + logger.debug(f"Duplicate relationship ignored: {source_symbol} -> {target_symbol} ({relationship_type})") + return + + self._relationships[source_symbol].append(relationship) + + # Add to reverse relationships + if target_symbol not in self._reverse_relationships: + self._reverse_relationships[target_symbol] = [] + self._reverse_relationships[target_symbol].append(relationship) + + # Update counters + self._relationship_count_by_type[relationship_type] += 1 + + logger.debug(f"Added relationship: {source_symbol} --{relationship_type.value}--> {target_symbol}") + + def get_relationships(self, symbol_id: str) -> List[SymbolRelationship]: + """ + Get all outgoing relationships for a symbol. + + Args: + symbol_id: Symbol ID to get relationships for + + Returns: + List of relationships where symbol is the source + """ + return self._relationships.get(symbol_id, []) + + def get_reverse_relationships(self, symbol_id: str) -> List[SymbolRelationship]: + """ + Get all incoming relationships for a symbol. + + Args: + symbol_id: Symbol ID to get incoming relationships for + + Returns: + List of relationships where symbol is the target + """ + return self._reverse_relationships.get(symbol_id, []) + + def get_relationships_by_type(self, + symbol_id: str, + relationship_type: RelationshipType) -> List[SymbolRelationship]: + """ + Get relationships of a specific type for a symbol. + + Args: + symbol_id: Symbol ID + relationship_type: Type of relationship to filter by + + Returns: + List of relationships of the specified type + """ + all_relationships = self.get_relationships(symbol_id) + return [r for r in all_relationships if r.relationship_type == relationship_type] + + def has_relationship(self, + source_symbol: str, + target_symbol: str, + relationship_type: Optional[RelationshipType] = None) -> bool: + """ + Check if a relationship exists between two symbols. + + Args: + source_symbol: Source symbol ID + target_symbol: Target symbol ID + relationship_type: Optional specific relationship type to check + + Returns: + True if relationship exists + """ + relationships = self.get_relationships(source_symbol) + + for rel in relationships: + if rel.target_symbol == target_symbol: + if relationship_type is None or rel.relationship_type == relationship_type: + return True + + return False + + def get_inheritance_chain(self, symbol_id: str) -> List[str]: + """ + Get the complete inheritance chain for a symbol. + + Args: + symbol_id: Symbol ID to get inheritance chain for + + Returns: + List of symbol IDs in inheritance order (immediate parent first) + """ + chain = [] + visited = set() + current = symbol_id + + while current and current not in visited: + visited.add(current) + inheritance_rels = self.get_relationships_by_type(current, RelationshipType.INHERITANCE) + + if inheritance_rels: + # Take the first inheritance relationship + parent = inheritance_rels[0].target_symbol + chain.append(parent) + current = parent + else: + break + + return chain + + def get_call_graph(self, symbol_id: str, max_depth: int = 5) -> Dict[str, List[str]]: + """ + Get the call graph for a symbol (what it calls). + + Args: + symbol_id: Symbol ID to get call graph for + max_depth: Maximum depth to traverse + + Returns: + Dictionary mapping symbol IDs to their called functions + """ + call_graph = {} + visited = set() + + def traverse(current_symbol: str, depth: int): + if depth >= max_depth or current_symbol in visited: + return + + visited.add(current_symbol) + call_relationships = self.get_relationships_by_type(current_symbol, RelationshipType.CALL) + + if call_relationships: + called_symbols = [r.target_symbol for r in call_relationships] + call_graph[current_symbol] = called_symbols + + # Recursively traverse called functions + for called_symbol in called_symbols: + traverse(called_symbol, depth + 1) + + traverse(symbol_id, 0) + return call_graph + + def get_dependency_graph(self, symbol_id: str) -> Dict[str, List[str]]: + """ + Get the dependency graph for a symbol. + + Args: + symbol_id: Symbol ID to get dependencies for + + Returns: + Dictionary mapping symbol to its dependencies + """ + dependency_rels = self.get_relationships_by_type(symbol_id, RelationshipType.DEPENDENCY) + import_rels = self.get_relationships_by_type(symbol_id, RelationshipType.IMPORT) + + dependencies = [] + dependencies.extend([r.target_symbol for r in dependency_rels]) + dependencies.extend([r.target_symbol for r in import_rels]) + + return {symbol_id: dependencies} if dependencies else {} + + def convert_to_scip_relationships(self, symbol_id: str) -> List[scip_pb2.Relationship]: + """ + Convert symbol relationships to SCIP Relationship objects. + + Args: + symbol_id: Symbol ID to convert relationships for + + Returns: + List of SCIP Relationship objects + """ + relationships = self.get_relationships(symbol_id) + scip_relationships = [] + + for rel in relationships: + scip_rel = scip_pb2.Relationship() + scip_rel.symbol = rel.target_symbol + + # Map relationship types to SCIP boolean flags + if rel.relationship_type == RelationshipType.REFERENCE: + scip_rel.is_reference = True + elif rel.relationship_type == RelationshipType.IMPLEMENTATION: + scip_rel.is_implementation = True + elif rel.relationship_type == RelationshipType.TYPE_DEFINITION: + scip_rel.is_type_definition = True + elif rel.relationship_type == RelationshipType.INHERITANCE: + scip_rel.is_definition = True # Inheritance implies definition relationship + else: + # For other relationship types, mark as reference + scip_rel.is_reference = True + + scip_relationships.append(scip_rel) + + return scip_relationships + + def add_inheritance_relationship(self, child_symbol: str, parent_symbol: str, + confidence: float = 1.0, source_location: Optional[str] = None): + """Add an inheritance relationship (child inherits from parent).""" + self.add_relationship( + child_symbol, parent_symbol, RelationshipType.INHERITANCE, + confidence=confidence, source_location=source_location, + additional_info={"relationship_description": f"{child_symbol} inherits from {parent_symbol}"} + ) + + def add_call_relationship(self, caller_symbol: str, callee_symbol: str, + confidence: float = 1.0, source_location: Optional[str] = None): + """Add a call relationship (caller calls callee).""" + self.add_relationship( + caller_symbol, callee_symbol, RelationshipType.CALL, + confidence=confidence, source_location=source_location, + additional_info={"relationship_description": f"{caller_symbol} calls {callee_symbol}"} + ) + + def add_import_relationship(self, importer_symbol: str, imported_symbol: str, + confidence: float = 1.0, source_location: Optional[str] = None): + """Add an import relationship (importer imports imported).""" + self.add_relationship( + importer_symbol, imported_symbol, RelationshipType.IMPORT, + confidence=confidence, source_location=source_location, + additional_info={"relationship_description": f"{importer_symbol} imports {imported_symbol}"} + ) + + def add_composition_relationship(self, composite_symbol: str, component_symbol: str, + confidence: float = 1.0, source_location: Optional[str] = None): + """Add a composition relationship (composite contains component).""" + self.add_relationship( + composite_symbol, component_symbol, RelationshipType.COMPOSITION, + confidence=confidence, source_location=source_location, + additional_info={"relationship_description": f"{composite_symbol} contains {component_symbol}"} + ) + + def get_statistics(self) -> Dict[str, Any]: + """Get comprehensive statistics about relationships.""" + total_symbols_with_relationships = len(self._relationships) + total_relationships = sum(len(rels) for rels in self._relationships.values()) + + return { + "total_symbols_with_relationships": total_symbols_with_relationships, + "total_relationships": total_relationships, + "relationships_by_type": dict(self._relationship_count_by_type), + "average_relationships_per_symbol": ( + total_relationships / total_symbols_with_relationships + if total_symbols_with_relationships > 0 else 0 + ), + "symbols_with_incoming_relationships": len(self._reverse_relationships) + } + + def validate_relationship_integrity(self) -> List[str]: + """ + Validate the integrity of all relationships. + + Returns: + List of validation warnings/errors + """ + issues = [] + + # Check for circular inheritance + for symbol_id in self._relationships: + chain = self.get_inheritance_chain(symbol_id) + if symbol_id in chain: + issues.append(f"Circular inheritance detected for {symbol_id}") + + # Check for self-references (except for specific types) + for symbol_id, relationships in self._relationships.items(): + for rel in relationships: + if rel.target_symbol == symbol_id and rel.relationship_type not in [ + RelationshipType.REFERENCE, RelationshipType.CALL + ]: + issues.append(f"Self-reference detected: {symbol_id} -> {rel.relationship_type.value}") + + # Check confidence levels + for relationships in self._relationships.values(): + for rel in relationships: + if rel.confidence < 0.5: + issues.append(f"Low confidence relationship: {rel.source_symbol} -> {rel.target_symbol} ({rel.confidence})") + + return issues + + def _validate_symbol_id(self, symbol_id: str) -> bool: + """Validate symbol ID format.""" + return bool(symbol_id and isinstance(symbol_id, str) and len(symbol_id.strip()) > 0) + + def clear(self): + """Clear all relationships.""" + self._relationships.clear() + self._reverse_relationships.clear() + for rel_type in RelationshipType: + self._relationship_count_by_type[rel_type] = 0 + logger.debug("Cleared all relationships") + + def export_relationships(self) -> Dict[str, Any]: + """Export all relationships for serialization.""" + exported = {} + for symbol_id, relationships in self._relationships.items(): + exported[symbol_id] = [] + for rel in relationships: + exported[symbol_id].append({ + "target_symbol": rel.target_symbol, + "relationship_type": rel.relationship_type.value, + "confidence": rel.confidence, + "source_location": rel.source_location, + "additional_info": rel.additional_info + }) + return exported \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/standard_framework.py b/src/code_index_mcp/scip/framework/standard_framework.py new file mode 100644 index 0000000..f8c7605 --- /dev/null +++ b/src/code_index_mcp/scip/framework/standard_framework.py @@ -0,0 +1,354 @@ +"""SCIP Standard Framework - SCIP standard framework enforcing compliance.""" + +import logging +import os +from abc import ABC, abstractmethod +from pathlib import Path +from typing import List, Optional, Dict, Any + +from .types import SCIPSymbolContext, SCIPSymbolExtractor +from .symbol_generator import SCIPSymbolGenerator +from .position_calculator import SCIPPositionCalculator +from .base.enum_mapper import BaseEnumMapper +from .compliance_validator import SCIPComplianceValidator +from ..proto import scip_pb2 + + +logger = logging.getLogger(__name__) + + +class SCIPStandardFramework(ABC): + """SCIP standard framework - enforces compliance across all language strategies.""" + + def __init__(self, language: str, project_root: str, version: str = "HEAD"): + """ + Initialize SCIP standard framework. + + Args: + language: Programming language (e.g., 'python', 'javascript') + project_root: Absolute path to project root + version: Project version for symbol generation + """ + self.language = language.lower() + self.project_root = Path(project_root).resolve() + self.version = version + + # Core components - mandatory initialization + self._symbol_generator = self._create_symbol_generator() + self._position_calculator = SCIPPositionCalculator() + self._enum_mapper = self._create_enum_mapper() + self._validator = SCIPComplianceValidator() + + logger.debug(f"Initialized SCIP framework for {language} project: {self.project_root.name}") + + def _create_symbol_generator(self) -> SCIPSymbolGenerator: + """Create SCIP standard symbol generator.""" + return SCIPSymbolGenerator( + scheme=f"scip-{self.language}", + package_manager="local", + package_name=self.project_root.name, + version=self.version + ) + + @abstractmethod + def _create_enum_mapper(self) -> BaseEnumMapper: + """Subclasses must implement language-specific enum mapping.""" + raise NotImplementedError("Subclasses must implement _create_enum_mapper") + + def process_file(self, file_path: str, extractor: SCIPSymbolExtractor) -> scip_pb2.Document: + """ + Standardized file processing pipeline - enforces compliance. + + Args: + file_path: Path to file to process + extractor: Symbol extractor implementation + + Returns: + SCIP Document with full compliance validation + + Raises: + ValueError: If input validation fails + RuntimeError: If processing fails or compliance validation fails + """ + + # 1. Validate input + self._validate_file_input(file_path) + + # 2. Create document base structure + document = self._create_document_base(file_path) + + # 3. Read content and create context + content = self._read_file_safe(file_path) + context = SCIPSymbolContext( + file_path=file_path, + content=content, + scope_stack=[], + imports={} + ) + + # 4. Extract symbols and generate SCIP elements + occurrences, symbols = self._extract_scip_elements(context, extractor) + + # 5. Validate and add to document + document.occurrences.extend(self._validate_occurrences(occurrences)) + document.symbols.extend(self._validate_symbols(symbols)) + + # 6. Final compliance check + if not self._validator.validate_document(document): + validation_summary = self._validator.get_validation_summary() + raise RuntimeError(f"Document failed SCIP compliance validation: {validation_summary['error_messages']}") + + logger.debug(f"Successfully processed {file_path} with {len(document.occurrences)} occurrences and {len(document.symbols)} symbols") + return document + + def process_files(self, file_paths: List[str], extractors: Dict[str, SCIPSymbolExtractor]) -> List[scip_pb2.Document]: + """ + Process multiple files with appropriate extractors. + + Args: + file_paths: List of file paths to process + extractors: Mapping of file extensions to extractors + + Returns: + List of SCIP documents + """ + documents = [] + + for file_path in file_paths: + try: + # Determine appropriate extractor + file_ext = Path(file_path).suffix.lower() + extractor = extractors.get(file_ext) + + if not extractor: + logger.warning(f"No extractor available for {file_ext}, skipping {file_path}") + continue + + # Process file + document = self.process_file(file_path, extractor) + documents.append(document) + + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + # Continue processing other files + continue + + logger.info(f"Processed {len(documents)} files successfully out of {len(file_paths)} total") + return documents + + def create_complete_index(self, file_paths: List[str], extractors: Dict[str, SCIPSymbolExtractor]) -> scip_pb2.Index: + """ + Create complete SCIP index with all 6 essential content categories. + + Args: + file_paths: List of file paths to index + extractors: Mapping of file extensions to extractors + + Returns: + Complete SCIP Index + """ + index = scip_pb2.Index() + + # 1. Create metadata (Category 1) + index.metadata.CopyFrom(self._create_metadata()) + + # 2. Process all documents (Category 2) + documents = self.process_files(file_paths, extractors) + index.documents.extend(documents) + + # 3. Extract external symbols (Category 6) + external_symbols = self._extract_external_symbols(documents) + index.external_symbols.extend(external_symbols) + + # 4. Validate complete index + if not self._validator.validate_index(index): + validation_summary = self._validator.get_validation_summary() + raise RuntimeError(f"Index failed SCIP compliance validation: {validation_summary['error_messages']}") + + logger.info(f"Created complete SCIP index with {len(documents)} documents and {len(external_symbols)} external symbols") + return index + + def _validate_file_input(self, file_path: str) -> None: + """Validate file input parameters.""" + if not file_path: + raise ValueError("File path cannot be empty") + + path = Path(file_path) + if not path.exists(): + raise ValueError(f"File does not exist: {file_path}") + + if not path.is_file(): + raise ValueError(f"Path is not a file: {file_path}") + + def _create_document_base(self, file_path: str) -> scip_pb2.Document: + """Create base document structure.""" + document = scip_pb2.Document() + + # Set relative path from project root + try: + relative_path = Path(file_path).relative_to(self.project_root) + document.relative_path = str(relative_path).replace('\\', '/') + except ValueError: + # File is outside project root, use absolute path + document.relative_path = str(Path(file_path)).replace('\\', '/') + + document.language = self.language + + return document + + def _create_metadata(self) -> scip_pb2.Metadata: + """Create SCIP metadata with standard compliance.""" + metadata = scip_pb2.Metadata() + metadata.version = scip_pb2.ProtocolVersion.UnspecifiedProtocolVersion + + # Tool information + metadata.tool_info.name = "code-index-mcp" + metadata.tool_info.version = "2.1.1" + metadata.tool_info.arguments.extend(["scip-indexing", self.language]) + + # Project information + metadata.project_root = str(self.project_root) + metadata.text_document_encoding = scip_pb2.UTF8 + + return metadata + + def _read_file_safe(self, file_path: str) -> str: + """Read file content with encoding detection.""" + encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] + + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + + raise RuntimeError(f"Could not decode {file_path} with any supported encoding") + + def _extract_scip_elements(self, context: SCIPSymbolContext, extractor: SCIPSymbolExtractor) -> tuple: + """Extract SCIP elements using provided extractor.""" + occurrences = [] + symbols = [] + + try: + # Extract symbol definitions + for symbol_desc in extractor.extract_symbols(context): + try: + # Create SCIP symbol + symbol_id = self._symbol_generator.create_local_symbol(symbol_desc) + + # Map to SCIP enums + symbol_kind = self._enum_mapper.validate_and_map_symbol_kind(symbol_desc.kind) + + # Create symbol information + symbol_info = scip_pb2.SymbolInformation() + symbol_info.symbol = symbol_id + symbol_info.display_name = symbol_desc.name + symbol_info.kind = symbol_kind + + symbols.append(symbol_info) + + except Exception as e: + logger.warning(f"Failed to create symbol for {symbol_desc.name}: {e}") + continue + + # Extract symbol references + for symbol_desc, position_info in extractor.extract_references(context): + try: + # Create SCIP symbol ID + symbol_id = self._symbol_generator.create_local_symbol(symbol_desc) + + # Create SCIP range + range_obj = scip_pb2.Range() + range_obj.start.extend([position_info.start_line, position_info.start_column]) + range_obj.end.extend([position_info.end_line, position_info.end_column]) + + # Map to SCIP enums + symbol_role = self._enum_mapper.validate_and_map_symbol_role("reference") + syntax_kind = self._enum_mapper.validate_and_map_syntax_kind("identifier") + + # Create occurrence + occurrence = scip_pb2.Occurrence() + occurrence.symbol = symbol_id + occurrence.symbol_roles = symbol_role + occurrence.syntax_kind = syntax_kind + occurrence.range.CopyFrom(range_obj) + + occurrences.append(occurrence) + + except Exception as e: + logger.warning(f"Failed to create occurrence for {symbol_desc.name}: {e}") + continue + + except Exception as e: + logger.error(f"Symbol extraction failed: {e}") + raise RuntimeError(f"Failed to extract symbols: {e}") + + return occurrences, symbols + + def _validate_occurrences(self, occurrences: List[scip_pb2.Occurrence]) -> List[scip_pb2.Occurrence]: + """Validate occurrences for SCIP compliance.""" + validated = [] + + for occurrence in occurrences: + try: + # Validate symbol ID + if not self._validator.validate_symbol_id(occurrence.symbol): + logger.warning(f"Invalid symbol ID in occurrence: {occurrence.symbol}") + continue + + # Basic validation passed + validated.append(occurrence) + + except Exception as e: + logger.warning(f"Occurrence validation failed: {e}") + continue + + logger.debug(f"Validated {len(validated)} out of {len(occurrences)} occurrences") + return validated + + def _validate_symbols(self, symbols: List[scip_pb2.SymbolInformation]) -> List[scip_pb2.SymbolInformation]: + """Validate symbols for SCIP compliance.""" + validated = [] + + for symbol in symbols: + try: + # Validate symbol ID + if not self._validator.validate_symbol_id(symbol.symbol): + logger.warning(f"Invalid symbol ID in symbol info: {symbol.symbol}") + continue + + # Basic validation passed + validated.append(symbol) + + except Exception as e: + logger.warning(f"Symbol validation failed: {e}") + continue + + logger.debug(f"Validated {len(validated)} out of {len(symbols)} symbols") + return validated + + def _extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract external symbols from processed documents.""" + external_symbols = [] + + # This is a placeholder implementation + # Subclasses should implement language-specific external symbol extraction + # based on import statements and dependencies + + return external_symbols + + def get_framework_info(self) -> dict: + """Get information about this framework instance.""" + return { + 'language': self.language, + 'project_root': str(self.project_root), + 'project_name': self.project_root.name, + 'version': self.version, + 'components': { + 'symbol_generator': self._symbol_generator.get_generator_info(), + 'enum_mapper': self._enum_mapper.get_enum_info(), + 'position_calculator': True, + 'compliance_validator': True + } + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/streaming_indexer.py b/src/code_index_mcp/scip/framework/streaming_indexer.py new file mode 100644 index 0000000..b1b26d8 --- /dev/null +++ b/src/code_index_mcp/scip/framework/streaming_indexer.py @@ -0,0 +1,429 @@ +"""SCIP Streaming Indexer - Incremental and streaming index generation for large codebases.""" + +import logging +import json +import os +import time +from typing import Dict, List, Optional, Iterator, Callable, Any, Set +from dataclasses import dataclass, asdict +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor, Future +from queue import Queue, Empty +import threading + +from .caching_system import SCIPCacheManager, BatchProcessor +from .index_factory import SCIPIndexFactory +from ..proto import scip_pb2 + +logger = logging.getLogger(__name__) + + +@dataclass +class IndexingProgress: + """Progress tracking for streaming indexing.""" + total_files: int + processed_files: int + failed_files: int + start_time: float + current_file: Optional[str] = None + error_messages: List[str] = None + + def __post_init__(self): + if self.error_messages is None: + self.error_messages = [] + + @property + def progress_percentage(self) -> float: + """Calculate progress percentage.""" + if self.total_files == 0: + return 100.0 + return (self.processed_files / self.total_files) * 100.0 + + @property + def elapsed_time(self) -> float: + """Get elapsed processing time.""" + return time.time() - self.start_time + + @property + def estimated_remaining_time(self) -> float: + """Estimate remaining processing time.""" + if self.processed_files == 0: + return 0.0 + + avg_time_per_file = self.elapsed_time / self.processed_files + remaining_files = self.total_files - self.processed_files + return avg_time_per_file * remaining_files + + +class StreamingIndexer: + """Streaming SCIP indexer for incremental and large-scale indexing.""" + + def __init__(self, + factory: SCIPIndexFactory, + cache_manager: Optional[SCIPCacheManager] = None, + max_workers: int = 4, + chunk_size: int = 100): + """Initialize streaming indexer.""" + self.factory = factory + self.cache_manager = cache_manager or SCIPCacheManager() + self.max_workers = max_workers + self.chunk_size = chunk_size + + # Progress tracking + self._progress: Optional[IndexingProgress] = None + self._progress_callbacks: List[Callable[[IndexingProgress], None]] = [] + + # Threading + self._stop_event = threading.Event() + self._executor: Optional[ThreadPoolExecutor] = None + + # Results queue for streaming output + self._results_queue: Queue = Queue() + + logger.debug(f"Initialized streaming indexer with {max_workers} workers") + + def add_progress_callback(self, callback: Callable[[IndexingProgress], None]) -> None: + """Add progress callback for monitoring.""" + self._progress_callbacks.append(callback) + + def index_files_streaming(self, + file_paths: List[str], + output_callback: Optional[Callable[[scip_pb2.Document], None]] = None + ) -> Iterator[scip_pb2.Document]: + """Stream index generation for files.""" + self._progress = IndexingProgress( + total_files=len(file_paths), + processed_files=0, + failed_files=0, + start_time=time.time() + ) + + # Start processing + self._executor = ThreadPoolExecutor(max_workers=self.max_workers) + + try: + # Submit files in chunks + for chunk_start in range(0, len(file_paths), self.chunk_size): + if self._stop_event.is_set(): + break + + chunk_end = min(chunk_start + self.chunk_size, len(file_paths)) + chunk_files = file_paths[chunk_start:chunk_end] + + # Submit chunk for processing + future = self._executor.submit(self._process_file_chunk, chunk_files) + + # Process results as they become available + try: + chunk_results = future.result(timeout=300) # 5 minute timeout per chunk + + for document in chunk_results: + if output_callback: + output_callback(document) + yield document + + # Update progress + self._progress.processed_files += 1 + self._notify_progress() + + except Exception as e: + logger.error(f"Chunk processing failed: {e}") + self._progress.failed_files += len(chunk_files) + self._progress.error_messages.append(str(e)) + self._notify_progress() + + finally: + if self._executor: + self._executor.shutdown(wait=True) + + logger.info(f"Streaming indexing completed. Processed: {self._progress.processed_files}, " + f"Failed: {self._progress.failed_files}") + + def create_incremental_index(self, + modified_files: List[str], + existing_index: Optional[scip_pb2.Index] = None + ) -> scip_pb2.Index: + """Create incremental index for modified files.""" + logger.info(f"Creating incremental index for {len(modified_files)} modified files") + + # Start with existing index or create new one + if existing_index: + updated_index = scip_pb2.Index() + updated_index.CopyFrom(existing_index) + else: + updated_index = scip_pb2.Index() + updated_index.metadata.CopyFrom(self.factory.create_metadata(self.factory.project_root)) + + # Track existing documents by path for replacement + existing_docs_by_path = {doc.relative_path: doc for doc in updated_index.documents} + + # Process modified files + new_documents = [] + for file_path in modified_files: + try: + # Check cache first + cached_doc = self.cache_manager.get_document_cache(file_path) + if cached_doc: + new_documents.append(cached_doc) + logger.debug(f"Using cached document for {file_path}") + continue + + # Read and process file + content = self._read_file(file_path) + if content is None: + logger.warning(f"Could not read file: {file_path}") + continue + + # Create new document + document = self.factory.create_document(file_path, content) + new_documents.append(document) + + # Cache the document + self.cache_manager.cache_document(file_path, document) + + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + continue + + # Replace or add documents in the index + updated_documents = [] + relative_paths_processed = set() + + for doc in new_documents: + updated_documents.append(doc) + relative_paths_processed.add(doc.relative_path) + + # Add unchanged documents from existing index + if existing_index: + for doc in existing_index.documents: + if doc.relative_path not in relative_paths_processed: + updated_documents.append(doc) + + # Update the index + updated_index.documents[:] = updated_documents + + # Extract external symbols from all documents + external_symbols = self.factory.extract_external_symbols(updated_documents) + updated_index.external_symbols[:] = external_symbols + + logger.info(f"Incremental index created with {len(updated_documents)} documents") + return updated_index + + def save_index_streaming(self, + index: scip_pb2.Index, + output_path: str, + compress: bool = True) -> None: + """Save index with streaming compression for large indexes.""" + logger.info(f"Saving index to {output_path} (compress={compress})") + + try: + if compress: + # Use compression for large indexes + import gzip + with gzip.open(output_path, 'wb') as f: + f.write(index.SerializeToString()) + else: + with open(output_path, 'wb') as f: + f.write(index.SerializeToString()) + + logger.info(f"Index saved successfully to {output_path}") + + except Exception as e: + logger.error(f"Failed to save index: {e}") + raise + + def load_index_streaming(self, input_path: str) -> scip_pb2.Index: + """Load index with streaming decompression.""" + logger.info(f"Loading index from {input_path}") + + try: + if input_path.endswith('.gz'): + import gzip + with gzip.open(input_path, 'rb') as f: + data = f.read() + else: + with open(input_path, 'rb') as f: + data = f.read() + + index = scip_pb2.Index() + index.ParseFromString(data) + + logger.info(f"Index loaded successfully with {len(index.documents)} documents") + return index + + except Exception as e: + logger.error(f"Failed to load index: {e}") + raise + + def watch_and_update(self, + watch_directory: str, + output_path: str, + update_interval: float = 5.0) -> None: + """Watch directory for changes and update index incrementally.""" + logger.info(f"Starting file watcher for {watch_directory}") + + last_update = time.time() + known_files = set() + last_index = None + + while not self._stop_event.is_set(): + try: + # Scan for changes + current_files = set() + modified_files = [] + + for ext in self.factory.get_supported_extensions(): + pattern = f"**/*{ext}" + for file_path in Path(watch_directory).rglob(pattern): + if file_path.is_file(): + current_files.add(str(file_path)) + + # Check if file is new or modified + if str(file_path) not in known_files or \ + file_path.stat().st_mtime > last_update: + modified_files.append(str(file_path)) + + # Update index if there are changes + if modified_files: + logger.info(f"Detected {len(modified_files)} modified files") + + # Create incremental index + updated_index = self.create_incremental_index(modified_files, last_index) + + # Save updated index + self.save_index_streaming(updated_index, output_path) + + last_index = updated_index + known_files = current_files + last_update = time.time() + + # Sleep before next check + time.sleep(update_interval) + + except Exception as e: + logger.error(f"Error in file watcher: {e}") + time.sleep(update_interval) + + def stop(self) -> None: + """Stop streaming indexer.""" + self._stop_event.set() + if self._executor: + self._executor.shutdown(wait=False) + logger.info("Streaming indexer stopped") + + def get_progress(self) -> Optional[IndexingProgress]: + """Get current indexing progress.""" + return self._progress + + def _process_file_chunk(self, file_paths: List[str]) -> List[scip_pb2.Document]: + """Process a chunk of files.""" + documents = [] + + for file_path in file_paths: + if self._stop_event.is_set(): + break + + try: + self._progress.current_file = file_path + self._notify_progress() + + # Check cache first + cached_doc = self.cache_manager.get_document_cache(file_path) + if cached_doc: + documents.append(cached_doc) + continue + + # Read and process file + content = self._read_file(file_path) + if content is None: + logger.warning(f"Could not read file: {file_path}") + continue + + # Create document + document = self.factory.create_document(file_path, content) + documents.append(document) + + # Cache the document + self.cache_manager.cache_document(file_path, document) + + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + continue + + return documents + + def _notify_progress(self) -> None: + """Notify all progress callbacks.""" + if self._progress: + for callback in self._progress_callbacks: + try: + callback(self._progress) + except Exception as e: + logger.warning(f"Progress callback failed: {e}") + + def _read_file(self, file_path: str) -> Optional[str]: + """Read file content with encoding detection.""" + encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] + + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + except (OSError, PermissionError, FileNotFoundError) as e: + logger.warning(f"Could not read {file_path}: {e}") + return None + + logger.warning(f"Could not decode {file_path} with any supported encoding") + return None + + +class IndexMerger: + """Utility for merging multiple SCIP indexes.""" + + @staticmethod + def merge_indexes(indexes: List[scip_pb2.Index], + output_metadata: Optional[scip_pb2.Metadata] = None) -> scip_pb2.Index: + """Merge multiple SCIP indexes into one.""" + if not indexes: + raise ValueError("No indexes provided for merging") + + logger.info(f"Merging {len(indexes)} indexes") + + merged_index = scip_pb2.Index() + + # Use provided metadata or first index's metadata + if output_metadata: + merged_index.metadata.CopyFrom(output_metadata) + else: + merged_index.metadata.CopyFrom(indexes[0].metadata) + + # Collect all documents and external symbols + all_documents = [] + all_external_symbols = [] + seen_document_paths = set() + seen_external_symbols = set() + + for index in indexes: + # Add documents (avoid duplicates by path) + for doc in index.documents: + if doc.relative_path not in seen_document_paths: + all_documents.append(doc) + seen_document_paths.add(doc.relative_path) + else: + logger.warning(f"Duplicate document path: {doc.relative_path}") + + # Add external symbols (avoid duplicates by symbol ID) + for ext_symbol in index.external_symbols: + if ext_symbol.symbol not in seen_external_symbols: + all_external_symbols.append(ext_symbol) + seen_external_symbols.add(ext_symbol.symbol) + + merged_index.documents.extend(all_documents) + merged_index.external_symbols.extend(all_external_symbols) + + logger.info(f"Merged index contains {len(all_documents)} documents " + f"and {len(all_external_symbols)} external symbols") + + return merged_index \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/symbol_generator.py b/src/code_index_mcp/scip/framework/symbol_generator.py new file mode 100644 index 0000000..74fde92 --- /dev/null +++ b/src/code_index_mcp/scip/framework/symbol_generator.py @@ -0,0 +1,144 @@ +"""SCIP Symbol Generator - Strict format compliance for SCIP symbol ID generation.""" + +import re +import logging +from typing import Optional +from .types import SCIPSymbolDescriptor + + +logger = logging.getLogger(__name__) + + +class SCIPSymbolGenerator: + """SCIP standard symbol generator - strict format compliance.""" + + # SCIP symbol format validation patterns + SCHEME_PATTERN = re.compile(r'^[a-zA-Z][a-zA-Z0-9\-_]*$') + LOCAL_ID_PATTERN = re.compile(r'^[^\s]+$') + GLOBAL_SYMBOL_PATTERN = re.compile(r'^[^\s]+\s+[^\s]+\s+[^\s]+(\s+[^\s]+)?$') + + def __init__(self, scheme: str, package_manager: str, package_name: str, version: str): + """Initialize symbol generator with validation.""" + self._validate_scheme(scheme) + self._validate_package_info(package_manager, package_name, version) + + self.scheme = scheme + self.package = f"{package_manager} {package_name} {version}" + + def create_local_symbol(self, descriptor: SCIPSymbolDescriptor) -> str: + """Create local symbol ID - enforced SCIP format.""" + local_id = descriptor.to_scip_descriptor() + + # Validate local ID format + if not self._is_valid_local_id(local_id): + raise ValueError(f"Invalid local symbol ID: {local_id}") + + return f"local {local_id}" + + def create_global_symbol(self, descriptor: SCIPSymbolDescriptor) -> str: + """Create global symbol ID - complete SCIP format.""" + descriptor_str = descriptor.to_scip_descriptor() + + symbol_id = f"{self.scheme} {self.package} {descriptor_str}" + + # Validate global symbol format + if not self._is_valid_global_symbol(symbol_id): + raise ValueError(f"Invalid global symbol ID: {symbol_id}") + + return symbol_id + + def _validate_scheme(self, scheme: str) -> None: + """Validate scheme format against SCIP standards.""" + if not scheme: + raise ValueError("Scheme cannot be empty") + + if not self.SCHEME_PATTERN.match(scheme): + raise ValueError(f"Invalid scheme format: {scheme}. Must match pattern: {self.SCHEME_PATTERN.pattern}") + + if ' ' in scheme.replace(' ', ''): # Allow double space escaping + raise ValueError(f"Scheme cannot contain spaces: {scheme}") + + def _validate_package_info(self, package_manager: str, package_name: str, version: str) -> None: + """Validate package information components.""" + if not package_manager: + raise ValueError("Package manager cannot be empty") + if not package_name: + raise ValueError("Package name cannot be empty") + + # Version can be empty for local projects + for component in [package_manager, package_name, version]: + if component and (' ' in component): + raise ValueError(f"Package component cannot contain spaces: {component}") + + def _is_valid_local_id(self, local_id: str) -> bool: + """Validate local ID format compliance.""" + if not local_id: + return False + + # Check for leading/trailing spaces + if local_id.startswith(' ') or local_id.endswith(' '): + return False + + # Check basic pattern compliance + return self.LOCAL_ID_PATTERN.match(local_id) is not None + + def _is_valid_global_symbol(self, symbol_id: str) -> bool: + """Validate global symbol format compliance.""" + if not symbol_id: + return False + + # Split into components + parts = symbol_id.split(' ') + if len(parts) < 4: + return False + + # Validate each part is non-empty + return all(part.strip() for part in parts) + + def validate_symbol_id(self, symbol_id: str) -> bool: + """Validate any symbol ID against SCIP grammar.""" + if not symbol_id: + return False + + if symbol_id.startswith('local '): + return self._is_valid_local_id(symbol_id[6:]) + else: + return self._is_valid_global_symbol(symbol_id) + + def parse_symbol_id(self, symbol_id: str) -> Optional[dict]: + """Parse symbol ID into components for analysis.""" + if not self.validate_symbol_id(symbol_id): + return None + + if symbol_id.startswith('local '): + return { + 'type': 'local', + 'local_id': symbol_id[6:], + 'scheme': None, + 'package': None, + 'descriptor': symbol_id[6:] + } + else: + parts = symbol_id.split(' ', 3) + if len(parts) >= 4: + return { + 'type': 'global', + 'scheme': parts[0], + 'manager': parts[1], + 'package': parts[2], + 'descriptor': parts[3] + } + + return None + + def get_generator_info(self) -> dict: + """Get information about this generator instance.""" + return { + 'scheme': self.scheme, + 'package': self.package, + 'validation_patterns': { + 'scheme': self.SCHEME_PATTERN.pattern, + 'local_id': self.LOCAL_ID_PATTERN.pattern, + 'global_symbol': self.GLOBAL_SYMBOL_PATTERN.pattern + } + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/types.py b/src/code_index_mcp/scip/framework/types.py new file mode 100644 index 0000000..f2da49f --- /dev/null +++ b/src/code_index_mcp/scip/framework/types.py @@ -0,0 +1,79 @@ +"""SCIP Framework Types - Core type definitions for SCIP standard compliance.""" + +from dataclasses import dataclass +from typing import List, Dict, Protocol, Iterator, Tuple, Optional +from abc import ABC, abstractmethod + + +@dataclass(frozen=True) +class SCIPSymbolDescriptor: + """SCIP symbol descriptor - immutable data structure for symbol information.""" + name: str + kind: str # function, class, variable, etc. + scope_path: List[str] + descriptor_suffix: str # (). # (param) etc. + + def to_scip_descriptor(self) -> str: + """Convert to SCIP standard descriptor format.""" + scope = ".".join(self.scope_path) if self.scope_path else "" + full_path = f"{scope}.{self.name}" if scope else self.name + return f"{full_path}{self.descriptor_suffix}" + + +@dataclass(frozen=True) +class SCIPPositionInfo: + """SCIP position information - immutable position data with validation.""" + start_line: int + start_column: int + end_line: int + end_column: int + + def validate(self) -> bool: + """Validate position information for SCIP compliance.""" + return ( + self.start_line <= self.end_line and + (self.start_line < self.end_line or self.start_column <= self.end_column) and + all(x >= 0 for x in [self.start_line, self.start_column, self.end_line, self.end_column]) + ) + + +@dataclass +class SCIPSymbolContext: + """Context information for symbol extraction and processing.""" + file_path: str + content: str + scope_stack: List[str] + imports: Dict[str, str] + + def with_scope(self, scope_name: str) -> 'SCIPSymbolContext': + """Create new context with additional scope.""" + return SCIPSymbolContext( + file_path=self.file_path, + content=self.content, + scope_stack=self.scope_stack + [scope_name], + imports=self.imports.copy() + ) + + +# Alias for compatibility +SCIPContext = SCIPSymbolContext + +# Import and alias Relationship type +from .relationship_manager import SymbolRelationship +Relationship = SymbolRelationship + + +class SCIPSymbolExtractor(Protocol): + """Symbol extractor protocol - mandatory interface for symbol extraction.""" + + def extract_symbols(self, context: SCIPSymbolContext) -> Iterator[SCIPSymbolDescriptor]: + """Extract symbol definitions from context.""" + ... + + def extract_references(self, context: SCIPSymbolContext) -> Iterator[Tuple[SCIPSymbolDescriptor, SCIPPositionInfo]]: + """Extract symbol references with position information.""" + ... + + def extract_relationships(self, context: SCIPSymbolContext) -> Iterator[Tuple[str, str, str]]: + """Extract symbol relationships (source, target, relationship_type).""" + ... \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/unified_api.py b/src/code_index_mcp/scip/framework/unified_api.py new file mode 100644 index 0000000..2e955a7 --- /dev/null +++ b/src/code_index_mcp/scip/framework/unified_api.py @@ -0,0 +1,456 @@ +"""SCIP Framework Unified API - Single entry point for all SCIP framework functionality.""" + +import logging +import os +from typing import Dict, List, Optional, Set, Any, Callable, Iterator +from pathlib import Path +from dataclasses import dataclass + +from .index_factory import SCIPIndexFactory +from .python import create_python_scip_factory, PythonSCIPIndexFactory +from .javascript import create_javascript_scip_factory, JavaScriptSCIPIndexFactory +from .java import create_java_scip_factory, JavaSCIPIndexFactory +from .fallback import create_fallback_scip_factory, FallbackSCIPIndexFactory +from .caching_system import SCIPCacheManager, BatchProcessor +from .streaming_indexer import StreamingIndexer, IndexingProgress, IndexMerger +from .compliance_validator import SCIPComplianceValidator +from ..proto import scip_pb2 + +logger = logging.getLogger(__name__) + + +@dataclass +class SCIPConfig: + """Configuration for SCIP framework.""" + project_root: str + cache_enabled: bool = True + cache_dir: Optional[str] = None + max_workers: int = 4 + batch_size: int = 50 + streaming_chunk_size: int = 100 + validate_compliance: bool = True + supported_languages: Optional[Set[str]] = None + exclude_patterns: Optional[List[str]] = None + + def __post_init__(self): + if self.supported_languages is None: + self.supported_languages = {'python', 'javascript', 'typescript', 'java', 'fallback'} + + if self.exclude_patterns is None: + self.exclude_patterns = [ + '__pycache__', '.git', 'node_modules', '.vscode', + '.idea', '*.pyc', '*.pyo', '*.class' + ] + + +class SCIPFrameworkAPI: + """Unified API for SCIP framework - single entry point for all functionality.""" + + def __init__(self, config: SCIPConfig): + """Initialize SCIP framework with configuration.""" + self.config = config + + # Initialize core components + self.cache_manager = None + if config.cache_enabled: + self.cache_manager = SCIPCacheManager( + cache_dir=config.cache_dir, + max_memory_entries=1000 + ) + + self.validator = SCIPComplianceValidator() if config.validate_compliance else None + + # Language-specific factories + self._factories: Dict[str, SCIPIndexFactory] = {} + self._init_factories() + + # Streaming components + self._streaming_indexers: Dict[str, StreamingIndexer] = {} + + logger.info(f"Initialized SCIP Framework API for project: {config.project_root}") + logger.info(f"Supported languages: {config.supported_languages}") + + def detect_project_languages(self, scan_depth: int = 3) -> Set[str]: + """Automatically detect programming languages in the project.""" + detected_languages = set() + project_path = Path(self.config.project_root) + + # Language detection by file extensions + language_extensions = { + 'python': {'.py', '.pyw', '.pyx'}, + 'javascript': {'.js', '.jsx', '.mjs', '.cjs'}, + 'typescript': {'.ts', '.tsx'}, + 'java': {'.java'}, + 'fallback': set() # Fallback handles everything else + } + + # Scan project files + for depth in range(scan_depth + 1): + pattern = '*/' * depth + '*' + + for file_path in project_path.glob(pattern): + if file_path.is_file(): + file_ext = file_path.suffix.lower() + + for lang, extensions in language_extensions.items(): + if file_ext in extensions and lang in self.config.supported_languages: + detected_languages.add(lang) + + logger.info(f"Detected languages: {detected_languages}") + return detected_languages + + def create_complete_index(self, + languages: Optional[Set[str]] = None, + progress_callback: Optional[Callable[[IndexingProgress], None]] = None + ) -> scip_pb2.Index: + """Create complete SCIP index for the project.""" + if languages is None: + languages = self.detect_project_languages() + + logger.info(f"Creating complete index for languages: {languages}") + + # Collect all files by language + files_by_language = self._collect_files_by_language(languages) + + # Create index with metadata + index = scip_pb2.Index() + + # Use first available factory for metadata (they should be consistent) + first_factory = next(iter(self._factories.values())) + index.metadata.CopyFrom(first_factory.create_metadata(self.config.project_root)) + + # Process files by language + all_documents = [] + all_external_symbols = [] + + for language, file_paths in files_by_language.items(): + if language not in self._factories: + logger.warning(f"No factory available for language: {language}") + continue + + logger.info(f"Processing {len(file_paths)} {language} files") + + # Get streaming indexer for this language + streaming_indexer = self._get_streaming_indexer(language) + if progress_callback: + streaming_indexer.add_progress_callback(progress_callback) + + # Process files with streaming + language_documents = list(streaming_indexer.index_files_streaming(file_paths)) + all_documents.extend(language_documents) + + # Extract external symbols + factory = self._factories[language] + external_symbols = factory.extract_external_symbols(language_documents) + all_external_symbols.extend(external_symbols) + + # Add all documents and external symbols to index + index.documents.extend(all_documents) + index.external_symbols.extend(all_external_symbols) + + # Validate if requested + if self.validator: + is_valid = self.validator.validate_index(index) + if not is_valid: + logger.warning("Generated index failed compliance validation") + validation_summary = self.validator.get_validation_summary() + logger.warning(f"Validation errors: {validation_summary['error_messages']}") + + logger.info(f"Complete index created with {len(all_documents)} documents " + f"and {len(all_external_symbols)} external symbols") + + return index + + def create_incremental_index(self, + modified_files: List[str], + existing_index_path: Optional[str] = None + ) -> scip_pb2.Index: + """Create incremental index for modified files.""" + logger.info(f"Creating incremental index for {len(modified_files)} files") + + # Load existing index if provided + existing_index = None + if existing_index_path and os.path.exists(existing_index_path): + try: + streaming_indexer = next(iter(self._streaming_indexers.values())) + existing_index = streaming_indexer.load_index_streaming(existing_index_path) + logger.info(f"Loaded existing index with {len(existing_index.documents)} documents") + except Exception as e: + logger.warning(f"Failed to load existing index: {e}") + + # Group files by language + files_by_language = self._group_files_by_language(modified_files) + + # Create incremental updates for each language + language_indexes = [] + for language, file_paths in files_by_language.items(): + if language not in self._factories: + continue + + streaming_indexer = self._get_streaming_indexer(language) + lang_index = streaming_indexer.create_incremental_index(file_paths, existing_index) + language_indexes.append(lang_index) + + # Merge language indexes + if len(language_indexes) == 1: + return language_indexes[0] + elif len(language_indexes) > 1: + return IndexMerger.merge_indexes(language_indexes) + else: + # No valid files to process + return existing_index or scip_pb2.Index() + + def save_index(self, + index: scip_pb2.Index, + output_path: str, + compress: bool = True) -> None: + """Save SCIP index to file.""" + streaming_indexer = self._get_any_streaming_indexer() + streaming_indexer.save_index_streaming(index, output_path, compress) + + def load_index(self, input_path: str) -> scip_pb2.Index: + """Load SCIP index from file.""" + streaming_indexer = self._get_any_streaming_indexer() + return streaming_indexer.load_index_streaming(input_path) + + def validate_index(self, index: scip_pb2.Index) -> Dict[str, Any]: + """Validate SCIP index compliance.""" + if not self.validator: + return {"validation_enabled": False} + + is_valid = self.validator.validate_index(index) + return { + "is_valid": is_valid, + "validation_enabled": True, + **self.validator.get_validation_summary() + } + + def get_cache_statistics(self) -> Dict[str, Any]: + """Get cache performance statistics.""" + if not self.cache_manager: + return {"cache_enabled": False} + + return { + "cache_enabled": True, + **self.cache_manager.get_cache_statistics() + } + + def clear_cache(self) -> None: + """Clear all caches.""" + if self.cache_manager: + self.cache_manager.invalidate_all_cache() + logger.info("Cache cleared") + + def start_file_watcher(self, + output_path: str, + update_interval: float = 5.0) -> None: + """Start file watcher for automatic index updates.""" + # Use Python factory's streaming indexer for watching + # (could be enhanced to support multiple languages) + streaming_indexer = self._get_streaming_indexer('python') + streaming_indexer.watch_and_update( + self.config.project_root, + output_path, + update_interval + ) + + def stop_all_watchers(self) -> None: + """Stop all file watchers and streaming indexers.""" + for indexer in self._streaming_indexers.values(): + indexer.stop() + logger.info("All watchers stopped") + + def analyze_symbol_relationships(self, index: scip_pb2.Index) -> Dict[str, Any]: + """Analyze symbol relationships in the index.""" + relationship_stats = { + "total_symbols": len(index.external_symbols), + "documents_with_symbols": 0, + "symbols_per_document": {}, + "symbol_types": {}, + "relationship_patterns": [] + } + + # Analyze documents + for doc in index.documents: + symbol_count = len(doc.symbols) + occurrence_count = len(doc.occurrences) + + if symbol_count > 0: + relationship_stats["documents_with_symbols"] += 1 + + relationship_stats["symbols_per_document"][doc.relative_path] = { + "symbols": symbol_count, + "occurrences": occurrence_count + } + + # Analyze symbol types in document + for symbol in doc.symbols: + symbol_kind_name = self._get_symbol_kind_name(symbol.kind) + if symbol_kind_name not in relationship_stats["symbol_types"]: + relationship_stats["symbol_types"][symbol_kind_name] = 0 + relationship_stats["symbol_types"][symbol_kind_name] += 1 + + return relationship_stats + + def export_index_json(self, index: scip_pb2.Index, output_path: str) -> None: + """Export index to JSON format for analysis.""" + from google.protobuf.json_format import MessageToDict + + try: + index_dict = MessageToDict(index) + + import json + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(index_dict, f, indent=2, ensure_ascii=False) + + logger.info(f"Index exported to JSON: {output_path}") + + except Exception as e: + logger.error(f"Failed to export index to JSON: {e}") + raise + + def get_framework_info(self) -> Dict[str, Any]: + """Get comprehensive framework information.""" + return { + "config": { + "project_root": self.config.project_root, + "cache_enabled": self.config.cache_enabled, + "max_workers": self.config.max_workers, + "batch_size": self.config.batch_size, + "supported_languages": list(self.config.supported_languages), + "validate_compliance": self.config.validate_compliance + }, + "factories": list(self._factories.keys()), + "streaming_indexers": list(self._streaming_indexers.keys()), + "cache_statistics": self.get_cache_statistics(), + "detected_languages": list(self.detect_project_languages()) + } + + def _init_factories(self) -> None: + """Initialize language-specific factories.""" + if 'python' in self.config.supported_languages: + self._factories['python'] = create_python_scip_factory(self.config.project_root) + + if 'javascript' in self.config.supported_languages or 'typescript' in self.config.supported_languages: + self._factories['javascript'] = create_javascript_scip_factory(self.config.project_root) + self._factories['typescript'] = self._factories['javascript'] # Same factory + + if 'java' in self.config.supported_languages: + self._factories['java'] = create_java_scip_factory(self.config.project_root) + + if 'fallback' in self.config.supported_languages: + self._factories['fallback'] = create_fallback_scip_factory(self.config.project_root) + + def _get_streaming_indexer(self, language: str) -> StreamingIndexer: + """Get or create streaming indexer for language.""" + if language not in self._streaming_indexers: + if language not in self._factories: + raise ValueError(f"No factory available for language: {language}") + + factory = self._factories[language] + self._streaming_indexers[language] = StreamingIndexer( + factory=factory, + cache_manager=self.cache_manager, + max_workers=self.config.max_workers, + chunk_size=self.config.streaming_chunk_size + ) + + return self._streaming_indexers[language] + + def _get_any_streaming_indexer(self) -> StreamingIndexer: + """Get any available streaming indexer.""" + if not self._streaming_indexers: + # Create one for the first available language + first_language = next(iter(self._factories.keys())) + return self._get_streaming_indexer(first_language) + + return next(iter(self._streaming_indexers.values())) + + def _collect_files_by_language(self, languages: Set[str]) -> Dict[str, List[str]]: + """Collect all project files grouped by language.""" + files_by_language = {lang: [] for lang in languages} + + project_path = Path(self.config.project_root) + + # Language to extensions mapping + language_extensions = { + 'python': {'.py', '.pyw', '.pyx'}, + 'javascript': {'.js', '.jsx', '.mjs', '.cjs'}, + 'typescript': {'.ts', '.tsx'}, + 'java': {'.java'} + } + + # Scan all files + for file_path in project_path.rglob('*'): + if not file_path.is_file(): + continue + + # Skip excluded patterns + if self._should_exclude_file(str(file_path)): + continue + + file_ext = file_path.suffix.lower() + + # Categorize by language + for lang in languages: + if lang in language_extensions: + if file_ext in language_extensions[lang]: + files_by_language[lang].append(str(file_path)) + break + + # Log file counts + for lang, files in files_by_language.items(): + if files: + logger.info(f"Found {len(files)} {lang} files") + + return files_by_language + + def _group_files_by_language(self, file_paths: List[str]) -> Dict[str, List[str]]: + """Group given files by language.""" + files_by_language = {} + + language_extensions = { + 'python': {'.py', '.pyw', '.pyx'}, + 'javascript': {'.js', '.jsx', '.mjs', '.cjs'}, + 'typescript': {'.ts', '.tsx'}, + 'java': {'.java'} + } + + for file_path in file_paths: + file_ext = Path(file_path).suffix.lower() + + for lang, extensions in language_extensions.items(): + if file_ext in extensions and lang in self.config.supported_languages: + if lang not in files_by_language: + files_by_language[lang] = [] + files_by_language[lang].append(file_path) + break + + return files_by_language + + def _should_exclude_file(self, file_path: str) -> bool: + """Check if file should be excluded based on patterns.""" + path_str = str(file_path) + + for pattern in self.config.exclude_patterns: + if pattern in path_str: + return True + + return False + + def _get_symbol_kind_name(self, symbol_kind: int) -> str: + """Get human-readable symbol kind name.""" + # Use enum mapper from any factory + if self._factories: + factory = next(iter(self._factories.values())) + if hasattr(factory, '_enum_mapper'): + return factory._enum_mapper.get_symbol_kind_name(symbol_kind) or f"Unknown({symbol_kind})" + + return f"SymbolKind({symbol_kind})" + + +# Convenience function for quick setup +def create_scip_framework(project_root: str, **kwargs) -> SCIPFrameworkAPI: + """Create SCIP framework with default configuration.""" + config = SCIPConfig(project_root=project_root, **kwargs) + return SCIPFrameworkAPI(config) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/__init__.py b/src/code_index_mcp/scip/framework/zig/__init__.py new file mode 100644 index 0000000..e4a0910 --- /dev/null +++ b/src/code_index_mcp/scip/framework/zig/__init__.py @@ -0,0 +1,14 @@ +"""Zig SCIP framework module.""" + +from .factory import ZigSCIPIndexFactory, create_zig_scip_factory +from .enum_mapper import ZigEnumMapper +from .relationship_extractor import ZigRelationshipExtractor +from .tree_sitter_analyzer import ZigTreeSitterAnalyzer + +__all__ = [ + 'ZigSCIPIndexFactory', + 'create_zig_scip_factory', + 'ZigEnumMapper', + 'ZigRelationshipExtractor', + 'ZigTreeSitterAnalyzer' +] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/enum_mapper.py b/src/code_index_mcp/scip/framework/zig/enum_mapper.py new file mode 100644 index 0000000..c4fb191 --- /dev/null +++ b/src/code_index_mcp/scip/framework/zig/enum_mapper.py @@ -0,0 +1,217 @@ +"""Zig enum mapper implementation.""" + +from typing import Dict +from ..base.enum_mapper import BaseEnumMapper +from ...proto import scip_pb2 + + +class ZigEnumMapper(BaseEnumMapper): + """Zig-specific enum mapper for SCIP compliance.""" + + # Zig symbol kind mappings + SYMBOL_KIND_MAP = { + 'function': scip_pb2.Function, + 'method': scip_pb2.Method, + 'struct': scip_pb2.Struct, + 'union': scip_pb2.Struct, + 'enum': scip_pb2.Enum, + 'field': scip_pb2.Field, + 'variable': scip_pb2.Variable, + 'parameter': scip_pb2.Parameter, + 'constant': scip_pb2.Constant, + 'type': scip_pb2.Type, + 'namespace': scip_pb2.Namespace, + 'module': scip_pb2.Module, + 'local_variable': scip_pb2.Variable, + 'global_variable': scip_pb2.Variable, + 'error_set': scip_pb2.Type, + 'test_declaration': scip_pb2.Function, + 'comptime_declaration': scip_pb2.Function, + } + + # Zig syntax kind mappings + SYNTAX_KIND_MAP = { + 'function_declaration': scip_pb2.IdentifierFunctionDefinition, + 'struct_declaration': scip_pb2.IdentifierType, + 'union_declaration': scip_pb2.IdentifierType, + 'enum_declaration': scip_pb2.IdentifierType, + 'field_declaration': scip_pb2.IdentifierAttribute, + 'variable_declaration': scip_pb2.IdentifierLocal, + 'parameter_declaration': scip_pb2.IdentifierParameter, + 'constant_declaration': scip_pb2.IdentifierConstant, + 'type_declaration': scip_pb2.IdentifierType, + 'test_declaration': scip_pb2.IdentifierFunctionDefinition, + 'comptime_declaration': scip_pb2.IdentifierFunctionDefinition, + 'identifier': scip_pb2.Identifier, + 'keyword': scip_pb2.IdentifierKeyword, + 'string_literal': scip_pb2.StringLiteral, + 'numeric_literal': scip_pb2.NumericLiteral, + 'boolean_literal': scip_pb2.BooleanLiteral, + 'comment': scip_pb2.Comment, + 'punctuation': scip_pb2.PunctuationDelimiter, + } + + # Zig symbol role mappings (official SCIP naming) + SYMBOL_ROLE_MAP = { + 'definition': scip_pb2.Definition, + 'import': scip_pb2.Import, + 'write': scip_pb2.Write, # Official SCIP naming + 'read': scip_pb2.Read, # Official SCIP naming + 'generated': scip_pb2.Generated, + 'test': scip_pb2.Test, + 'type': scip_pb2.Type, # Add missing Type role + 'reference': scip_pb2.Read, # Default reference is read access + } + + def map_symbol_kind(self, language_kind: str) -> int: + """Map Zig symbol type to SCIP SymbolKind.""" + kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SymbolKind'): + raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") + + return kind + + def map_syntax_kind(self, language_syntax: str) -> int: + """Map Zig syntax element to SCIP SyntaxKind.""" + kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) + + # Validate enum value + if not self.validate_enum_value(kind, 'SyntaxKind'): + raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") + + return kind + + def map_symbol_role(self, language_role: str) -> int: + """Map Zig symbol role to SCIP SymbolRole.""" + role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) + + # Validate enum value + if not self.validate_enum_value(role, 'SymbolRole'): + raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") + + return role + + def get_zig_node_symbol_kind(self, node_type: str) -> str: + """ + Map Zig tree-sitter node type to internal symbol kind string. + + Args: + node_type: Zig tree-sitter node type (e.g., 'function_declaration', 'struct_declaration') + + Returns: + Internal symbol kind string for use with map_symbol_kind() + """ + node_kind_map = { + 'function_declaration': 'function', + 'struct_declaration': 'struct', + 'union_declaration': 'union', + 'enum_declaration': 'enum', + 'field_declaration': 'field', + 'variable_declaration': 'variable', + 'parameter_declaration': 'parameter', + 'constant_declaration': 'constant', + 'type_declaration': 'type', + 'test_declaration': 'test_declaration', + 'comptime_declaration': 'comptime_declaration', + 'error_set_declaration': 'error_set', + 'container_field': 'field', + 'builtin_call_expr': 'function', + } + + return node_kind_map.get(node_type, 'variable') + + def get_zig_node_syntax_kind(self, node_type: str, context: str = None) -> str: + """ + Map Zig tree-sitter node type to internal syntax kind string. + + Args: + node_type: Zig tree-sitter node type + context: Additional context for disambiguation + + Returns: + Internal syntax kind string for use with map_syntax_kind() + """ + node_syntax_map = { + 'function_declaration': 'function_declaration', + 'struct_declaration': 'struct_declaration', + 'union_declaration': 'union_declaration', + 'enum_declaration': 'enum_declaration', + 'field_declaration': 'field_declaration', + 'variable_declaration': 'variable_declaration', + 'parameter_declaration': 'parameter_declaration', + 'constant_declaration': 'constant_declaration', + 'type_declaration': 'type_declaration', + 'test_declaration': 'test_declaration', + 'comptime_declaration': 'comptime_declaration', + 'identifier': 'identifier', + 'string_literal': 'string_literal', + 'integer_literal': 'numeric_literal', + 'float_literal': 'numeric_literal', + 'builtin_identifier': 'keyword', + 'boolean_literal': 'boolean_literal', + } + + return node_syntax_map.get(node_type, 'identifier') + + def get_zig_node_symbol_role(self, node_type: str, context: str = None) -> str: + """ + Map Zig tree-sitter node type to internal symbol role string. + + Args: + node_type: Zig tree-sitter node type + context: Additional context (e.g., 'in_assignment', 'in_call') + + Returns: + Internal symbol role string for use with map_symbol_role() + """ + if context == 'definition': + return 'definition' + elif context == 'assignment': + return 'write' + elif context == 'import': + return 'import' + elif context == 'test': + return 'test' + elif node_type in ['function_declaration', 'struct_declaration', 'union_declaration', + 'enum_declaration', 'field_declaration', 'variable_declaration', + 'constant_declaration', 'type_declaration', 'test_declaration']: + return 'definition' + else: + return 'reference' + + def is_valid_zig_symbol_kind(self, symbol_kind: str) -> bool: + """Check if symbol kind is valid for Zig.""" + return symbol_kind in self.SYMBOL_KIND_MAP + + def is_valid_zig_syntax_kind(self, syntax_kind: str) -> bool: + """Check if syntax kind is valid for Zig.""" + return syntax_kind in self.SYNTAX_KIND_MAP + + def is_valid_zig_symbol_role(self, symbol_role: str) -> bool: + """Check if symbol role is valid for Zig.""" + return symbol_role in self.SYMBOL_ROLE_MAP + + def get_all_zig_symbol_kinds(self) -> list: + """Get all available Zig symbol kinds.""" + return list(self.SYMBOL_KIND_MAP.keys()) + + def get_all_zig_syntax_kinds(self) -> list: + """Get all available Zig syntax kinds.""" + return list(self.SYNTAX_KIND_MAP.keys()) + + def get_all_zig_symbol_roles(self) -> list: + """Get all available Zig symbol roles.""" + return list(self.SYMBOL_ROLE_MAP.keys()) + + def get_zig_specific_kinds(self) -> Dict[str, str]: + """Get Zig-specific symbol kinds.""" + return { + 'error_set': 'error_set', + 'test_declaration': 'test_declaration', + 'comptime_declaration': 'comptime_declaration', + 'builtin_function': 'function', + 'global_variable': 'global_variable', + 'local_variable': 'local_variable', + } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/factory.py b/src/code_index_mcp/scip/framework/zig/factory.py new file mode 100644 index 0000000..afef434 --- /dev/null +++ b/src/code_index_mcp/scip/framework/zig/factory.py @@ -0,0 +1,388 @@ +"""Zig SCIP Index Factory implementation.""" + +import os +from pathlib import Path +from typing import Set, List, Iterator, Optional +from ..base.index_factory import SCIPIndexFactory +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..base.enum_mapper import BaseEnumMapper +from ..symbol_generator import SCIPSymbolGenerator +from ..position_calculator import SCIPPositionCalculator +from ..types import SCIPContext, SCIPSymbolDescriptor +from .relationship_extractor import ZigRelationshipExtractor +from .enum_mapper import ZigEnumMapper +from .tree_sitter_analyzer import ZigTreeSitterAnalyzer +from ...proto import scip_pb2 + +import tree_sitter +from tree_sitter_zig import language as zig_language + + +class ZigSCIPIndexFactory(SCIPIndexFactory): + """Zig-specific SCIP Index factory implementation with constructor injection.""" + + def __init__(self, + project_root: str, + symbol_generator: SCIPSymbolGenerator, + relationship_extractor: BaseRelationshipExtractor, + enum_mapper: BaseEnumMapper, + position_calculator: SCIPPositionCalculator): + """Initialize Zig factory with required components via constructor injection.""" + super().__init__(project_root, symbol_generator, relationship_extractor, + enum_mapper, position_calculator) + self.tree_analyzer = ZigTreeSitterAnalyzer() + + def get_language(self) -> str: + """Return language identifier.""" + return "zig" + + def get_supported_extensions(self) -> Set[str]: + """Return supported file extensions.""" + return {'.zig', '.zon'} + + def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: + """Extract Zig symbol definitions using tree-sitter analysis.""" + try: + tree = self.tree_analyzer.parse(context.content, context.file_path) + + for node in self.tree_analyzer.walk(tree): + if self.tree_analyzer.is_symbol_definition(node): + symbol_info = self._create_symbol_from_tree_node(node, context) + if symbol_info: + yield symbol_info + + except SyntaxError as e: + # Handle syntax errors gracefully + pass + + def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: + """Extract Zig symbol occurrences.""" + try: + tree = self.tree_analyzer.parse(context.content, context.file_path) + + for node in self.tree_analyzer.walk(tree): + if (self.tree_analyzer.is_symbol_definition(node) or + self.tree_analyzer.is_symbol_reference(node)): + occurrence = self._create_occurrence_from_tree_node(node, context) + if occurrence: + yield occurrence + + except SyntaxError as e: + # Handle syntax errors gracefully + pass + + def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: + """Extract Zig external symbols from imports.""" + external_symbols = [] + + for doc in documents: + try: + content = self._read_file(os.path.join(self.project_root, doc.relative_path)) + tree = self.tree_analyzer.parse(content, doc.relative_path) + + # Extract import statements + import_statements = self.tree_analyzer.extract_import_statements(tree) + for import_path in import_statements: + external_symbol = self._create_external_symbol_from_import(import_path) + if external_symbol: + external_symbols.append(external_symbol) + + except Exception as e: + # Skip problematic files + continue + + return external_symbols + + def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: + """ + Build Zig-specific cross-document relationships. + + This implementation provides basic cross-document relationship support + for Zig. A more sophisticated implementation would analyze @import statements + and module dependencies. + """ + # For now, use a simplified approach + # TODO: Implement proper Zig import analysis + return 0 # Placeholder - no relationships added yet + + def _create_symbol_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: + """Create SCIP symbol information from tree-sitter node.""" + symbol_info = scip_pb2.SymbolInformation() + + symbol_name = self.tree_analyzer.get_symbol_name(node) + if not symbol_name: + return None + + if node.type == 'function_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="function", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('function') + + elif node.type == 'struct_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="struct", + scope_path=context.scope_stack, + descriptor_suffix="#" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('struct') + + elif node.type == 'union_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="union", + scope_path=context.scope_stack, + descriptor_suffix="#" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('union') + + elif node.type == 'enum_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="enum", + scope_path=context.scope_stack, + descriptor_suffix="#" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('enum') + + elif node.type == 'variable_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="variable", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('variable') + + elif node.type == 'constant_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="constant", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('constant') + + elif node.type == 'type_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="type", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('type') + + elif node.type == 'container_field': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="field", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('field') + + elif node.type == 'parameter_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="parameter", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('parameter') + + elif node.type == 'test_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="test_declaration", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('test_declaration') + + elif node.type == 'comptime_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="comptime_declaration", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('comptime_declaration') + + elif node.type == 'error_set_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="error_set", + scope_path=context.scope_stack, + descriptor_suffix="#" + ) + symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) + symbol_info.display_name = symbol_name + symbol_info.kind = self.enum_mapper.map_symbol_kind('error_set') + + else: + return None + + return symbol_info + + def _create_occurrence_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: + """Create SCIP occurrence from tree-sitter node.""" + occurrence = scip_pb2.Occurrence() + + # Calculate position using position calculator + try: + position_info = self.position_calculator.calculate_positions_from_tree_node( + context.content, node + ) + + # Set range + occurrence.range.start.extend([position_info.start_line, position_info.start_column]) + occurrence.range.end.extend([position_info.end_line, position_info.end_column]) + + except Exception as e: + # Skip if position calculation fails + return None + + symbol_name = self.tree_analyzer.get_symbol_name(node) + if not symbol_name: + return None + + # Set symbol and roles based on node type + if node.type == 'function_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="function", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function_declaration') + + elif node.type in ['struct_declaration', 'union_declaration', 'enum_declaration']: + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind=node.type.replace('_declaration', ''), + scope_path=context.scope_stack, + descriptor_suffix="#" + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind(f'{node.type}') + + elif node.type in ['variable_declaration', 'constant_declaration']: + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind=node.type.replace('_declaration', ''), + scope_path=context.scope_stack, + descriptor_suffix="" + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind(f'{node.type}') + + elif node.type == 'test_declaration': + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="test_declaration", + scope_path=context.scope_stack, + descriptor_suffix="()." + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('test') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('test_declaration') + + elif node.type == 'identifier': + # Handle variable references + descriptor = SCIPSymbolDescriptor( + name=symbol_name, + kind="variable", + scope_path=context.scope_stack, + descriptor_suffix="" + ) + occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) + occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') + occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') + + else: + return None + + return occurrence + + def _create_external_symbol_from_import(self, import_path: str) -> Optional[scip_pb2.SymbolInformation]: + """Create external symbol from import statement.""" + symbol_info = scip_pb2.SymbolInformation() + + # Determine if it's a standard library, C library, or external import + if import_path.startswith('std'): + symbol_info.symbol = f"zig-std {import_path}" + symbol_info.display_name = import_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"Zig standard library: {import_path}") + elif import_path.startswith('c'): + symbol_info.symbol = f"c-lib {import_path}" + symbol_info.display_name = import_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"C library: {import_path}") + elif import_path.startswith('./') or import_path.startswith('../'): + symbol_info.symbol = f"local {import_path}" + symbol_info.display_name = import_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"Local module: {import_path}") + else: + symbol_info.symbol = f"zig-external {import_path}" + symbol_info.display_name = import_path + symbol_info.kind = self.enum_mapper.map_symbol_kind('module') + symbol_info.documentation.append(f"External Zig module: {import_path}") + + return symbol_info + + +def create_zig_scip_factory(project_root: str) -> ZigSCIPIndexFactory: + """ + Factory creator for Zig SCIP factory. + Ensures all required components are properly assembled via constructor injection. + """ + symbol_generator = SCIPSymbolGenerator( + scheme="scip-zig", + package_manager="zig", + package_name=Path(project_root).name, + version="HEAD" + ) + + relationship_extractor = ZigRelationshipExtractor() + enum_mapper = ZigEnumMapper() + position_calculator = SCIPPositionCalculator() + + return ZigSCIPIndexFactory( + project_root=project_root, + symbol_generator=symbol_generator, + relationship_extractor=relationship_extractor, # Guaranteed to be provided + enum_mapper=enum_mapper, + position_calculator=position_calculator + ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/relationship_extractor.py b/src/code_index_mcp/scip/framework/zig/relationship_extractor.py new file mode 100644 index 0000000..8bbad94 --- /dev/null +++ b/src/code_index_mcp/scip/framework/zig/relationship_extractor.py @@ -0,0 +1,322 @@ +"""Zig relationship extractor implementation.""" + +from typing import Iterator, Optional, List +from ..base.relationship_extractor import BaseRelationshipExtractor +from ..types import SCIPContext, Relationship +from ...core.relationship_types import InternalRelationshipType + +import tree_sitter +from tree_sitter_zig import language as zig_language + + +class ZigRelationshipExtractor(BaseRelationshipExtractor): + """Zig-specific relationship extractor using tree-sitter analysis.""" + + def __init__(self): + """Initialize the Zig relationship extractor.""" + lang = tree_sitter.Language(zig_language()) + self.parser = tree_sitter.Parser(lang) + + def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract inheritance relationships from Zig (limited, as Zig doesn't have traditional inheritance).""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + # Zig doesn't have traditional inheritance, but we can extract composition relationships + # where structs contain other struct types + for node in self._walk_tree(tree.root_node): + if node.type == 'struct_declaration': + struct_name = self._get_struct_name(node) + if not struct_name: + continue + + struct_symbol_id = self._create_struct_symbol_id(struct_name, context) + + # Look for embedded structs or type fields that reference other types + for field_node in self._walk_tree(node): + if field_node.type == 'container_field': + field_type = self._get_field_type(field_node, context.content) + if field_type and self._is_custom_type(field_type): + type_symbol_id = self._create_type_symbol_id(field_type, context) + yield Relationship( + source_symbol=struct_symbol_id, + target_symbol=type_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract function call relationships.""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type == 'function_declaration': + function_name = self._get_function_name(node) + if not function_name: + continue + + function_symbol_id = self._create_function_symbol_id(function_name, context) + + # Find function calls within this function + for call_node in self._walk_tree(node): + if call_node.type == 'call_expression': + target_function = self._get_call_target(call_node, context.content) + if target_function and target_function != function_name: + target_symbol_id = self._create_function_symbol_id(target_function, context) + yield Relationship( + source_symbol=function_symbol_id, + target_symbol=target_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + elif call_node.type == 'builtin_call_expr': + # Handle builtin functions like @import, @cInclude, etc. + builtin_name = self._get_builtin_name(call_node, context.content) + if builtin_name: + builtin_symbol_id = f"zig-builtin {builtin_name}" + yield Relationship( + source_symbol=function_symbol_id, + target_symbol=builtin_symbol_id, + relationship_type=InternalRelationshipType.CALLS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract import/dependency relationships.""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + file_symbol_id = self._create_file_symbol_id(context.file_path) + + for node in self._walk_tree(tree.root_node): + if node.type == 'builtin_call_expr': + builtin_name = self._get_builtin_name(node, context.content) + if builtin_name in ['@import', '@cImport', '@cInclude']: + import_path = self._get_import_path(node, context.content) + if import_path: + # Determine if it's a standard library, C library, or local import + if import_path.startswith('std'): + module_symbol_id = f"zig-std {import_path}" + elif builtin_name in ['@cImport', '@cInclude']: + module_symbol_id = f"c-lib {import_path}" + elif import_path.startswith('./') or import_path.startswith('../'): + module_symbol_id = f"local {import_path}" + else: + module_symbol_id = f"zig-external {import_path}" + + yield Relationship( + source_symbol=file_symbol_id, + target_symbol=module_symbol_id, + relationship_type=InternalRelationshipType.IMPORTS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract composition relationships (struct fields, union fields).""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type in ['struct_declaration', 'union_declaration']: + container_name = self._get_container_name(node) + if not container_name: + continue + + container_symbol_id = self._create_container_symbol_id(container_name, node.type, context) + + # Find fields in this container + for field_node in self._walk_tree(node): + if field_node.type == 'container_field': + field_name = self._get_field_name(field_node, context.content) + if field_name: + field_symbol_id = self._create_field_symbol_id(field_name, container_symbol_id) + yield Relationship( + source_symbol=container_symbol_id, + target_symbol=field_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except Exception: + # Skip files with parsing errors + return + + def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: + """Extract interface relationships (Zig doesn't have interfaces, but has error sets and protocols).""" + try: + tree = self.parser.parse(bytes(context.content, 'utf8')) + + for node in self._walk_tree(tree.root_node): + if node.type == 'error_set_declaration': + error_set_name = self._get_error_set_name(node, context.content) + if not error_set_name: + continue + + error_set_symbol_id = self._create_error_set_symbol_id(error_set_name, context) + + # Find error values in this error set + for error_node in self._walk_tree(node): + if error_node.type == 'identifier': + error_name = self._get_node_text(error_node, context.content) + if error_name and error_name != error_set_name: + error_symbol_id = self._create_error_symbol_id(error_name, error_set_symbol_id) + yield Relationship( + source_symbol=error_set_symbol_id, + target_symbol=error_symbol_id, + relationship_type=InternalRelationshipType.CONTAINS + ) + + except Exception: + # Skip files with parsing errors + return + + def _walk_tree(self, node) -> Iterator: + """Walk tree-sitter tree nodes.""" + yield node + for child in node.children: + yield from self._walk_tree(child) + + def _get_node_text(self, node, content: str) -> str: + """Get text content of a tree-sitter node.""" + return content[node.start_byte:node.end_byte] + + def _get_struct_name(self, struct_node) -> Optional[str]: + """Extract struct name from struct declaration node.""" + for child in struct_node.children: + if child.type == 'identifier': + return child.text.decode('utf8') + return None + + def _get_function_name(self, function_node) -> Optional[str]: + """Extract function name from function declaration node.""" + for child in function_node.children: + if child.type == 'identifier': + return child.text.decode('utf8') + return None + + def _get_container_name(self, container_node) -> Optional[str]: + """Extract container name from struct/union declaration node.""" + for child in container_node.children: + if child.type == 'identifier': + return child.text.decode('utf8') + return None + + def _get_field_name(self, field_node, content: str) -> Optional[str]: + """Extract field name from container field node.""" + for child in field_node.children: + if child.type == 'identifier': + return self._get_node_text(child, content) + return None + + def _get_field_type(self, field_node, content: str) -> Optional[str]: + """Extract field type from container field node.""" + # Look for type information in the field + for child in field_node.children: + if child.type in ['type_expression', 'identifier']: + return self._get_node_text(child, content) + return None + + def _get_call_target(self, call_node, content: str) -> Optional[str]: + """Extract target function name from call expression.""" + for child in call_node.children: + if child.type == 'identifier': + return self._get_node_text(child, content) + elif child.type == 'field_expression': + # Handle method calls like obj.method() + for grandchild in child.children: + if grandchild.type == 'identifier': + return self._get_node_text(grandchild, content) + return None + + def _get_builtin_name(self, builtin_node, content: str) -> Optional[str]: + """Extract builtin function name from builtin call expression.""" + builtin_text = self._get_node_text(builtin_node, content) + if builtin_text.startswith('@'): + # Extract just the builtin name (e.g., "@import" from "@import(...)") + paren_index = builtin_text.find('(') + if paren_index > 0: + return builtin_text[:paren_index] + return builtin_text + return None + + def _get_import_path(self, import_node, content: str) -> Optional[str]: + """Extract import path from import expression.""" + # Look for string literal in the import call + for child in self._walk_tree(import_node): + if child.type == 'string_literal': + path_text = self._get_node_text(child, content) + # Remove quotes + return path_text.strip('"\'') + return None + + def _get_error_set_name(self, error_set_node, content: str) -> Optional[str]: + """Extract error set name from error set declaration.""" + for child in error_set_node.children: + if child.type == 'identifier': + return self._get_node_text(child, content) + return None + + def _is_custom_type(self, type_name: str) -> bool: + """Check if a type name represents a custom type (not a builtin).""" + builtin_types = { + 'i8', 'i16', 'i32', 'i64', 'i128', + 'u8', 'u16', 'u32', 'u64', 'u128', + 'f16', 'f32', 'f64', 'f128', + 'bool', 'void', 'noreturn', 'type', + 'anyerror', 'anyframe', 'anyopaque' + } + return type_name not in builtin_types and not type_name.startswith('*') + + def _create_struct_symbol_id(self, struct_name: str, context: SCIPContext) -> str: + """Create symbol ID for struct.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{struct_name}" if scope_path else struct_name + return f"local {local_id}#" + + def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: + """Create symbol ID for function.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{function_name}" if scope_path else function_name + return f"local {local_id}()." + + def _create_container_symbol_id(self, container_name: str, container_type: str, context: SCIPContext) -> str: + """Create symbol ID for struct/union container.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{container_name}" if scope_path else container_name + return f"local {local_id}#" + + def _create_type_symbol_id(self, type_name: str, context: SCIPContext) -> str: + """Create symbol ID for type.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{type_name}" if scope_path else type_name + return f"local {local_id}#" + + def _create_field_symbol_id(self, field_name: str, container_symbol_id: str) -> str: + """Create symbol ID for field.""" + # Extract container name from container symbol ID + container_name = container_symbol_id.replace("local ", "").replace("#", "") + return f"local {container_name}.{field_name}" + + def _create_error_set_symbol_id(self, error_set_name: str, context: SCIPContext) -> str: + """Create symbol ID for error set.""" + scope_path = ".".join(context.scope_stack) if context.scope_stack else "" + local_id = f"{scope_path}.{error_set_name}" if scope_path else error_set_name + return f"local {local_id}#" + + def _create_error_symbol_id(self, error_name: str, error_set_symbol_id: str) -> str: + """Create symbol ID for error value.""" + # Extract error set name from error set symbol ID + error_set_name = error_set_symbol_id.replace("local ", "").replace("#", "") + return f"local {error_set_name}.{error_name}" + + def _create_file_symbol_id(self, file_path: str) -> str: + """Create symbol ID for file.""" + return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py b/src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py new file mode 100644 index 0000000..1b8fec0 --- /dev/null +++ b/src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py @@ -0,0 +1,357 @@ +"""Zig tree-sitter analyzer implementation.""" + +from typing import Iterator, Optional, Set, List, Dict, Any +from ..types import SCIPContext +from ..base.language_analyzer import BaseLanguageAnalyzer + +import tree_sitter +from tree_sitter_zig import language as zig_language + + +class ZigTreeSitterAnalyzer(BaseLanguageAnalyzer): + """Zig analyzer using tree-sitter for AST parsing.""" + + def __init__(self): + """Initialize the Zig tree-sitter analyzer.""" + lang = tree_sitter.Language(zig_language()) + self.parser = tree_sitter.Parser(lang) + self._processed_nodes: Set[int] = set() + + def parse(self, content: str, filename: str = ""): + """Parse Zig source code into tree-sitter AST.""" + try: + return self.parser.parse(bytes(content, 'utf8')) + except Exception as e: + raise SyntaxError(f"Zig syntax error in {filename}: {e}") + + def walk(self, tree) -> Iterator: + """Walk tree-sitter tree nodes, avoiding duplicates.""" + for node in self._walk_node(tree.root_node): + node_id = id(node) + if node_id not in self._processed_nodes: + self._processed_nodes.add(node_id) + yield node + + def _walk_node(self, node) -> Iterator: + """Recursively walk tree nodes.""" + yield node + for child in node.children: + yield from self._walk_node(child) + + def is_symbol_definition(self, node) -> bool: + """Check if tree-sitter node represents a symbol definition.""" + return node.type in { + 'function_declaration', + 'struct_declaration', + 'union_declaration', + 'enum_declaration', + 'variable_declaration', + 'constant_declaration', + 'type_declaration', + 'container_field', + 'parameter_declaration', + 'test_declaration', + 'comptime_declaration', + 'error_set_declaration', + } + + def is_symbol_reference(self, node) -> bool: + """Check if tree-sitter node represents a symbol reference.""" + return node.type in { + 'identifier', + 'call_expression', + 'field_expression', + 'builtin_call_expr', + } + + def get_symbol_name(self, node) -> Optional[str]: + """Extract symbol name from tree-sitter node.""" + if node.type in ['function_declaration', 'struct_declaration', 'union_declaration', + 'enum_declaration', 'variable_declaration', 'constant_declaration', + 'type_declaration', 'test_declaration', 'comptime_declaration']: + # Look for identifier child + for child in node.children: + if child.type == 'identifier': + return child.text.decode('utf8') + + elif node.type == 'container_field': + # Field in struct/union/enum + for child in node.children: + if child.type == 'identifier': + return child.text.decode('utf8') + + elif node.type == 'parameter_declaration': + # Function parameter + for child in node.children: + if child.type == 'identifier': + return child.text.decode('utf8') + + elif node.type == 'identifier': + return node.text.decode('utf8') + + return None + + def get_node_position(self, node) -> tuple: + """Get position information from tree-sitter node.""" + start_line = node.start_point[0] + start_col = node.start_point[1] + end_line = node.end_point[0] + end_col = node.end_point[1] + + return (start_line, start_col, end_line, end_col) + + def extract_function_info(self, tree) -> List[Dict[str, Any]]: + """Extract function information from the AST.""" + functions = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'function_declaration': + function_info = { + 'name': self.get_symbol_name(node), + 'type': 'function', + 'position': self.get_node_position(node), + 'is_public': self._is_public_function(node), + 'is_extern': self._is_extern_function(node), + 'return_type': self._extract_return_type(node), + 'parameters': self._extract_function_parameters(node), + } + functions.append(function_info) + + return functions + + def extract_struct_info(self, tree) -> List[Dict[str, Any]]: + """Extract struct information from the AST.""" + structs = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'struct_declaration': + struct_info = { + 'name': self.get_symbol_name(node), + 'type': 'struct', + 'position': self.get_node_position(node), + 'is_public': self._is_public_declaration(node), + 'fields': self._extract_struct_fields(node), + } + structs.append(struct_info) + + return structs + + def extract_union_info(self, tree) -> List[Dict[str, Any]]: + """Extract union information from the AST.""" + unions = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'union_declaration': + union_info = { + 'name': self.get_symbol_name(node), + 'type': 'union', + 'position': self.get_node_position(node), + 'is_public': self._is_public_declaration(node), + 'fields': self._extract_union_fields(node), + } + unions.append(union_info) + + return unions + + def extract_enum_info(self, tree) -> List[Dict[str, Any]]: + """Extract enum information from the AST.""" + enums = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'enum_declaration': + enum_info = { + 'name': self.get_symbol_name(node), + 'type': 'enum', + 'position': self.get_node_position(node), + 'is_public': self._is_public_declaration(node), + 'values': self._extract_enum_values(node), + } + enums.append(enum_info) + + return enums + + def extract_variable_info(self, tree) -> List[Dict[str, Any]]: + """Extract variable information from the AST.""" + variables = [] + + for node in self._walk_node(tree.root_node): + if node.type in ['variable_declaration', 'constant_declaration']: + variable_info = { + 'name': self.get_symbol_name(node), + 'type': 'constant' if node.type == 'constant_declaration' else 'variable', + 'position': self.get_node_position(node), + 'is_public': self._is_public_declaration(node), + 'variable_type': self._extract_variable_type(node), + 'is_mutable': node.type == 'variable_declaration', + } + variables.append(variable_info) + + return variables + + def extract_test_info(self, tree) -> List[Dict[str, Any]]: + """Extract test declaration information from the AST.""" + tests = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'test_declaration': + test_info = { + 'name': self.get_symbol_name(node) or self._extract_test_name(node), + 'type': 'test', + 'position': self.get_node_position(node), + } + tests.append(test_info) + + return tests + + def extract_import_statements(self, tree) -> List[str]: + """Extract import statements from the AST.""" + imports = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'builtin_call_expr': + builtin_text = node.text.decode('utf8') + if builtin_text.startswith('@import'): + import_path = self._extract_import_path(node) + if import_path: + imports.append(import_path) + + return imports + + def extract_error_set_info(self, tree) -> List[Dict[str, Any]]: + """Extract error set information from the AST.""" + error_sets = [] + + for node in self._walk_node(tree.root_node): + if node.type == 'error_set_declaration': + error_set_info = { + 'name': self.get_symbol_name(node), + 'type': 'error_set', + 'position': self.get_node_position(node), + 'errors': self._extract_error_values(node), + } + error_sets.append(error_set_info) + + return error_sets + + def _is_public_declaration(self, node) -> bool: + """Check if a declaration is public.""" + # Look for 'pub' keyword in parent or siblings + parent = node.parent + if parent: + for child in parent.children: + if child.type == 'keyword' and child.text.decode('utf8') == 'pub': + return True + return False + + def _is_public_function(self, node) -> bool: + """Check if a function is public.""" + return self._is_public_declaration(node) + + def _is_extern_function(self, node) -> bool: + """Check if a function is extern.""" + # Look for 'extern' keyword + parent = node.parent + if parent: + for child in parent.children: + if child.type == 'keyword' and child.text.decode('utf8') == 'extern': + return True + return False + + def _extract_return_type(self, function_node) -> Optional[str]: + """Extract return type from function declaration.""" + # Look for return type after the parameter list + for child in function_node.children: + if child.type in ['type_expression', 'identifier']: + return child.text.decode('utf8') + return None + + def _extract_function_parameters(self, function_node) -> List[Dict[str, str]]: + """Extract parameter information from function declaration.""" + parameters = [] + + for child in function_node.children: + if child.type == 'parameter_list': + for param_child in child.children: + if param_child.type == 'parameter_declaration': + param_name = self.get_symbol_name(param_child) + param_type = self._extract_parameter_type(param_child) + if param_name: + parameters.append({ + 'name': param_name, + 'type': param_type or 'unknown' + }) + + return parameters + + def _extract_parameter_type(self, param_node) -> Optional[str]: + """Extract parameter type from parameter declaration.""" + for child in param_node.children: + if child.type in ['type_expression', 'identifier']: + return child.text.decode('utf8') + return None + + def _extract_struct_fields(self, struct_node) -> List[str]: + """Extract field names from struct declaration.""" + fields = [] + + for child in struct_node.children: + if child.type == 'container_declaration': + for field_child in child.children: + if field_child.type == 'container_field': + field_name = self.get_symbol_name(field_child) + if field_name: + fields.append(field_name) + + return fields + + def _extract_union_fields(self, union_node) -> List[str]: + """Extract field names from union declaration.""" + return self._extract_struct_fields(union_node) # Same logic + + def _extract_enum_values(self, enum_node) -> List[str]: + """Extract enum value names from enum declaration.""" + values = [] + + for child in enum_node.children: + if child.type == 'container_declaration': + for value_child in child.children: + if value_child.type == 'container_field': + value_name = self.get_symbol_name(value_child) + if value_name: + values.append(value_name) + + return values + + def _extract_variable_type(self, var_node) -> Optional[str]: + """Extract variable type from variable declaration.""" + for child in var_node.children: + if child.type in ['type_expression', 'identifier']: + return child.text.decode('utf8') + return None + + def _extract_test_name(self, test_node) -> Optional[str]: + """Extract test name from test declaration.""" + # Test name is usually in a string literal + for child in test_node.children: + if child.type == 'string_literal': + return child.text.decode('utf8').strip('"\'') + return None + + def _extract_import_path(self, import_node) -> Optional[str]: + """Extract import path from @import call.""" + for child in self._walk_node(import_node): + if child.type == 'string_literal': + return child.text.decode('utf8').strip('"\'') + return None + + def _extract_error_values(self, error_set_node) -> List[str]: + """Extract error values from error set declaration.""" + errors = [] + + for child in error_set_node.children: + if child.type == 'error_set': + for error_child in child.children: + if error_child.type == 'identifier': + errors.append(error_child.text.decode('utf8')) + + return errors \ No newline at end of file diff --git a/src/code_index_mcp/scip/language_manager.py b/src/code_index_mcp/scip/language_manager.py new file mode 100644 index 0000000..118ad73 --- /dev/null +++ b/src/code_index_mcp/scip/language_manager.py @@ -0,0 +1,522 @@ +"""SCIP Language Manager - Direct factory management without strategy layer.""" + +import logging +import os +from pathlib import Path +from typing import Dict, List, Optional, Set, Callable, Any + +from .framework.types import SCIPContext +from .framework.base.index_factory import SCIPIndexFactory +from .proto import scip_pb2 + +# Import all language factory creators +from .framework.python import create_python_scip_factory +from .framework.javascript import create_javascript_scip_factory +from .framework.java import create_java_scip_factory +from .framework.objective_c import create_objective_c_scip_factory +from .framework.zig import create_zig_scip_factory +from .framework.fallback import create_fallback_scip_factory + +logger = logging.getLogger(__name__) + + +class LanguageNotSupportedException(Exception): + """Exception raised when a language is not supported.""" + pass + + +class SCIPLanguageManager: + """ + Direct language management for SCIP indexing without strategy abstraction layer. + + This manager directly handles language detection, factory selection, and file processing + without the overhead of the strategy pattern. It provides a cleaner, more efficient + approach to SCIP index generation. + """ + + def __init__(self, project_root: str): + """Initialize the language manager for a specific project.""" + self.project_root = project_root + + # Language factory creators mapping + self._factory_creators: Dict[str, Callable[[str], SCIPIndexFactory]] = { + 'python': create_python_scip_factory, + 'javascript': create_javascript_scip_factory, + 'typescript': create_javascript_scip_factory, # Same as JavaScript + 'java': create_java_scip_factory, + 'objective_c': create_objective_c_scip_factory, + 'zig': create_zig_scip_factory, + 'fallback': create_fallback_scip_factory + } + + # Language priority for detection conflicts + self._language_priority = { + 'python': 90, + 'javascript': 85, + 'typescript': 85, + 'java': 80, + 'objective_c': 75, + 'zig': 70, + 'fallback': 10 # Always lowest priority + } + + # Extension to language mapping + self._extension_mapping = { + # Python + '.py': 'python', + '.pyw': 'python', + '.pyx': 'python', + '.pyi': 'python', + + # JavaScript/TypeScript + '.js': 'javascript', + '.jsx': 'javascript', + '.mjs': 'javascript', + '.cjs': 'javascript', + '.ts': 'typescript', + '.tsx': 'typescript', + + # Java + '.java': 'java', + + # Objective-C + '.m': 'objective_c', + '.mm': 'objective_c', + '.h': 'objective_c', # Could be C/C++ too, but we'll handle with priority + + # Zig + '.zig': 'zig', + '.zon': 'zig', + } + + # Factory cache to avoid recreating + self._factory_cache: Dict[str, SCIPIndexFactory] = {} + + logger.info(f"Initialized SCIP Language Manager for project: {project_root}") + logger.info(f"Supported languages: {list(self._factory_creators.keys())}") + + def detect_language(self, file_path: str) -> str: + """ + Detect the programming language for a given file. + + Args: + file_path: Path to the file + + Returns: + Language identifier string + """ + extension = Path(file_path).suffix.lower() + + # Direct mapping for most cases + if extension in self._extension_mapping: + return self._extension_mapping[extension] + + # Special handling for ambiguous extensions + if extension == '.h': + # Could be C, C++, or Objective-C + # For now, default to objective_c, but could add content-based detection + return 'objective_c' + + # Default to fallback for unknown extensions + return 'fallback' + + def get_factory(self, language: str) -> SCIPIndexFactory: + """ + Get or create a factory for the specified language. + + Args: + language: Language identifier + + Returns: + SCIP Index Factory for the language + + Raises: + LanguageNotSupportedException: If language is not supported + """ + if language not in self._factory_creators: + raise LanguageNotSupportedException(f"Language '{language}' is not supported") + + # Check cache first + if language not in self._factory_cache: + factory_creator = self._factory_creators[language] + self._factory_cache[language] = factory_creator(self.project_root) + logger.debug(f"Created new {language} factory for project {self.project_root}") + + return self._factory_cache[language] + + def get_factory_for_file(self, file_path: str) -> SCIPIndexFactory: + """ + Get the appropriate factory for a specific file. + + Args: + file_path: Path to the file + + Returns: + SCIP Index Factory for the file's language + """ + language = self.detect_language(file_path) + return self.get_factory(language) + + def process_file(self, file_path: str) -> Optional[scip_pb2.Document]: + """ + Process a single file and generate SCIP document. + + Args: + file_path: Path to the file to process + + Returns: + SCIP Document or None if processing failed + """ + try: + # Get appropriate factory + factory = self.get_factory_for_file(file_path) + + # Read file content + content = self._read_file_content(file_path) + if not content: + return None + + # Create context + relative_path = os.path.relpath(file_path, self.project_root) + context = SCIPContext( + file_path=relative_path, + content=content, + scope_stack=[], + imports={} + ) + + # Generate document + document = factory.create_document(file_path, content) + + if document: + logger.debug(f"Successfully processed {relative_path} with {len(document.symbols)} symbols") + + return document + + except Exception as e: + logger.error(f"Failed to process file {file_path}: {e}") + return None + + def process_files(self, file_paths: List[str]) -> List[scip_pb2.Document]: + """ + Process multiple files and generate SCIP documents. + + Args: + file_paths: List of file paths to process + + Returns: + List of SCIP Documents + """ + documents = [] + processed_count = 0 + error_count = 0 + + # Group files by language for efficiency + files_by_language = self._group_files_by_language(file_paths) + + for language, files in files_by_language.items(): + if not files: + continue + + logger.info(f"Processing {len(files)} {language} files") + + try: + factory = self.get_factory(language) + + for i, file_path in enumerate(files, 1): + document = self.process_file(file_path) + if document: + documents.append(document) + processed_count += 1 + else: + error_count += 1 + + # Progress logging + if i % 10 == 0 or i == len(files): + relative_path = os.path.relpath(file_path, self.project_root) + logger.debug(f"{language} progress: {i}/{len(files)} files, last: {relative_path}") + + except Exception as e: + logger.error(f"Failed to process {language} files: {e}") + error_count += len(files) + continue + + logger.info(f"Processing complete: {processed_count} documents generated, {error_count} errors") + return documents + + def create_complete_index(self, file_paths: Optional[List[str]] = None) -> scip_pb2.Index: + """ + Create a complete SCIP index for the project. + + Args: + file_paths: Optional list of specific files to process. If None, auto-discover. + + Returns: + Complete SCIP Index + """ + if file_paths is None: + file_paths = self._discover_project_files() + + logger.info(f"Creating complete SCIP index for {len(file_paths)} files") + + # Create index with metadata + index = scip_pb2.Index() + + # Use any factory to create metadata (they should be consistent) + try: + fallback_factory = self.get_factory('fallback') + index.metadata.CopyFrom(fallback_factory.create_metadata(self.project_root)) + except Exception as e: + logger.warning(f"Failed to create metadata: {e}") + + # Process all files + documents = self.process_files(file_paths) + index.documents.extend(documents) + + # Extract external symbols + all_external_symbols = [] + files_by_language = self._group_files_by_language(file_paths) + + for language, files in files_by_language.items(): + if not files: + continue + + try: + factory = self.get_factory(language) + language_documents = [doc for doc in documents if self._get_document_language(doc) == language] + external_symbols = factory.extract_external_symbols(language_documents) + all_external_symbols.extend(external_symbols) + except Exception as e: + logger.warning(f"Failed to extract external symbols for {language}: {e}") + + index.external_symbols.extend(all_external_symbols) + + # Build cross-document relationships after all documents are processed + logger.info("Building cross-document relationships...") + self._build_cross_document_relationships(index) + + logger.info(f"Complete index created with {len(documents)} documents and {len(all_external_symbols)} external symbols") + return index + + def _build_cross_document_relationships(self, index: scip_pb2.Index) -> None: + """ + Build cross-document relationships using language-specific processing. + + This method delegates relationship building to individual language factories + to handle language-specific module systems and import semantics correctly. + """ + logger.info("Building cross-document relationships using language-specific processing...") + + # Group documents by language for language-specific processing + files_by_language = self._group_documents_by_language(index.documents) + + total_relationships_added = 0 + + for language, documents in files_by_language.items(): + if not documents: + continue + + try: + logger.info(f"Processing cross-document relationships for {len(documents)} {language} files") + factory = self.get_factory(language) + + # Delegate to language-specific implementation + relationships_added = factory.build_cross_document_relationships(documents, index) + total_relationships_added += relationships_added + + logger.info(f"Added {relationships_added} relationships for {language} files") + + except Exception as e: + logger.warning(f"Failed to build cross-document relationships for {language}: {e}") + # Fallback to legacy unified processing for this language + self._build_cross_document_relationships_legacy(index, documents) + + logger.info(f"Total cross-document relationships added: {total_relationships_added}") + + def _build_cross_document_relationships_legacy(self, index: scip_pb2.Index, documents_filter: List[scip_pb2.Document] = None) -> None: + """ + Legacy unified cross-document relationship building as fallback. + + This is the original implementation kept for fallback purposes. + """ + logger.info("Using legacy cross-document relationship building") + + # Use provided documents or all documents in index + documents_to_process = documents_filter if documents_filter else index.documents + + # Step 1: Build global symbol registry + symbol_registry = {} + for doc in documents_to_process: + for symbol_info in doc.symbols: + symbol_id = symbol_info.symbol + symbol_registry[symbol_id] = (doc, symbol_info) + + # Also register without suffix for function symbols + if symbol_info.kind == 11: # SymbolKind.Function + if symbol_id.endswith('().'): + base_id = symbol_id[:-3] # Remove '().' + symbol_registry[base_id] = (doc, symbol_info) + + logger.debug(f"Built legacy symbol registry with {len(symbol_registry)} entries") + + # Step 2: Analyze occurrences to build relationships + relationships_added = 0 + for source_doc in documents_to_process: + for occurrence in source_doc.occurrences: + # Skip if not a reference (we want ReadAccess = 8) + if not (occurrence.symbol_roles & 8): + continue + + # Skip if it's also a definition (Definition = 1) + if occurrence.symbol_roles & 1: + continue + + target_symbol_id = occurrence.symbol + + # Find the target symbol being referenced + target_entry = symbol_registry.get(target_symbol_id) + if not target_entry: + continue + + target_doc, target_symbol_info = target_entry + + # Skip self-references within same symbol + source_symbol_id = self._find_containing_symbol(occurrence, source_doc) + if not source_symbol_id or source_symbol_id == target_symbol_id: + continue + + # Create relationship (target is called by source) + # Only add if it's a function being called + if target_symbol_info.kind == 11: # SymbolKind.Function + relationship = scip_pb2.Relationship() + relationship.symbol = source_symbol_id + relationship.is_reference = True + relationship.is_implementation = False + relationship.is_type_definition = False + relationship.is_definition = False + + # Check if this relationship already exists to avoid duplicates + already_exists = any( + rel.symbol == source_symbol_id + for rel in target_symbol_info.relationships + ) + + if not already_exists: + target_symbol_info.relationships.append(relationship) + relationships_added += 1 + + logger.info(f"Added {relationships_added} legacy cross-document relationships") + + def _find_containing_symbol(self, occurrence, document) -> Optional[str]: + """ + Find which symbol contains this occurrence based on position. + + Args: + occurrence: The occurrence to locate + document: The document containing the occurrence + + Returns: + Symbol ID of the containing symbol, or None if not found + """ + if not occurrence.range or not occurrence.range.start: + return None + + occurrence_line = occurrence.range.start[0] if len(occurrence.range.start) > 0 else 0 + + # Find the symbol that contains this occurrence + best_symbol = None + for symbol_info in document.symbols: + # We need to determine if the occurrence is within this symbol's scope + # This is a simplified approach - ideally we'd have proper scope ranges + # For now, we'll use a heuristic based on symbol type + + # If it's a module-level symbol (no parent), it could contain the occurrence + if not best_symbol: + best_symbol = symbol_info.symbol + + # If no containing symbol found, use file-level context + if not best_symbol and document.relative_path: + file_name = document.relative_path.replace('\\', '/').split('/')[-1] + return f"local {file_name}#" + + return best_symbol + + def get_supported_languages(self) -> Set[str]: + """Get all supported languages.""" + return set(self._factory_creators.keys()) + + def get_language_statistics(self, file_paths: List[str]) -> Dict[str, int]: + """Get statistics about language distribution in file list.""" + stats = {} + for file_path in file_paths: + language = self.detect_language(file_path) + stats[language] = stats.get(language, 0) + 1 + return stats + + def _read_file_content(self, file_path: str) -> Optional[str]: + """Read file content safely.""" + try: + with open(file_path, 'r', encoding='utf-8', errors='replace') as f: + return f.read() + except Exception as e: + logger.warning(f"Failed to read file {file_path}: {e}") + return None + + def _group_files_by_language(self, file_paths: List[str]) -> Dict[str, List[str]]: + """Group files by their detected language.""" + groups = {} + for file_path in file_paths: + language = self.detect_language(file_path) + if language not in groups: + groups[language] = [] + groups[language].append(file_path) + return groups + + def _group_documents_by_language(self, documents: List[scip_pb2.Document]) -> Dict[str, List[scip_pb2.Document]]: + """Group SCIP documents by their language.""" + groups = {} + for doc in documents: + language = self._get_document_language(doc) + if language not in groups: + groups[language] = [] + groups[language].append(doc) + return groups + + def _discover_project_files(self) -> List[str]: + """Auto-discover files in the project directory.""" + files = [] + project_path = Path(self.project_root) + + # Common exclude patterns + exclude_patterns = { + '.git', '__pycache__', 'node_modules', '.vscode', '.idea', + '.pytest_cache', '.mypy_cache', 'dist', 'build' + } + + for file_path in project_path.rglob('*'): + if file_path.is_file(): + # Skip excluded directories + if any(part in exclude_patterns for part in file_path.parts): + continue + + # Only include files with known extensions or force fallback + extension = file_path.suffix.lower() + if extension in self._extension_mapping or extension: + files.append(str(file_path)) + + logger.info(f"Discovered {len(files)} files in project") + return files + + def _get_document_language(self, document: scip_pb2.Document) -> str: + """Extract language from document.""" + if hasattr(document, 'language') and document.language: + return document.language + + # Fallback: detect from file path + return self.detect_language(document.relative_path) if document.relative_path else 'fallback' + + +# Convenience function for quick usage +def create_language_manager(project_root: str) -> SCIPLanguageManager: + """Create a new SCIP Language Manager for the given project.""" + return SCIPLanguageManager(project_root) \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/__init__.py b/src/code_index_mcp/scip/strategies/__init__.py deleted file mode 100644 index 3fb54fa..0000000 --- a/src/code_index_mcp/scip/strategies/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""SCIP indexing strategies.""" - -from .base_strategy import SCIPIndexerStrategy - -__all__ = ['SCIPIndexerStrategy'] \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/base_strategy.py b/src/code_index_mcp/scip/strategies/base_strategy.py deleted file mode 100644 index 56972ef..0000000 --- a/src/code_index_mcp/scip/strategies/base_strategy.py +++ /dev/null @@ -1,432 +0,0 @@ -"""Base strategy interface for SCIP indexing - SCIP standard compliant.""" - -from abc import ABC, abstractmethod -from typing import List, Optional, Dict, Any -import logging - -from ..proto import scip_pb2 -from ..core.symbol_manager import SCIPSymbolManager -from ..core.position_calculator import PositionCalculator -from ..core.local_reference_resolver import LocalReferenceResolver -from ..core.relationship_manager import SCIPRelationshipManager -from ..core.relationship_types import SCIPRelationshipMapper, InternalRelationshipType - - -logger = logging.getLogger(__name__) - - -class SCIPIndexerStrategy(ABC): - """ - Base class for all SCIP indexing strategies. - - This version is fully compliant with SCIP standards and includes: - - Standard SCIP symbol ID generation - - Accurate position calculation - - Local cross-file reference resolution - - Two-phase analysis (symbol collection + reference resolution) - """ - - def __init__(self, priority: int = 50): - """ - Initialize the strategy with a priority level. - - Args: - priority: Strategy priority (higher = more preferred) - 100 = Official tools (highest) - 90 = Language-specific strategies - 50 = Custom strategies (primary) - 25 = Language-specialized defaults - 10 = Generic defaults - 1 = Fallback (lowest) - """ - self.priority = priority - - # Core components (initialized per project) - self.symbol_manager: Optional[SCIPSymbolManager] = None - self.reference_resolver: Optional[LocalReferenceResolver] = None - self.position_calculator: Optional[PositionCalculator] = None - self.relationship_manager: Optional[SCIPRelationshipManager] = None - self.relationship_mapper: Optional[SCIPRelationshipMapper] = None - - @abstractmethod - def can_handle(self, extension: str, file_path: str) -> bool: - """ - Check if this strategy can handle the given file type. - - Args: - extension: File extension (e.g., '.py') - file_path: Full path to the file - - Returns: - True if this strategy can handle the file - """ - - @abstractmethod - def get_language_name(self) -> str: - """ - Get the language name for SCIP symbol generation. - - Returns: - Language name (e.g., 'python', 'javascript', 'java') - """ - - def generate_scip_documents(self, files: List[str], project_path: str) -> List[scip_pb2.Document]: - """ - Generate SCIP documents for the given files using two-phase analysis. - - Args: - files: List of file paths to index - project_path: Root path of the project - - Returns: - List of SCIP Document objects - - Raises: - StrategyError: If the strategy cannot process the files - """ - import os - from datetime import datetime - strategy_name = self.__class__.__name__ - - logger.info(f"🐍 {strategy_name}: Starting indexing of {len(files)} files") - logger.debug(f"Files to process: {[os.path.basename(f) for f in files[:5]]}" + - (f" ... and {len(files)-5} more" if len(files) > 5 else "")) - - try: - # Initialize core components for this project - logger.debug(f"🔧 {strategy_name}: Initializing components...") - self._initialize_components(project_path) - logger.debug(f"✅ {strategy_name}: Component initialization completed") - - # Phase 1: Collect all symbol definitions - logger.info(f"📋 {strategy_name}: Phase 1 - Collecting symbol definitions from {len(files)} files") - self._collect_symbol_definitions(files, project_path) - logger.info(f"✅ {strategy_name}: Phase 1 completed") - - # Phase 2: Build symbol relationships - logger.info(f"🔗 {strategy_name}: Phase 2 - Building symbol relationships") - relationships = self._build_symbol_relationships(files, project_path) - total_relationships = sum(len(rels) for rels in relationships.values()) - logger.info(f"✅ {strategy_name}: Phase 2 completed, built {total_relationships} relationships for {len(relationships)} symbols") - - # Phase 3: Generate complete SCIP documents with resolved references and relationships - logger.info(f"📄 {strategy_name}: Phase 3 - Generating SCIP documents with resolved references and relationships") - documents = self._generate_documents_with_references(files, project_path, relationships) - logger.info(f"✅ {strategy_name}: Phase 3 completed, generated {len(documents)} documents") - - # Log statistics - if self.reference_resolver: - stats = self.reference_resolver.get_project_statistics() - logger.info(f"📊 {strategy_name}: Statistics - {stats['total_definitions']} definitions, " - f"{stats['total_references']} references, {stats['files_with_symbols']} files") - - logger.info(f"🎉 {strategy_name}: Indexing completed") - - return documents - - except Exception as e: - logger.error(f"❌ {strategy_name}: Failed: {e}") - raise StrategyError(f"Failed to generate SCIP documents: {e}") from e - - def get_external_symbols(self): - """Get external symbol information from symbol manager.""" - if self.symbol_manager: - return self.symbol_manager.get_external_symbols() - return [] - - def get_dependencies(self): - """Get dependency information from symbol manager.""" - if self.symbol_manager: - return self.symbol_manager.get_dependencies() - return {} - - def _initialize_components(self, project_path: str) -> None: - """Initialize core components for the project.""" - import os - project_name = os.path.basename(project_path) - - self.symbol_manager = SCIPSymbolManager(project_path, project_name) - self.reference_resolver = LocalReferenceResolver(project_path) - self.relationship_manager = SCIPRelationshipManager() - self.relationship_mapper = SCIPRelationshipMapper() - - logger.debug(f"Initialized components for project: {project_name}") - - @abstractmethod - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """ - Phase 1: Collect all symbol definitions from files. - - This phase should: - 1. Parse each file - 2. Extract symbol definitions - 3. Register them with the reference resolver - - Args: - files: List of file paths to process - project_path: Project root path - """ - - @abstractmethod - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """ - Phase 3: Generate complete SCIP documents with resolved references and relationships. - - This phase should: - 1. Parse each file again - 2. Generate occurrences for definitions and references - 3. Resolve references using the reference resolver - 4. Add relationships to symbol information - 5. Create complete SCIP documents - - Args: - files: List of file paths to process - project_path: Project root path - relationships: Optional dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - - Returns: - List of complete SCIP documents - """ - - @abstractmethod - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between symbols. - - This method should analyze symbol relationships and return a mapping - from symbol IDs to their relationships. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - - def _create_scip_relationships(self, symbol_relationships: List[tuple]) -> List[scip_pb2.Relationship]: - """ - Create SCIP relationships from symbol relationship tuples. - - Args: - symbol_relationships: List of (target_symbol, relationship_type) tuples - - Returns: - List of SCIP Relationship objects - """ - if not self.relationship_mapper: - logger.warning("Relationship mapper not initialized, returning empty relationships") - return [] - - try: - relationships = [] - for target_symbol, relationship_type in symbol_relationships: - if isinstance(relationship_type, str): - # Convert string to enum if needed - try: - relationship_type = InternalRelationshipType(relationship_type) - except ValueError: - logger.warning(f"Unknown relationship type: {relationship_type}") - continue - - scip_rel = self.relationship_mapper.map_to_scip_relationship( - target_symbol, relationship_type - ) - relationships.append(scip_rel) - - logger.debug(f"Created {len(relationships)} SCIP relationships") - return relationships - - except Exception as e: - logger.error(f"Failed to create SCIP relationships: {e}") - return [] - - def get_priority(self) -> int: - """Return the strategy priority.""" - return self.priority - - def get_strategy_name(self) -> str: - """Return a human-readable name for this strategy.""" - class_name = self.__class__.__name__ - return class_name - - def is_available(self) -> bool: - """ - Check if this strategy is available and ready to use. - - Returns: - True if the strategy can be used - """ - return True - - def _read_file_content(self, file_path: str) -> Optional[str]: - """ - Read file content with encoding detection. - - Args: - file_path: Path to file - - Returns: - File content or None if reading fails - """ - try: - # Try different encodings - encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] - - for encoding in encodings: - try: - with open(file_path, 'r', encoding=encoding) as f: - return f.read() - except UnicodeDecodeError: - continue - - logger.warning(f"Could not decode {file_path} with any encoding") - return None - - except (OSError, PermissionError, FileNotFoundError) as e: - logger.warning(f"Could not read {file_path}: {e}") - return None - - def _get_relative_path(self, file_path: str, project_path: str) -> str: - """ - Get relative path from project root. - - Args: - file_path: Absolute or relative file path - project_path: Project root path - - Returns: - Relative path from project root - """ - try: - from pathlib import Path - path = Path(file_path) - if path.is_absolute(): - return str(path.relative_to(Path(project_path))) - return file_path - except ValueError: - # If path is not under project_path, return as-is - return file_path - - def _create_scip_occurrence(self, - symbol_id: str, - range_obj: scip_pb2.Range, - symbol_roles: int, - syntax_kind: int) -> scip_pb2.Occurrence: - """ - Create a SCIP occurrence. - - Args: - symbol_id: SCIP symbol ID - range_obj: SCIP Range object - symbol_roles: SCIP symbol roles - syntax_kind: SCIP syntax kind - - Returns: - SCIP Occurrence object - """ - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = symbol_roles - occurrence.syntax_kind = syntax_kind - occurrence.range.CopyFrom(range_obj) - - return occurrence - - def _create_scip_symbol_information(self, - symbol_id: str, - display_name: str, - symbol_kind: int, - documentation: List[str] = None, - relationships: List[scip_pb2.Relationship] = None) -> scip_pb2.SymbolInformation: - """ - Create SCIP symbol information with relationships. - - Args: - symbol_id: SCIP symbol ID - display_name: Human-readable name - symbol_kind: SCIP symbol kind - documentation: Optional documentation - relationships: Optional relationships - - Returns: - SCIP SymbolInformation object with relationships - """ - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = display_name - symbol_info.kind = symbol_kind - - if documentation: - symbol_info.documentation.extend(documentation) - - # Add relationships if provided - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _register_symbol_definition(self, symbol_id: str, file_path: str, - definition_range: scip_pb2.Range, symbol_kind: int, - display_name: str, documentation: List[str] = None) -> None: - """ - Register a symbol definition with the reference resolver. - - Args: - symbol_id: SCIP symbol ID - file_path: File path where symbol is defined - definition_range: SCIP range object for definition - symbol_kind: SCIP symbol kind - display_name: Human-readable name - documentation: Optional documentation - """ - if not self.reference_resolver: - logger.warning("Reference resolver not initialized, skipping symbol registration") - return - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=definition_range, - symbol_kind=symbol_kind, - display_name=display_name, - documentation=documentation or [] - ) - - def _check_components_initialized(self) -> bool: - """ - Check if all required components are initialized. - - Returns: - True if all components are ready - - Raises: - StrategyError: If required components are not initialized - """ - missing_components = [] - - if not self.symbol_manager: - missing_components.append("symbol_manager") - if not self.reference_resolver: - missing_components.append("reference_resolver") - if not self.relationship_manager: - missing_components.append("relationship_manager") - if not self.relationship_mapper: - missing_components.append("relationship_mapper") - - if missing_components: - raise StrategyError(f"Required components not initialized: {', '.join(missing_components)}") - - return True - - -class StrategyError(Exception): - """Base exception for strategy-related errors.""" - - -class ToolUnavailableError(StrategyError): - """Raised when a required tool is not available.""" - - -class ConversionError(StrategyError): - """Raised when conversion to SCIP format fails.""" \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/fallback_strategy.py b/src/code_index_mcp/scip/strategies/fallback_strategy.py deleted file mode 100644 index 7abb407..0000000 --- a/src/code_index_mcp/scip/strategies/fallback_strategy.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Simplified fallback SCIP indexing strategy - minimal file information only.""" - -import logging -import os -from typing import List, Optional, Dict, Any -from pathlib import Path - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ...constants import SUPPORTED_EXTENSIONS - - -logger = logging.getLogger(__name__) - - -class FallbackStrategy(SCIPIndexerStrategy): - """Simplified SCIP-compliant fallback strategy providing only basic file information.""" - - def __init__(self, priority: int = 10): - """Initialize the fallback strategy with low priority.""" - super().__init__(priority) - - def can_handle(self, extension: str, file_path: str) -> bool: - """This strategy can handle supported file extensions as a last resort.""" - return extension.lower() in SUPPORTED_EXTENSIONS - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "text" # Generic text language - - def is_available(self) -> bool: - """Check if this strategy is available.""" - return True # Always available as fallback - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Simple file counting - no symbol collection.""" - logger.debug(f"FallbackStrategy Phase 1: Processing {len(files)} files for basic cataloging") - processed_count = 0 - - for file_path in files: - try: - relative_path = os.path.relpath(file_path, project_path) - # Just count files, no symbol extraction - processed_count += 1 - logger.debug(f"Registered file: {relative_path}") - except Exception as e: - logger.warning(f"Phase 1 failed for {file_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files registered") - - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 2: Generate minimal SCIP documents with basic file information.""" - documents = [] - logger.debug(f"FallbackStrategy Phase 2: Creating basic documents for {len(files)} files") - processed_count = 0 - - for file_path in files: - try: - document = self._create_basic_document(file_path, project_path) - if document: - documents.append(document) - processed_count += 1 - - except Exception as e: - logger.warning(f"Phase 2 failed for {file_path}: {e}") - continue - - logger.info(f"Phase 2 summary: {processed_count} basic documents created") - return documents - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """Skip relationship building - return empty dict.""" - logger.debug("FallbackStrategy: Skipping relationship building (minimal mode)") - return {} - - def _create_basic_document(self, file_path: str, project_path: str) -> Optional[scip_pb2.Document]: - """Create a minimal SCIP document with basic file information.""" - try: - # Check if file exists and get basic info - if not os.path.exists(file_path): - return None - - file_stats = os.stat(file_path) - relative_path = os.path.relpath(file_path, project_path) - - # Create basic document - document = scip_pb2.Document() - document.relative_path = relative_path - document.language = self._detect_language_from_extension(Path(file_path).suffix) - - # Add basic file symbol - file_name = Path(file_path).stem - symbol_id = self.symbol_manager.create_local_symbol( - language=document.language, - file_path=relative_path, - symbol_path=[file_name], - descriptor="" - ) - - # Create minimal symbol information - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = file_name - symbol_info.kind = scip_pb2.File - symbol_info.documentation.append( - f"File: {relative_path} ({document.language})" - ) - - document.symbols.append(symbol_info) - - logger.debug(f"Created basic document for: {relative_path}") - return document - - except Exception as e: - logger.warning(f"Failed to create basic document for {file_path}: {e}") - return None - - def _detect_language_from_extension(self, extension: str) -> str: - """Detect specific language from extension.""" - extension_mapping = { - # Programming languages - '.c': 'c', - '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.c++': 'cpp', - '.h': 'c', '.hpp': 'cpp', '.hh': 'cpp', '.hxx': 'cpp', - '.js': 'javascript', '.mjs': 'javascript', '.jsx': 'javascript', - '.ts': 'typescript', '.tsx': 'typescript', - '.py': 'python', '.pyi': 'python', '.pyx': 'python', - '.java': 'java', - '.go': 'go', - '.rs': 'rust', - '.rb': 'ruby', - '.cs': 'csharp', - '.php': 'php', - '.swift': 'swift', - '.kt': 'kotlin', '.kts': 'kotlin', - '.scala': 'scala', - '.r': 'r', - '.lua': 'lua', - '.perl': 'perl', '.pl': 'perl', - '.zig': 'zig', - '.dart': 'dart', - - # Web and markup - '.html': 'html', '.htm': 'html', - '.css': 'css', - '.scss': 'scss', '.sass': 'sass', - '.less': 'less', - '.vue': 'vue', - '.svelte': 'svelte', - '.astro': 'astro', - - # Data and config - '.json': 'json', - '.xml': 'xml', - '.yaml': 'yaml', '.yml': 'yaml', - '.toml': 'toml', - '.ini': 'ini', - '.cfg': 'ini', - '.conf': 'ini', - - # Documentation - '.md': 'markdown', '.markdown': 'markdown', - '.mdx': 'mdx', - '.tex': 'latex', - '.rst': 'rst', - - # Database and query - '.sql': 'sql', - '.cql': 'cql', - '.cypher': 'cypher', - '.sparql': 'sparql', - '.graphql': 'graphql', '.gql': 'graphql', - - # Shell and scripts - '.sh': 'shell', '.bash': 'bash', - '.zsh': 'zsh', '.fish': 'fish', - '.ps1': 'powershell', - '.bat': 'batch', '.cmd': 'batch', - - # Template languages - '.handlebars': 'handlebars', '.hbs': 'handlebars', - '.ejs': 'ejs', - '.pug': 'pug', - '.mustache': 'mustache', - - # Other - '.dockerfile': 'dockerfile', - '.gitignore': 'gitignore', - '.env': 'dotenv', - } - - return extension_mapping.get(extension.lower(), 'text') diff --git a/src/code_index_mcp/scip/strategies/java_strategy.py b/src/code_index_mcp/scip/strategies/java_strategy.py deleted file mode 100644 index ea2409a..0000000 --- a/src/code_index_mcp/scip/strategies/java_strategy.py +++ /dev/null @@ -1,624 +0,0 @@ -"""Java SCIP indexing strategy v4 - Tree-sitter based with Python strategy architecture.""" - -import logging -import os -from typing import List, Optional, Dict, Any, Set - -try: - import tree_sitter - from tree_sitter_java import language as java_language - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - - -logger = logging.getLogger(__name__) - - -class JavaStrategy(SCIPIndexerStrategy): - """SCIP-compliant Java indexing strategy using Tree-sitter with Python strategy architecture.""" - - SUPPORTED_EXTENSIONS = {'.java'} - - def __init__(self, priority: int = 95): - """Initialize the Java strategy v4.""" - super().__init__(priority) - - if not TREE_SITTER_AVAILABLE: - raise StrategyError("Tree-sitter not available for Java strategy") - - # Initialize Java parser - java_lang = tree_sitter.Language(java_language()) - self.parser = tree_sitter.Parser(java_lang) - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - return extension.lower() in self.SUPPORTED_EXTENSIONS and TREE_SITTER_AVAILABLE - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "java" - - def is_available(self) -> bool: - """Check if this strategy is available.""" - return TREE_SITTER_AVAILABLE - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from Java files.""" - logger.debug(f"JavaStrategy Phase 1: Processing {len(files)} files for symbol collection") - processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - self._collect_symbols_from_file(file_path, project_path) - processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - - except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") - - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" - documents = [] - logger.debug(f"JavaStrategy Phase 2: Generating documents for {len(files)} files") - processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - document = self._analyze_java_file(file_path, project_path, relationships) - if document: - documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) - processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 2 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - - except Exception as e: - error_count += 1 - logger.error(f"Phase 2 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 2 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between Java symbols. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"JavaStrategy: Building symbol relationships for {len(files)} files") - - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_java_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"JavaStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single Java file.""" - content = self._read_file_content(file_path) - if not content: - return - - tree = self._parse_content(content) - if not tree: - return - - relative_path = self._get_relative_path(file_path, project_path) - self._collect_symbols_from_tree(tree, relative_path, content) - - def _analyze_java_file(self, file_path: str, project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> Optional[scip_pb2.Document]: - """Analyze a single Java file and generate complete SCIP document.""" - content = self._read_file_content(file_path) - if not content: - return None - - tree = self._parse_content(content) - if not tree: - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = self._get_relative_path(file_path, project_path) - document.language = self.get_language_name() - - # Analyze Tree-sitter AST and generate occurrences - self.position_calculator = PositionCalculator(content) - occurrences, symbols = self._analyze_tree_for_document(tree, document.relative_path, content, relationships) - - # Add results to document - document.occurrences.extend(occurrences) - document.symbols.extend(symbols) - - logger.debug(f"Analyzed Java file {document.relative_path}: " - f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document - - def _parse_content(self, content: str) -> Optional[tree_sitter.Tree]: - """Parse Java content with Tree-sitter.""" - try: - return self.parser.parse(bytes(content, "utf8")) - except Exception as e: - logger.error(f"Failed to parse Java content: {e}") - return None - - def _collect_symbols_from_tree(self, tree: tree_sitter.Tree, file_path: str, content: str) -> None: - """Collect symbols from Tree-sitter tree using integrated visitor (Phase 1).""" - root = tree.root_node - - for node in self._walk_tree(root): - if node.type == "class_declaration": - self._register_class_symbol(node, file_path, content) - elif node.type == "interface_declaration": - self._register_interface_symbol(node, file_path, content) - elif node.type == "enum_declaration": - self._register_enum_symbol(node, file_path, content) - elif node.type == "method_declaration": - self._register_method_symbol(node, file_path, content) - elif node.type == "constructor_declaration": - self._register_constructor_symbol(node, file_path, content) - - def _analyze_tree_for_document(self, tree: tree_sitter.Tree, file_path: str, content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[List[scip_pb2.Occurrence], List[scip_pb2.SymbolInformation]]: - """Analyze Tree-sitter tree to generate occurrences and symbols for SCIP document (Phase 2).""" - occurrences = [] - symbols = [] - root = tree.root_node - - for node in self._walk_tree(root): - if node.type == "class_declaration": - symbol_id = self._create_class_symbol_id(node, file_path, content) - occurrence = self._create_class_occurrence(node, symbol_id) - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_class_symbol_info(node, symbol_id, content, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - elif node.type == "interface_declaration": - symbol_id = self._create_interface_symbol_id(node, file_path, content) - occurrence = self._create_interface_occurrence(node, symbol_id) - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_interface_symbol_info(node, symbol_id, content, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - elif node.type in ["method_declaration", "constructor_declaration"]: - symbol_id = self._create_method_symbol_id(node, file_path, content) - occurrence = self._create_method_occurrence(node, symbol_id) - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_method_symbol_info(node, symbol_id, content, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - return occurrences, symbols - - def _extract_java_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract relationships from a single Java file.""" - logger.debug(f"JavaStrategy: Starting relationship extraction from {file_path}") - - content = self._read_file_content(file_path) - if not content: - logger.debug(f"JavaStrategy: No content found in {file_path}") - return {} - - tree = self._parse_content(content) - if not tree: - logger.debug(f"JavaStrategy: Failed to parse {file_path} with Tree-sitter") - return {} - - relative_path = self._get_relative_path(file_path, project_path) - relationships = self._extract_relationships_from_tree(tree, relative_path, content) - - logger.debug(f"JavaStrategy: Extracted {len(relationships)} relationships from {relative_path}") - return relationships - - def _extract_relationships_from_tree(self, tree: tree_sitter.Tree, file_path: str, content: str) -> Dict[str, List[tuple]]: - """Extract relationships from Tree-sitter AST.""" - relationships = {} - root = tree.root_node - - for node in self._walk_tree(root): - if node.type == "class_declaration": - # Extract inheritance relationships - class_symbol_id = self._create_class_symbol_id(node, file_path, content) - - # Find extends clause - for child in node.children: - if child.type == "superclass": - for grandchild in child.children: - if grandchild.type == "type_identifier": - parent_name = grandchild.text.decode() - parent_symbol_id = self._create_class_symbol_id_by_name(parent_name, file_path) - if class_symbol_id not in relationships: - relationships[class_symbol_id] = [] - relationships[class_symbol_id].append((parent_symbol_id, InternalRelationshipType.INHERITS)) - - # Find implements clause - for child in node.children: - if child.type == "super_interfaces": - for interface_list in child.children: - if interface_list.type == "type_list": - for interface_type in interface_list.children: - if interface_type.type == "type_identifier": - interface_name = interface_type.text.decode() - interface_symbol_id = self._create_interface_symbol_id_by_name(interface_name, file_path) - if class_symbol_id not in relationships: - relationships[class_symbol_id] = [] - relationships[class_symbol_id].append((interface_symbol_id, InternalRelationshipType.IMPLEMENTS)) - - return relationships - - # Helper methods for Tree-sitter node processing - def _walk_tree(self, node: tree_sitter.Node): - """Walk through all nodes in a Tree-sitter tree.""" - yield node - for child in node.children: - yield from self._walk_tree(child) - - def _get_node_identifier(self, node: tree_sitter.Node) -> Optional[str]: - """Get the identifier name from a Tree-sitter node.""" - for child in node.children: - if child.type == "identifier": - return child.text.decode() - return None - - def _get_package_name(self, tree: tree_sitter.Tree) -> str: - """Extract package name from Tree-sitter tree.""" - root = tree.root_node - for node in self._walk_tree(root): - if node.type == "package_declaration": - for child in node.children: - if child.type == "scoped_identifier": - return child.text.decode() - return "" - - # Symbol creation methods (similar to Python strategy) - def _register_class_symbol(self, node: tree_sitter.Node, file_path: str, content: str) -> None: - """Register a class symbol definition.""" - name = self._get_node_identifier(node) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Class, - display_name=name, - documentation=["Java class"] - ) - - def _register_interface_symbol(self, node: tree_sitter.Node, file_path: str, content: str) -> None: - """Register an interface symbol definition.""" - name = self._get_node_identifier(node) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Interface, - display_name=name, - documentation=["Java interface"] - ) - - def _register_enum_symbol(self, node: tree_sitter.Node, file_path: str, content: str) -> None: - """Register an enum symbol definition.""" - name = self._get_node_identifier(node) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Enum, - display_name=name, - documentation=["Java enum"] - ) - - def _register_method_symbol(self, node: tree_sitter.Node, file_path: str, content: str) -> None: - """Register a method symbol definition.""" - name = self._get_node_identifier(node) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="()." - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Method, - display_name=name, - documentation=["Java method"] - ) - - def _register_constructor_symbol(self, node: tree_sitter.Node, file_path: str, content: str) -> None: - """Register a constructor symbol definition.""" - name = self._get_node_identifier(node) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="()." - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Method, - display_name=name, - documentation=["Java constructor"] - ) - - # Symbol ID creation methods - def _create_class_symbol_id(self, node: tree_sitter.Node, file_path: str, content: str) -> str: - """Create symbol ID for a class.""" - name = self._get_node_identifier(node) - if not name: - return "" - return self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - def _create_class_symbol_id_by_name(self, name: str, file_path: str) -> str: - """Create symbol ID for a class by name.""" - return self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - def _create_interface_symbol_id(self, node: tree_sitter.Node, file_path: str, content: str) -> str: - """Create symbol ID for an interface.""" - name = self._get_node_identifier(node) - if not name: - return "" - return self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - def _create_interface_symbol_id_by_name(self, name: str, file_path: str) -> str: - """Create symbol ID for an interface by name.""" - return self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="#" - ) - - def _create_method_symbol_id(self, node: tree_sitter.Node, file_path: str, content: str) -> str: - """Create symbol ID for a method.""" - name = self._get_node_identifier(node) - if not name: - return "" - return self.symbol_manager.create_local_symbol( - language="java", - file_path=file_path, - symbol_path=[name], - descriptor="()." - ) - - # Occurrence creation methods (using PositionCalculator) - def _create_class_occurrence(self, node: tree_sitter.Node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for class.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierType - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_interface_occurrence(self, node: tree_sitter.Node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for interface.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierType - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_method_occurrence(self, node: tree_sitter.Node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for method.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierFunction - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - # Symbol information creation methods (with relationships) - def _create_class_symbol_info(self, node: tree_sitter.Node, symbol_id: str, content: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for class.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = self._get_node_identifier(node) or "Unknown" - symbol_info.kind = scip_pb2.Class - - # Add documentation - symbol_info.documentation.append("Java class") - - # Add relationships if provided - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_interface_symbol_info(self, node: tree_sitter.Node, symbol_id: str, content: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for interface.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = self._get_node_identifier(node) or "Unknown" - symbol_info.kind = scip_pb2.Interface - - symbol_info.documentation.append("Java interface") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_method_symbol_info(self, node: tree_sitter.Node, symbol_id: str, content: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for method.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = self._get_node_identifier(node) or "Unknown" - symbol_info.kind = scip_pb2.Method - - # Determine if it's a constructor or method - if node.type == "constructor_declaration": - symbol_info.documentation.append("Java constructor") - else: - symbol_info.documentation.append("Java method") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_scip_relationships(self, symbol_relationships: List[tuple]) -> List[scip_pb2.Relationship]: - """Convert internal relationships to SCIP relationships.""" - scip_relationships = [] - for target_symbol_id, relationship_type in symbol_relationships: - relationship = scip_pb2.Relationship() - relationship.symbol = target_symbol_id - relationship.is_reference = True - # Map relationship types to SCIP if needed - scip_relationships.append(relationship) - return scip_relationships \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/javascript_strategy.py b/src/code_index_mcp/scip/strategies/javascript_strategy.py deleted file mode 100644 index 489fd37..0000000 --- a/src/code_index_mcp/scip/strategies/javascript_strategy.py +++ /dev/null @@ -1,974 +0,0 @@ -"""JavaScript/TypeScript SCIP indexing strategy - SCIP standard compliant.""" - -import logging -import os -from typing import List, Optional, Dict, Any, Set - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - -# Tree-sitter imports -import tree_sitter -from tree_sitter_javascript import language as js_language -from tree_sitter_typescript import language_typescript as ts_language - - -logger = logging.getLogger(__name__) - - -class JavaScriptStrategy(SCIPIndexerStrategy): - """SCIP-compliant JavaScript/TypeScript indexing strategy using Tree-sitter.""" - - SUPPORTED_EXTENSIONS = {'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'} - - def __init__(self, priority: int = 95): - """Initialize the JavaScript/TypeScript strategy.""" - super().__init__(priority) - - # Initialize parsers - try: - js_lang = tree_sitter.Language(js_language()) - ts_lang = tree_sitter.Language(ts_language()) - - self.js_parser = tree_sitter.Parser(js_lang) - self.ts_parser = tree_sitter.Parser(ts_lang) - logger.info("JavaScript strategy initialized with Tree-sitter support") - except Exception as e: - logger.error(f"Failed to initialize JavaScript strategy: {e}") - self.js_parser = None - self.ts_parser = None - - # Initialize dependency tracking - self.dependencies = { - 'imports': { - 'standard_library': [], - 'third_party': [], - 'local': [] - } - } - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - return extension.lower() in self.SUPPORTED_EXTENSIONS - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "javascript" - - def is_available(self) -> bool: - """Check if this strategy is available.""" - return self.js_parser is not None and self.ts_parser is not None - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from JavaScript/TypeScript files.""" - logger.debug(f"JavaScriptStrategy Phase 1: Processing {len(files)} files for symbol collection") - processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - self._collect_symbols_from_file(file_path, project_path) - processed_count += 1 - - if i % 10 == 0 or i == len(files): # Progress every 10 files or at end - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - - except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") - - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" - documents = [] - logger.debug(f"JavaScriptStrategy Phase 2: Generating documents for {len(files)} files") - processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - document = self._analyze_javascript_file(file_path, project_path, relationships) - if document: - documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) - processed_count += 1 - - if i % 10 == 0 or i == len(files): # Progress every 10 files or at end - logger.debug(f"Phase 2 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - - except Exception as e: - error_count += 1 - logger.error(f"Phase 2 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 2 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between JavaScript/TypeScript symbols. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"JavaScriptStrategy: Building symbol relationships for {len(files)} files") - - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"JavaScriptStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single JavaScript/TypeScript file.""" - - # Reset dependencies for this file - self._reset_dependencies() - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {os.path.relpath(file_path, project_path)}") - return - - # Parse with Tree-sitter - try: - tree = self._parse_js_content(content, file_path) - if not tree or not tree.root_node: - raise StrategyError(f"Failed to parse {os.path.relpath(file_path, project_path)}") - except Exception as e: - logger.warning(f"Parse error in {os.path.relpath(file_path, project_path)}: {e}") - return - - # Collect symbols using integrated visitor - relative_path = self._get_relative_path(file_path, project_path) - self._collect_symbols_from_tree(tree, relative_path, content) - logger.debug(f"Symbol collection - {relative_path}") - - def _analyze_javascript_file(self, file_path: str, project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> Optional[scip_pb2.Document]: - """Analyze a single JavaScript/TypeScript file and generate complete SCIP document.""" - relative_path = self._get_relative_path(file_path, project_path) - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {relative_path}") - return None - - # Parse with Tree-sitter - try: - tree = self._parse_js_content(content, file_path) - if not tree or not tree.root_node: - raise StrategyError(f"Failed to parse {relative_path}") - except Exception as e: - logger.warning(f"Parse error in {relative_path}: {e}") - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = relative_path - document.language = self.get_language_name() - - # Analyze tree and generate occurrences - self.position_calculator = PositionCalculator(content) - - occurrences, symbols = self._analyze_tree_for_document(tree, relative_path, content, relationships) - - # Add results to document - document.occurrences.extend(occurrences) - document.symbols.extend(symbols) - - logger.debug(f"Document analysis - {relative_path}: " - f"-> {len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document - - def _extract_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """ - Extract relationships from a single JavaScript/TypeScript file. - - Args: - file_path: File to analyze - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - content = self._read_file_content(file_path) - if not content: - return {} - - try: - tree = self._parse_js_content(content, file_path) - if not tree or not tree.root_node: - raise StrategyError(f"Failed to parse {file_path} for relationship extraction") - except Exception as e: - logger.warning(f"Parse error in {file_path}: {e}") - return {} - - return self._extract_relationships_from_tree(tree, file_path, project_path) - - def _parse_js_content(self, content: str, file_path: str): - """Parse JavaScript/TypeScript content using Tree-sitter parser.""" - # Determine parser based on file extension - extension = os.path.splitext(file_path)[1].lower() - - if extension in {'.ts', '.tsx'}: - parser = self.ts_parser - else: - parser = self.js_parser - - if not parser: - raise StrategyError(f"No parser available for {extension}") - - content_bytes = content.encode('utf-8') - return parser.parse(content_bytes) - - def _collect_symbols_from_tree(self, tree, file_path: str, content: str) -> None: - """Collect symbols from Tree-sitter tree using integrated visitor.""" - # Use a set to track processed nodes and avoid duplicates - self._processed_nodes = set() - scope_stack = [] - - def visit_node(node, current_scope_stack=None): - if current_scope_stack is None: - current_scope_stack = scope_stack[:] - - # Skip if already processed (by memory address) - node_id = id(node) - if node_id in self._processed_nodes: - return - self._processed_nodes.add(node_id) - - node_type = node.type - - # Traditional function and class declarations - if node_type in ['function_declaration', 'method_definition', 'arrow_function']: - name = self._get_js_function_name(node) - if name: - self._register_function_symbol(node, name, file_path, current_scope_stack) - elif node_type in ['class_declaration']: - name = self._get_js_class_name(node) - if name: - self._register_class_symbol(node, name, file_path, current_scope_stack) - - # Assignment expressions with function expressions (obj.method = function() {}) - elif node_type == 'assignment_expression': - self._handle_assignment_expression(node, file_path, current_scope_stack) - - # Lexical declarations (const, let, var) - elif node_type == 'lexical_declaration': - self._handle_lexical_declaration(node, file_path, current_scope_stack) - - # Expression statements (might contain method chains) - elif node_type == 'expression_statement': - self._handle_expression_statement(node, file_path, current_scope_stack) - - # Recursively visit children - for child in node.children: - visit_node(child, current_scope_stack) - - visit_node(tree.root_node) - - def _analyze_tree_for_document(self, tree, file_path: str, content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[List[scip_pb2.Occurrence], List[scip_pb2.SymbolInformation]]: - """Analyze Tree-sitter tree to generate occurrences and symbols for SCIP document.""" - occurrences = [] - symbols = [] - scope_stack = [] - - # Use the same processed nodes set to avoid duplicates - if not hasattr(self, '_processed_nodes'): - self._processed_nodes = set() - - def visit_node(node, current_scope_stack=None): - if current_scope_stack is None: - current_scope_stack = scope_stack[:] - - node_type = node.type - - # Traditional function and class declarations - if node_type in ['function_declaration', 'method_definition', 'arrow_function']: - name = self._get_js_function_name(node) - if name: - symbol_id = self._create_function_symbol_id(name, file_path, current_scope_stack) - occurrence = self._create_function_occurrence(node, symbol_id) - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - symbol_info = self._create_function_symbol_info(node, symbol_id, name, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - elif node_type in ['class_declaration']: - name = self._get_js_class_name(node) - if name: - symbol_id = self._create_class_symbol_id(name, file_path, current_scope_stack) - occurrence = self._create_class_occurrence(node, symbol_id) - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - symbol_info = self._create_class_symbol_info(node, symbol_id, name, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - # Assignment expressions with function expressions - elif node_type == 'assignment_expression': - occurrence, symbol_info = self._handle_assignment_for_document(node, file_path, current_scope_stack, relationships) - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - # Lexical declarations - elif node_type == 'lexical_declaration': - document_symbols = self._handle_lexical_for_document(node, file_path, current_scope_stack, relationships) - for occ, sym in document_symbols: - if occ: - occurrences.append(occ) - if sym: - symbols.append(sym) - - # Recursively visit children only if not in assignment or lexical that we handle above - if node_type not in ['assignment_expression', 'lexical_declaration']: - for child in node.children: - visit_node(child, current_scope_stack) - - visit_node(tree.root_node) - return occurrences, symbols - - def _extract_relationships_from_tree(self, tree, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract relationships from Tree-sitter tree.""" - relationships = {} - scope_stack = [] - relative_path = self._get_relative_path(file_path, project_path) - - def visit_node(node, current_scope_stack=None): - if current_scope_stack is None: - current_scope_stack = scope_stack[:] - - node_type = node.type - - if node_type == 'class_declaration': - # Extract inheritance relationships - class_name = self._get_js_class_name(node) - if class_name: - class_symbol_id = self._create_class_symbol_id(class_name, relative_path, current_scope_stack) - - # Look for extends clause - for child in node.children: - if child.type == 'class_heritage': - for heritage_child in child.children: - if heritage_child.type == 'identifier': - parent_name = self._get_node_text(heritage_child) - if parent_name: - parent_symbol_id = self._create_class_symbol_id(parent_name, relative_path, current_scope_stack) - if class_symbol_id not in relationships: - relationships[class_symbol_id] = [] - relationships[class_symbol_id].append((parent_symbol_id, InternalRelationshipType.INHERITS)) - - elif node_type in ['function_declaration', 'method_definition', 'arrow_function']: - # Extract function call relationships - function_name = self._get_js_function_name(node) - if function_name: - function_symbol_id = self._create_function_symbol_id(function_name, relative_path, current_scope_stack) - - # Find call expressions within this function - self._extract_calls_from_node(node, function_symbol_id, relationships, relative_path, current_scope_stack) - - # Recursively visit children - for child in node.children: - visit_node(child, current_scope_stack) - - visit_node(tree.root_node) - return relationships - - def _extract_calls_from_node(self, node, source_symbol_id: str, relationships: Dict, file_path: str, scope_stack: List): - """Extract function calls from a node.""" - - def visit_for_calls(n): - if n.type == 'call_expression': - # Get the function being called - function_node = n.children[0] if n.children else None - if function_node: - if function_node.type == 'identifier': - target_name = self._get_node_text(function_node) - if target_name: - target_symbol_id = self._create_function_symbol_id(target_name, file_path, scope_stack) - if source_symbol_id not in relationships: - relationships[source_symbol_id] = [] - relationships[source_symbol_id].append((target_symbol_id, InternalRelationshipType.CALLS)) - - for child in n.children: - visit_for_calls(child) - - visit_for_calls(node) - - # Helper methods for Tree-sitter node processing - def _get_node_text(self, node) -> Optional[str]: - """Get text content of a Tree-sitter node.""" - if hasattr(node, 'text'): - try: - return node.text.decode('utf-8') - except: - pass - return None - - def _get_js_function_name(self, node) -> Optional[str]: - """Extract function name from function node.""" - for child in node.children: - if child.type == 'identifier': - return self._get_node_text(child) - return None - - def _get_js_class_name(self, node) -> Optional[str]: - """Extract class name from class node.""" - for child in node.children: - if child.type == 'identifier': - return self._get_node_text(child) - return None - - # Helper methods - def _register_function_symbol(self, node, name: str, file_path: str, scope_stack: List[str]) -> None: - """Register a function symbol definition.""" - symbol_id = self._create_function_symbol_id(name, file_path, scope_stack) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Function, - display_name=name, - documentation=["JavaScript function"] - ) - - def _register_class_symbol(self, node, name: str, file_path: str, scope_stack: List[str]) -> None: - """Register a class symbol definition.""" - symbol_id = self._create_class_symbol_id(name, file_path, scope_stack) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Class, - display_name=name, - documentation=["JavaScript class"] - ) - - def _create_function_symbol_id(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for function.""" - # SCIP standard: local - local_id = ".".join(scope_stack + [name]) if scope_stack else name - return f"local {local_id}()." - - def _create_class_symbol_id(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for class.""" - # SCIP standard: local - local_id = ".".join(scope_stack + [name]) if scope_stack else name - return f"local {local_id}#" - - def _create_function_occurrence(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for function.""" - if not self.position_calculator: - return None - - try: - # Use Tree-sitter position calculation method - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierFunction - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_class_occurrence(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for class.""" - if not self.position_calculator: - return None - - try: - # Use Tree-sitter position calculation method - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierType - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_function_symbol_info(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for function.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - symbol_info.kind = scip_pb2.Function - - # Add documentation - check for JSDoc or comments - symbol_info.documentation.append("JavaScript function") - - # Add relationships if provided - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_class_symbol_info(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for class.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - symbol_info.kind = scip_pb2.Class - - # Add documentation - check for JSDoc or comments - symbol_info.documentation.append("JavaScript class") - - # Add relationships if provided - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - # JavaScript-specific syntax handlers - def _handle_assignment_expression(self, node, file_path: str, scope_stack: List[str]) -> None: - """Handle assignment expressions like obj.method = function() {}""" - left_child = None - right_child = None - - for child in node.children: - if child.type == 'member_expression': - left_child = child - elif child.type in ['function_expression', 'arrow_function']: - right_child = child - - if left_child and right_child: - # Extract method name from member expression - method_name = self._extract_member_expression_name(left_child) - if method_name: - # Use just the last part as function name for cleaner identification - clean_name = method_name.split('.')[-1] if '.' in method_name else method_name - # Register as function symbol - self._register_function_symbol(right_child, clean_name, file_path, scope_stack + method_name.split('.')[:-1]) - - def _handle_lexical_declaration(self, node, file_path: str, scope_stack: List[str]) -> None: - """Handle lexical declarations like const VAR = value""" - for child in node.children: - if child.type == 'variable_declarator': - # Get variable name and value - var_name = None - var_value = None - - for declarator_child in child.children: - if declarator_child.type == 'identifier': - var_name = self._get_node_text(declarator_child) - elif declarator_child.type in ['object_expression', 'new_expression', 'call_expression']: - var_value = declarator_child - elif declarator_child.type == 'object_pattern': - # Handle destructuring like const { v4: uuidv4 } = require('uuid') - self._handle_destructuring_pattern(declarator_child, file_path, scope_stack) - - if var_name: - # Check if this is an import/require statement - if var_value and var_value.type == 'call_expression': - # Check if it's a require() call - is_require = False - for cc in var_value.children: - if cc.type == 'identifier' and self._get_node_text(cc) == 'require': - is_require = True - break - - if is_require: - self._handle_import_statement(var_name, var_value, file_path, scope_stack) - else: - # Register as variable (like const limiter = rateLimit(...)) - self._register_variable_symbol(child, var_name, file_path, scope_stack, var_value) - - # Extract functions from call_expression (like rateLimit config) - self._extract_functions_from_call_expression(var_value, var_name, file_path, scope_stack) - else: - # Register as constant/variable symbol - self._register_variable_symbol(child, var_name, file_path, scope_stack, var_value) - # Extract functions from object expressions - if var_value and var_value.type == 'object_expression': - self._extract_functions_from_object(var_value, var_name, file_path, scope_stack) - - def _handle_expression_statement(self, node, file_path: str, scope_stack: List[str]) -> None: - """Handle expression statements that might contain method chains""" - for child in node.children: - if child.type == 'call_expression': - # Look for method chain patterns like schema.virtual().get() - self._handle_method_chain(child, file_path, scope_stack) - elif child.type == 'assignment_expression': - # Handle nested assignment expressions - self._handle_assignment_expression(child, file_path, scope_stack) - - def _handle_method_chain(self, node, file_path: str, scope_stack: List[str]) -> None: - """Handle method chains like schema.virtual('name').get(function() {})""" - # Look for chained calls that end with function expressions - for child in node.children: - if child.type == 'member_expression': - # This could be a chained method call - member_name = self._extract_member_expression_name(child) - if member_name: - # Look for function arguments - for sibling in node.children: - if sibling.type == 'arguments': - for arg in sibling.children: - if arg.type in ['function_expression', 'arrow_function']: - # Register the function with a descriptive name - func_name = f"{member_name}_callback" - self._register_function_symbol(arg, func_name, file_path, scope_stack) - - def _extract_member_expression_name(self, node) -> Optional[str]: - """Extract name from member expression like obj.prop.method""" - parts = [] - - def extract_parts(n): - if n.type == 'member_expression': - # Process children in order: object first, then property - object_child = None - property_child = None - - for child in n.children: - if child.type in ['identifier', 'member_expression']: - object_child = child - elif child.type == 'property_identifier': - property_child = child - - # Recursively extract object part first - if object_child: - if object_child.type == 'member_expression': - extract_parts(object_child) - elif object_child.type == 'identifier': - parts.append(self._get_node_text(object_child)) - - # Then add the property - if property_child: - parts.append(self._get_node_text(property_child)) - - elif n.type == 'identifier': - parts.append(self._get_node_text(n)) - - extract_parts(node) - return '.'.join(parts) if parts else None - - def _register_variable_symbol(self, node, name: str, file_path: str, scope_stack: List[str], value_node=None) -> None: - """Register a variable/constant symbol definition.""" - symbol_id = self._create_variable_symbol_id(name, file_path, scope_stack, value_node) - - # Determine symbol type based on value - symbol_kind = scip_pb2.Variable - doc_type = "JavaScript variable" - - if value_node: - if value_node.type == 'object_expression': - symbol_kind = scip_pb2.Object - doc_type = "JavaScript object" - elif value_node.type == 'new_expression': - symbol_kind = scip_pb2.Variable # new expressions create variables, not classes - doc_type = "JavaScript instance" - elif value_node.type == 'call_expression': - # Check if it's a require call vs regular function call - is_require = False - for child in value_node.children: - if child.type == 'identifier' and self._get_node_text(child) == 'require': - is_require = True - break - if is_require: - symbol_kind = scip_pb2.Namespace - doc_type = "JavaScript import" - else: - symbol_kind = scip_pb2.Variable - doc_type = "JavaScript constant" - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=symbol_kind, - display_name=name, - documentation=[doc_type] - ) - - def _handle_destructuring_pattern(self, node, file_path: str, scope_stack: List[str]) -> None: - """Handle destructuring patterns like { v4: uuidv4 }""" - for child in node.children: - if child.type == 'shorthand_property_identifier_pattern': - # Simple destructuring like { prop } - var_name = self._get_node_text(child) - if var_name: - self._register_variable_symbol(child, var_name, file_path, scope_stack) - elif child.type == 'pair_pattern': - # Renamed destructuring like { v4: uuidv4 } - for pair_child in child.children: - if pair_child.type == 'identifier': - var_name = self._get_node_text(pair_child) - if var_name: - self._register_variable_symbol(pair_child, var_name, file_path, scope_stack) - - def _handle_import_statement(self, var_name: str, call_node, file_path: str, scope_stack: List[str]) -> None: - """Handle import statements like const lib = require('module')""" - # Check if this is a require() call - callee = None - module_name = None - - for child in call_node.children: - if child.type == 'identifier': - callee = self._get_node_text(child) - elif child.type == 'arguments': - # Get the module name from arguments - for arg in child.children: - if arg.type == 'string': - module_name = self._get_node_text(arg).strip('"\'') - break - - if callee == 'require' and module_name: - # Classify dependency type - self._classify_and_store_dependency(module_name) - - # Create SCIP standard symbol ID - local_id = ".".join(scope_stack + [var_name]) if scope_stack else var_name - symbol_id = f"local {local_id}(import)" - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Namespace, - display_name=var_name, - documentation=[f"Import from {module_name}"] - ) - - def _handle_assignment_for_document(self, node, file_path: str, scope_stack: List[str], relationships: Optional[Dict[str, List[tuple]]]) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: - """Handle assignment expressions for document generation""" - left_child = None - right_child = None - - for child in node.children: - if child.type == 'member_expression': - left_child = child - elif child.type in ['function_expression', 'arrow_function']: - right_child = child - - if left_child and right_child: - method_name = self._extract_member_expression_name(left_child) - if method_name: - symbol_id = self._create_function_symbol_id(method_name, file_path, scope_stack) - occurrence = self._create_function_occurrence(right_child, symbol_id) - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - symbol_info = self._create_function_symbol_info(right_child, symbol_id, method_name, scip_relationships) - return occurrence, symbol_info - - return None, None - - def _handle_lexical_for_document(self, node, file_path: str, scope_stack: List[str], relationships: Optional[Dict[str, List[tuple]]]) -> List[tuple]: - """Handle lexical declarations for document generation""" - results = [] - - for child in node.children: - if child.type == 'variable_declarator': - var_name = None - var_value = None - - for declarator_child in child.children: - if declarator_child.type == 'identifier': - var_name = self._get_node_text(declarator_child) - elif declarator_child.type in ['object_expression', 'new_expression', 'call_expression']: - var_value = declarator_child - - if var_name: - # Create occurrence and symbol info for variable - symbol_id = self._create_variable_symbol_id(var_name, file_path, scope_stack, var_value) - occurrence = self._create_variable_occurrence(child, symbol_id) - symbol_info = self._create_variable_symbol_info(child, symbol_id, var_name, var_value) - results.append((occurrence, symbol_info)) - - return results - - def _create_variable_symbol_id(self, name: str, file_path: str, scope_stack: List[str], value_node=None) -> str: - """Create symbol ID for variable.""" - # SCIP standard: local - local_id = ".".join(scope_stack + [name]) if scope_stack else name - - # Determine descriptor based on value type - descriptor = "." # Default for variables - if value_node: - if value_node.type == 'object_expression': - descriptor = "{}" - elif value_node.type == 'new_expression': - descriptor = "." # new expressions are still variables, not classes - elif value_node.type == 'call_expression': - # Check if it's a require call vs regular function call - is_require = False - for child in value_node.children: - if child.type == 'identifier' and hasattr(self, '_get_node_text'): - if self._get_node_text(child) == 'require': - is_require = True - break - descriptor = "(import)" if is_require else "." - - return f"local {local_id}{descriptor}" - - def _create_variable_occurrence(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for variable.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierConstant - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_variable_symbol_info(self, node, symbol_id: str, name: str, value_node=None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for variable.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - - # Determine kind based on value - correct classification - if value_node: - if value_node.type == 'object_expression': - symbol_info.kind = scip_pb2.Object - symbol_info.documentation.append("JavaScript object literal") - elif value_node.type == 'new_expression': - symbol_info.kind = scip_pb2.Variable # new expressions create variables, not classes - symbol_info.documentation.append("JavaScript instance variable") - elif value_node.type == 'call_expression': - symbol_info.kind = scip_pb2.Namespace - symbol_info.documentation.append("JavaScript import") - elif value_node.type == 'function_expression': - symbol_info.kind = scip_pb2.Function - symbol_info.documentation.append("JavaScript function variable") - else: - symbol_info.kind = scip_pb2.Variable - symbol_info.documentation.append("JavaScript variable") - else: - symbol_info.kind = scip_pb2.Variable - symbol_info.documentation.append("JavaScript variable") - - return symbol_info - - def _extract_functions_from_object(self, object_node, parent_name: str, file_path: str, scope_stack: List[str]) -> None: - """Extract functions from object expressions like { handler: function() {} }""" - for child in object_node.children: - if child.type == 'pair': - prop_name = None - prop_value = None - - for pair_child in child.children: - if pair_child.type in ['identifier', 'property_identifier']: - prop_name = self._get_node_text(pair_child) - elif pair_child.type in ['function_expression', 'arrow_function']: - prop_value = pair_child - - if prop_name and prop_value: - # Register function with context-aware name - func_scope = scope_stack + [parent_name] - self._register_function_symbol(prop_value, prop_name, file_path, func_scope) - - def _extract_functions_from_call_expression(self, call_node, parent_name: str, file_path: str, scope_stack: List[str]) -> None: - """Extract functions from call expressions arguments like rateLimit({ handler: function() {} })""" - for child in call_node.children: - if child.type == 'arguments': - for arg in child.children: - if arg.type == 'object_expression': - self._extract_functions_from_object(arg, parent_name, file_path, scope_stack) - elif arg.type in ['function_expression', 'arrow_function']: - # Anonymous function in call - give it a descriptive name - func_name = f"{parent_name}_callback" - self._register_function_symbol(arg, func_name, file_path, scope_stack) - - def _classify_and_store_dependency(self, module_name: str) -> None: - """Classify and store dependency based on module name.""" - # Standard Node.js built-in modules - node_builtins = { - 'fs', 'path', 'http', 'https', 'url', 'crypto', 'os', 'util', 'events', - 'stream', 'buffer', 'child_process', 'cluster', 'dgram', 'dns', 'net', - 'tls', 'zlib', 'readline', 'repl', 'vm', 'worker_threads', 'async_hooks' - } - - if module_name in node_builtins: - category = 'standard_library' - elif module_name.startswith('./') or module_name.startswith('../') or module_name.startswith('/'): - category = 'local' - else: - category = 'third_party' - - # Avoid duplicates - if module_name not in self.dependencies['imports'][category]: - self.dependencies['imports'][category].append(module_name) - - def get_dependencies(self) -> Dict[str, Any]: - """Get collected dependencies for MCP response.""" - return self.dependencies - - def _reset_dependencies(self) -> None: - """Reset dependency tracking for new file analysis.""" - self.dependencies = { - 'imports': { - 'standard_library': [], - 'third_party': [], - 'local': [] - } - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/objective_c_strategy.py b/src/code_index_mcp/scip/strategies/objective_c_strategy.py deleted file mode 100644 index c27dc87..0000000 --- a/src/code_index_mcp/scip/strategies/objective_c_strategy.py +++ /dev/null @@ -1,1083 +0,0 @@ -""" -Objective-C Strategy for SCIP indexing using libclang. - -This strategy uses libclang to parse Objective-C source files (.m, .mm, .h) -and extract symbol information following SCIP standards. -""" - -import logging -import os -from typing import List, Set, Optional, Tuple, Dict, Any -from pathlib import Path - -try: - import clang.cindex as clang - from clang.cindex import CursorKind, TypeKind - LIBCLANG_AVAILABLE = True -except ImportError: - LIBCLANG_AVAILABLE = False - clang = None - CursorKind = None - TypeKind = None - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - -logger = logging.getLogger(__name__) - - -class ObjectiveCStrategy(SCIPIndexerStrategy): - """SCIP indexing strategy for Objective-C using libclang.""" - - SUPPORTED_EXTENSIONS = {'.m', '.mm', '.h'} - - def __init__(self, priority: int = 95): - """Initialize the Objective-C strategy.""" - super().__init__(priority) - self._processed_symbols: Set[str] = set() - self._symbol_counter = 0 - self.project_path: Optional[str] = None - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - if not LIBCLANG_AVAILABLE: - logger.warning("libclang not available for Objective-C processing") - return False - return extension.lower() in self.SUPPORTED_EXTENSIONS - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "objc" - - def is_available(self) -> bool: - """Check if this strategy is available.""" - return LIBCLANG_AVAILABLE - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from Objective-C files.""" - logger.debug(f"ObjectiveCStrategy Phase 1: Processing {len(files)} files for symbol collection") - - # Store project path for use in import classification - self.project_path = project_path - - processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - self._collect_symbols_from_file(file_path, project_path) - processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - - except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") - - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 3: Generate complete SCIP documents with resolved references.""" - documents = [] - logger.debug(f"ObjectiveCStrategy Phase 3: Generating documents for {len(files)} files") - processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - document = self._analyze_objc_file(file_path, project_path, relationships) - if document: - documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) - processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 3 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - - except Exception as e: - error_count += 1 - logger.error(f"Phase 3 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 3 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """Phase 2: Build relationships between Objective-C symbols.""" - logger.debug(f"ObjectiveCStrategy: Building symbol relationships for {len(files)} files") - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"ObjectiveCStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single Objective-C file using libclang.""" - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {os.path.relpath(file_path, project_path)}") - return - - try: - # Parse with libclang - index = clang.Index.create() - translation_unit = index.parse( - file_path, - args=['-ObjC', '-x', 'objective-c'], - options=clang.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD - ) - - if not translation_unit: - logger.debug(f"Parse failed: {os.path.relpath(file_path, project_path)}") - return - - # Reset processed symbols for each file - self._processed_symbols.clear() - self._symbol_counter = 0 - - # Traverse AST to collect symbols - relative_path = self._get_relative_path(file_path, project_path) - self._traverse_clang_ast_for_symbols(translation_unit.cursor, relative_path, content, file_path) - - # Extract imports/dependencies and register with symbol manager - self._extract_and_register_imports(translation_unit.cursor, file_path, project_path) - - logger.debug(f"Symbol collection completed - {relative_path}") - - except Exception as e: - logger.error(f"Error processing {file_path} with libclang: {e}") - - def _extract_and_register_imports(self, cursor: 'clang.Cursor', file_path: str, project_path: str) -> None: - """Extract imports from AST and register them with the symbol manager.""" - try: - # Traverse AST to find all import statements - self._traverse_ast_for_import_registration(cursor, file_path, project_path) - - except Exception as e: - logger.error(f"Error extracting imports from {file_path}: {e}") - - def _traverse_ast_for_import_registration(self, cursor: 'clang.Cursor', file_path: str, project_path: str) -> None: - """Traverse AST specifically to register imports with the symbol manager.""" - try: - # Process current cursor for import registration - if cursor.kind == CursorKind.INCLUSION_DIRECTIVE: - self._register_import_with_symbol_manager(cursor, file_path, project_path) - - # Recursively process children - for child in cursor.get_children(): - self._traverse_ast_for_import_registration(child, file_path, project_path) - - except Exception as e: - logger.error(f"Error traversing AST for import registration: {e}") - - def _register_import_with_symbol_manager(self, cursor: 'clang.Cursor', file_path: str, project_path: str) -> None: - """Register a single import with the symbol manager.""" - try: - # Try to get the included file path - include_path = None - framework_name = None - - # Method 1: Try to get the included file (may fail for system headers) - try: - included_file = cursor.get_included_file() - if included_file: - include_path = str(included_file) - logger.debug(f"Got include path from file: {include_path}") - except Exception as e: - logger.debug(f"Failed to get included file: {e}") - - # Method 2: Try to get from cursor spelling (the actual #import statement) - spelling = cursor.spelling - if spelling: - logger.debug(f"Got cursor spelling: {spelling}") - # Extract framework name from spelling like "Foundation/Foundation.h" or "Person.h" - framework_name = self._extract_framework_name_from_spelling(spelling) - if framework_name: - logger.debug(f"Extracted framework name from spelling: {framework_name}") - - # Classify based on spelling pattern - import_type = self._classify_import_from_spelling(spelling) - logger.debug(f"Classified import as: {import_type}") - - # Only register external dependencies (not local files) - if import_type in ['standard_library', 'third_party']: - if not self.symbol_manager: - logger.error("Symbol manager is None!") - return - - # Determine version if possible (for now, leave empty) - version = "" - - logger.debug(f"Registering external symbol: {framework_name}") - - # Register the import with the moniker manager - symbol_id = self.symbol_manager.create_external_symbol( - language="objc", - package_name=framework_name, - module_path=framework_name, - symbol_name="*", # Framework-level import - version=version, - alias=None - ) - - logger.debug(f"Registered external dependency: {framework_name} ({import_type}) -> {symbol_id}") - return - else: - logger.debug(f"Skipping local import: {framework_name} ({import_type})") - return - - # Method 3: Fallback to include_path if we have it - if include_path: - logger.debug(f"Processing include path: {include_path}") - - # Extract framework/module name - framework_name = self._extract_framework_name(include_path, cursor) - if not framework_name: - logger.debug(f"No framework name extracted from {include_path}") - return - - logger.debug(f"Extracted framework name: {framework_name}") - - # Classify the import type - import_type = self._classify_objc_import(include_path) - logger.debug(f"Classified import as: {import_type}") - - # Only register external dependencies (not local files) - if import_type in ['standard_library', 'third_party']: - if not self.symbol_manager: - logger.error("Symbol manager is None!") - return - - # Determine version if possible (for now, leave empty) - version = self._extract_framework_version(include_path) - - logger.debug(f"Registering external symbol: {framework_name}") - - # Register the import with the moniker manager - symbol_id = self.symbol_manager.create_external_symbol( - language="objc", - package_name=framework_name, - module_path=framework_name, - symbol_name="*", # Framework-level import - version=version, - alias=None - ) - - logger.debug(f"Registered external dependency: {framework_name} ({import_type}) -> {symbol_id}") - else: - logger.debug(f"Skipping local import: {framework_name} ({import_type})") - else: - logger.debug("No include path or spelling found for cursor") - - except Exception as e: - logger.error(f"Error registering import with symbol manager: {e}") - import traceback - logger.error(f"Traceback: {traceback.format_exc()}") - - def _extract_framework_name_from_spelling(self, spelling: str) -> Optional[str]: - """Extract framework name from cursor spelling.""" - try: - # Remove quotes and angle brackets - clean_spelling = spelling.strip('"<>') - - # For framework imports like "Foundation/Foundation.h" - if '/' in clean_spelling: - parts = clean_spelling.split('/') - if len(parts) >= 2: - framework_name = parts[0] - return framework_name - - # For simple includes like "MyHeader.h" - header_name = clean_spelling.replace('.h', '').replace('.m', '').replace('.mm', '') - return header_name - - except Exception as e: - logger.debug(f"Error extracting framework name from spelling {spelling}: {e}") - return None - - def _classify_import_from_spelling(self, spelling: str) -> str: - """Classify import based on spelling pattern.""" - try: - # Remove quotes and angle brackets - clean_spelling = spelling.strip('"<>') - - # Check if it's a known system framework by name (since cursor.spelling doesn't include brackets) - if '/' in clean_spelling: - framework_name = clean_spelling.split('/')[0] - system_frameworks = { - 'Foundation', 'UIKit', 'CoreData', 'CoreGraphics', 'QuartzCore', - 'AVFoundation', 'CoreLocation', 'MapKit', 'CoreAnimation', - 'Security', 'SystemConfiguration', 'CFNetwork', 'CoreFoundation', - 'AppKit', 'Cocoa', 'WebKit', 'JavaScriptCore', 'Metal', 'MetalKit', - 'GameplayKit', 'SpriteKit', 'SceneKit', 'ARKit', 'Vision', 'CoreML' - } - if framework_name in system_frameworks: - return 'standard_library' - - # Check for single framework names (like just "Foundation.h") - framework_name_only = clean_spelling.replace('.h', '').replace('.framework', '') - system_frameworks = { - 'Foundation', 'UIKit', 'CoreData', 'CoreGraphics', 'QuartzCore', - 'AVFoundation', 'CoreLocation', 'MapKit', 'CoreAnimation', - 'Security', 'SystemConfiguration', 'CFNetwork', 'CoreFoundation', - 'AppKit', 'Cocoa', 'WebKit', 'JavaScriptCore', 'Metal', 'MetalKit', - 'GameplayKit', 'SpriteKit', 'SceneKit', 'ARKit', 'Vision', 'CoreML' - } - if framework_name_only in system_frameworks: - return 'standard_library' - - # Angle brackets indicate system headers (if we had them) - if spelling.startswith('<') and spelling.endswith('>'): - return 'standard_library' - - # Quotes indicate local or third-party headers - elif spelling.startswith('"') and spelling.endswith('"'): - # Check for common third-party patterns - if any(pattern in clean_spelling.lower() for pattern in ['pods/', 'carthage/', 'node_modules/']): - return 'third_party' - - # Default for quoted imports - return 'local' - - # Check for common third-party patterns in the path - if any(pattern in clean_spelling.lower() for pattern in ['pods/', 'carthage/', 'node_modules/']): - return 'third_party' - - # Check if it looks like a local header (simple filename) - if '/' not in clean_spelling and clean_spelling.endswith('.h'): - return 'local' - - # Fallback: if it contains system-like paths, classify as standard_library - if any(pattern in clean_spelling.lower() for pattern in ['/system/', '/usr/', '/applications/xcode']): - return 'standard_library' - - # Default fallback - return 'local' - - except Exception as e: - logger.debug(f"Error classifying import from spelling {spelling}: {e}") - return 'local' - - def _extract_framework_version(self, include_path: str) -> str: - """Extract framework version from include path if available.""" - # For now, return empty string. Could be enhanced to detect versions - # from CocoaPods Podfile.lock, Carthage, or other dependency managers - return "" - - def _analyze_objc_file(self, file_path: str, project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> Optional[scip_pb2.Document]: - """Analyze a single Objective-C file and generate complete SCIP document.""" - content = self._read_file_content(file_path) - if not content: - return None - - try: - # Parse with libclang - index = clang.Index.create() - translation_unit = index.parse( - file_path, - args=['-ObjC', '-x', 'objective-c'], - options=clang.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD - ) - - if not translation_unit: - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = self._get_relative_path(file_path, project_path) - document.language = self._get_document_language(file_path) - - # Initialize position calculator - self.position_calculator = PositionCalculator(content) - - # Reset processed symbols for each file - self._processed_symbols.clear() - self._symbol_counter = 0 - - # Generate occurrences and symbols - occurrences = [] - symbols = [] - - # Traverse AST for document generation - self._traverse_clang_ast_for_document(translation_unit.cursor, content, occurrences, symbols, relationships) - - # Add results to document - document.occurrences.extend(occurrences) - document.symbols.extend(symbols) - - logger.debug(f"Analyzed Objective-C file {document.relative_path}: " - f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document - - except Exception as e: - logger.error(f"Error analyzing {file_path} with libclang: {e}") - return None - - def _traverse_clang_ast_for_symbols(self, cursor: 'clang.Cursor', file_path: str, content: str, full_file_path: str) -> None: - """Traverse libclang AST for symbol definitions (Phase 1).""" - try: - # Process current cursor - self._process_cursor_for_symbols(cursor, file_path, content, full_file_path) - - # Recursively process children - for child in cursor.get_children(): - self._traverse_clang_ast_for_symbols(child, file_path, content, full_file_path) - - except Exception as e: - logger.error(f"Error traversing AST for symbols: {e}") - - def _traverse_clang_ast_for_imports(self, cursor: 'clang.Cursor', file_path: str, imports: 'ImportGroup') -> None: - """Traverse libclang AST specifically for import/include statements.""" - try: - # Process current cursor for imports - self._process_cursor_for_imports(cursor, file_path, imports) - - # Recursively process children - for child in cursor.get_children(): - self._traverse_clang_ast_for_imports(child, file_path, imports) - - except Exception as e: - logger.error(f"Error traversing AST for imports: {e}") - - def _traverse_clang_ast_for_document(self, cursor: 'clang.Cursor', content: str, occurrences: List, symbols: List, relationships: Optional[Dict[str, List[tuple]]] = None) -> None: - """Traverse libclang AST for document generation (Phase 3).""" - try: - # Process current cursor - self._process_cursor_for_document(cursor, content, occurrences, symbols, relationships) - - # Recursively process children - for child in cursor.get_children(): - self._traverse_clang_ast_for_document(child, content, occurrences, symbols, relationships) - - except Exception as e: - logger.error(f"Error traversing AST for document: {e}") - - def _process_cursor_for_symbols(self, cursor: 'clang.Cursor', file_path: str, content: str, full_file_path: str) -> None: - """Process a cursor for symbol registration (Phase 1).""" - try: - # Skip invalid cursors or those outside our file - if not cursor.location.file or cursor.spelling == "": - return - - # Check if cursor is in the file we're processing - cursor_file = str(cursor.location.file) - if not cursor_file.endswith(os.path.basename(full_file_path)): - return - - cursor_kind = cursor.kind - symbol_name = cursor.spelling - - # Map libclang cursor kinds to SCIP symbols - symbol_info = self._map_cursor_to_symbol(cursor, symbol_name) - if not symbol_info: - return - - symbol_id, symbol_kind, symbol_roles = symbol_info - - # Avoid duplicates - duplicate_key = f"{symbol_id}:{cursor.location.line}:{cursor.location.column}" - if duplicate_key in self._processed_symbols: - return - self._processed_symbols.add(duplicate_key) - - # Calculate position - location = cursor.location - if location.line is not None and location.column is not None: - # libclang uses 1-based indexing, convert to 0-based - line = location.line - 1 - column = location.column - 1 - - # Calculate end position (approximate) - end_line = line - end_column = column + len(symbol_name) - - # Register symbol with reference resolver - if self.position_calculator: - range_obj = self.position_calculator.line_col_to_range(line, column, end_line, end_column) - else: - # Create a simple range object if position_calculator is not available - from ..proto.scip_pb2 import Range - range_obj = Range() - range_obj.start.extend([line, column]) - range_obj.end.extend([end_line, end_column]) - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=range_obj, - symbol_kind=symbol_kind, - display_name=symbol_name, - documentation=[f"Objective-C {cursor_kind.name}"] - ) - - logger.debug(f"Registered Objective-C symbol: {symbol_name} ({cursor_kind.name}) at {line}:{column}") - - except Exception as e: - logger.error(f"Error processing cursor for symbols {cursor.spelling}: {e}") - - def _process_cursor_for_document(self, cursor: 'clang.Cursor', content: str, occurrences: List, symbols: List, relationships: Optional[Dict[str, List[tuple]]] = None) -> None: - """Process a cursor for document generation (Phase 3).""" - try: - # Skip invalid cursors or those outside our file - if not cursor.location.file or cursor.spelling == "": - return - - cursor_kind = cursor.kind - symbol_name = cursor.spelling - - # Map libclang cursor kinds to SCIP symbols - symbol_info = self._map_cursor_to_symbol(cursor, symbol_name) - if not symbol_info: - return - - symbol_id, symbol_kind, symbol_roles = symbol_info - - # Avoid duplicates - duplicate_key = f"{symbol_id}:{cursor.location.line}:{cursor.location.column}" - if duplicate_key in self._processed_symbols: - return - self._processed_symbols.add(duplicate_key) - - # Calculate position - location = cursor.location - if location.line is not None and location.column is not None: - # libclang uses 1-based indexing, convert to 0-based - line = location.line - 1 - column = location.column - 1 - - # Calculate end position (approximate) - end_line = line - end_column = column + len(symbol_name) - - # Create SCIP occurrence - occurrence = self._create_occurrence(symbol_id, line, column, end_line, end_column, symbol_roles) - if occurrence: - occurrences.append(occurrence) - - # Get relationships for this symbol - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - # Create SCIP symbol information with relationships - symbol_info_obj = self._create_symbol_information_with_relationships(symbol_id, symbol_name, symbol_kind, scip_relationships) - if symbol_info_obj: - symbols.append(symbol_info_obj) - - logger.debug(f"Added Objective-C symbol: {symbol_name} ({cursor_kind.name}) at {line}:{column} with {len(scip_relationships)} relationships") - - except Exception as e: - logger.error(f"Error processing cursor for document {cursor.spelling}: {e}") - - def _process_cursor_for_imports(self, cursor: 'clang.Cursor', file_path: str, imports: 'ImportGroup') -> None: - """Process a cursor for import/include statements.""" - try: - # Skip invalid cursors or those outside our file - if not cursor.location.file: - return - - cursor_kind = cursor.kind - - # Process inclusion directives (#import, #include, @import) - if cursor_kind == CursorKind.INCLUSION_DIRECTIVE: - self._process_inclusion_directive(cursor, file_path, imports) - - except Exception as e: - logger.error(f"Error processing cursor for imports: {e}") - - def _process_inclusion_directive(self, cursor: 'clang.Cursor', file_path: str, imports: 'ImportGroup') -> None: - """Process a single #import/#include/@import directive.""" - try: - # Get the included file - included_file = cursor.get_included_file() - if not included_file: - return - - include_path = str(included_file) - - # Extract framework/module name - framework_name = self._extract_framework_name(include_path, cursor) - if not framework_name: - return - - # Classify the import type - import_type = self._classify_objc_import(include_path) - - # Add to imports - imports.add_import(framework_name, import_type) - - # Register with moniker manager for external dependencies - if import_type in ['standard_library', 'third_party'] and self.symbol_manager: - self._register_framework_dependency(framework_name, import_type, include_path) - - logger.debug(f"Processed import: {framework_name} ({import_type}) from {include_path}") - - except Exception as e: - logger.error(f"Error processing inclusion directive: {e}") - - def _extract_framework_name(self, include_path: str, cursor: 'clang.Cursor') -> Optional[str]: - """Extract framework/module name from include path.""" - try: - # Get the original spelling from the cursor (what was actually written) - spelling = cursor.spelling - if spelling: - # Remove quotes and angle brackets - clean_spelling = spelling.strip('"<>') - - # For framework imports like - if '/' in clean_spelling: - parts = clean_spelling.split('/') - if len(parts) >= 2: - framework_name = parts[0] - # Common iOS/macOS frameworks - if framework_name in ['Foundation', 'UIKit', 'CoreData', 'CoreGraphics', - 'QuartzCore', 'AVFoundation', 'CoreLocation', 'MapKit']: - return framework_name - # For other frameworks, use the framework name - return framework_name - - # For simple includes like "MyHeader.h" - header_name = clean_spelling.replace('.h', '').replace('.m', '').replace('.mm', '') - return header_name - - # Fallback: extract from full path - if '/' in include_path: - path_parts = include_path.split('/') - - # Look for .framework in path - for i, part in enumerate(path_parts): - if part.endswith('.framework') and i + 1 < len(path_parts): - return part.replace('.framework', '') - - # Look for Headers directory (common in frameworks) - if 'Headers' in path_parts: - headers_idx = path_parts.index('Headers') - if headers_idx > 0: - framework_part = path_parts[headers_idx - 1] - if framework_part.endswith('.framework'): - return framework_part.replace('.framework', '') - - # Use the filename without extension - filename = path_parts[-1] - return filename.replace('.h', '').replace('.m', '').replace('.mm', '') - - return None - - except Exception as e: - logger.debug(f"Error extracting framework name from {include_path}: {e}") - return None - - def _classify_objc_import(self, include_path: str) -> str: - """Classify Objective-C import as system, third-party, or local.""" - try: - # System frameworks (typical macOS/iOS system paths) - system_indicators = [ - '/Applications/Xcode.app/', - '/System/Library/', - '/usr/include/', - 'Platforms/iPhoneOS.platform/', - 'Platforms/iPhoneSimulator.platform/', - 'Platforms/MacOSX.platform/' - ] - - for indicator in system_indicators: - if indicator in include_path: - return 'standard_library' - - # Common system frameworks by name - system_frameworks = { - 'Foundation', 'UIKit', 'CoreData', 'CoreGraphics', 'QuartzCore', - 'AVFoundation', 'CoreLocation', 'MapKit', 'CoreAnimation', - 'Security', 'SystemConfiguration', 'CFNetwork', 'CoreFoundation', - 'AppKit', 'Cocoa', 'WebKit', 'JavaScriptCore' - } - - for framework in system_frameworks: - if f'/{framework}.framework/' in include_path or f'{framework}/' in include_path: - return 'standard_library' - - # Third-party dependency managers - third_party_indicators = [ - '/Pods/', # CocoaPods - '/Carthage/', # Carthage - '/node_modules/', # React Native - '/DerivedData/', # Sometimes used for third-party - ] - - for indicator in third_party_indicators: - if indicator in include_path: - return 'third_party' - - # Check if it's within the project directory - if hasattr(self, 'project_path') and self.project_path: - if include_path.startswith(str(self.project_path)): - return 'local' - - # Check for relative paths (usually local) - if include_path.startswith('./') or include_path.startswith('../'): - return 'local' - - # If path contains common local indicators - if any(indicator in include_path.lower() for indicator in ['src/', 'source/', 'include/', 'headers/']): - return 'local' - - # Default to third-party for unknown external dependencies - return 'third_party' - - except Exception as e: - logger.debug(f"Error classifying import {include_path}: {e}") - return 'third_party' - - def _register_framework_dependency(self, framework_name: str, import_type: str, include_path: str) -> None: - """Register framework dependency with moniker manager.""" - try: - if not self.symbol_manager: - return - - # Determine package manager based on import type and path - if import_type == 'standard_library': - manager = 'system' - elif '/Pods/' in include_path: - manager = 'cocoapods' - elif '/Carthage/' in include_path: - manager = 'carthage' - else: - manager = 'unknown' - - # Register the external symbol for the framework - self.symbol_manager.create_external_symbol( - language="objc", - package_name=framework_name, - module_path=framework_name, - symbol_name="*", # Framework-level import - version="", # Version detection could be added later - alias=None - ) - - logger.debug(f"Registered framework dependency: {framework_name} via {manager}") - - except Exception as e: - logger.error(f"Error registering framework dependency {framework_name}: {e}") - - def _map_cursor_to_symbol(self, cursor: 'clang.Cursor', symbol_name: str) -> Optional[Tuple[str, int, int]]: - """Map libclang cursor to SCIP symbol information.""" - try: - cursor_kind = cursor.kind - - # Map Objective-C specific cursors - if cursor_kind == CursorKind.OBJC_INTERFACE_DECL: - # @interface ClassName - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Class, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_PROTOCOL_DECL: - # @protocol ProtocolName - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Interface, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_CATEGORY_DECL: - # @interface ClassName (CategoryName) - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Class, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_INSTANCE_METHOD_DECL: - # Instance method: - (void)methodName - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Method, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_CLASS_METHOD_DECL: - # Class method: + (void)methodName - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Method, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_PROPERTY_DECL: - # @property declaration - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Property, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_IVAR_DECL: - # Instance variable - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Field, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_IMPLEMENTATION_DECL: - # @implementation ClassName - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Class, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.OBJC_CATEGORY_IMPL_DECL: - # @implementation ClassName (CategoryName) - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Class, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.FUNCTION_DECL: - # Regular C function - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Function, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.VAR_DECL: - # Variable declaration - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.Variable, scip_pb2.SymbolRole.Definition) - - elif cursor_kind == CursorKind.TYPEDEF_DECL: - # Type definition - symbol_id = f"local {self._get_local_id_for_cursor(cursor)}" - return (symbol_id, scip_pb2.SymbolKind.TypeParameter, scip_pb2.SymbolRole.Definition) - - # Add more cursor mappings as needed - return None - - except Exception as e: - logger.error(f"Error mapping cursor {symbol_name}: {e}") - return None - - def _get_local_id(self) -> str: - """Generate unique local symbol ID.""" - self._symbol_counter += 1 - return f"objc_{self._symbol_counter}" - - def _get_local_id_for_cursor(self, cursor: 'clang.Cursor') -> str: - """Generate consistent local symbol ID based on cursor properties.""" - # Create deterministic ID based on cursor type, name, and location - cursor_type = cursor.kind.name.lower() - symbol_name = cursor.spelling or "unnamed" - line = cursor.location.line - - return f"{cursor_type}_{symbol_name}_{line}" - - def _create_occurrence(self, symbol_id: str, start_line: int, start_col: int, - end_line: int, end_col: int, symbol_roles: int) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence.""" - try: - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = symbol_roles - occurrence.range.start.extend([start_line, start_col]) - occurrence.range.end.extend([end_line, end_col]) - - return occurrence - - except Exception as e: - logger.error(f"Error creating occurrence: {e}") - return None - - def _create_symbol_information(self, symbol_id: str, display_name: str, symbol_kind: int) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information.""" - try: - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.kind = symbol_kind - symbol_info.display_name = display_name - - return symbol_info - - except Exception as e: - logger.error(f"Error creating symbol information: {e}") - return None - - def _create_symbol_information_with_relationships(self, symbol_id: str, display_name: str, symbol_kind: int, relationships: List['scip_pb2.Relationship']) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information with relationships.""" - try: - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.kind = symbol_kind - symbol_info.display_name = display_name - - # Add relationships if provided - if relationships: - symbol_info.relationships.extend(relationships) - - return symbol_info - - except Exception as e: - logger.error(f"Error creating symbol information with relationships: {e}") - return None - - def _extract_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract relationships from a single Objective-C file using libclang.""" - content = self._read_file_content(file_path) - if not content: - return {} - - try: - # Parse with libclang - index = clang.Index.create() - translation_unit = index.parse( - file_path, - args=['-ObjC', '-x', 'objective-c'], - options=clang.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD - ) - - if not translation_unit: - return {} - - return self._extract_relationships_from_ast(translation_unit.cursor, file_path, project_path) - - except Exception as e: - logger.error(f"Error extracting relationships from {file_path}: {e}") - return {} - - def _extract_relationships_from_ast(self, cursor: 'clang.Cursor', file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract relationships from libclang AST.""" - relationships = {} - relative_path = self._get_relative_path(file_path, project_path) - - # Track current method context for method calls - current_method_symbol = None - - def traverse_for_relationships(cursor_node, parent_method=None): - """Recursively traverse AST to find relationships.""" - nonlocal current_method_symbol - - try: - # Skip if cursor is not in our file - if not cursor_node.location.file or cursor_node.spelling == "": - pass - else: - cursor_file = str(cursor_node.location.file) - if cursor_file.endswith(os.path.basename(file_path)): - cursor_kind = cursor_node.kind - - # Track method context - if cursor_kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - method_symbol_id = f"local {self._get_local_id_for_cursor(cursor_node)}" - current_method_symbol = method_symbol_id - parent_method = method_symbol_id - - # Detect Objective-C method calls - elif cursor_kind == CursorKind.OBJC_MESSAGE_EXPR: - if parent_method: - # Get the method being called - called_method = self._extract_method_from_message_expr(cursor_node) - if called_method: - target_symbol_id = f"local objc_call_{called_method}_{cursor_node.location.line}" - - if parent_method not in relationships: - relationships[parent_method] = [] - relationships[parent_method].append((target_symbol_id, InternalRelationshipType.CALLS)) - - logger.debug(f"Found method call: {parent_method} -> {target_symbol_id}") - - # Detect C function calls - elif cursor_kind == CursorKind.CALL_EXPR: - if parent_method: - function_name = cursor_node.spelling - if function_name: - target_symbol_id = f"local c_func_{function_name}_{cursor_node.location.line}" - - if parent_method not in relationships: - relationships[parent_method] = [] - relationships[parent_method].append((target_symbol_id, InternalRelationshipType.CALLS)) - - logger.debug(f"Found function call: {parent_method} -> {target_symbol_id}") - - # Recursively process children - for child in cursor_node.get_children(): - traverse_for_relationships(child, parent_method) - - except Exception as e: - logger.error(f"Error processing cursor for relationships: {e}") - - # Start traversal - traverse_for_relationships(cursor) - - return relationships - - def _extract_method_from_message_expr(self, cursor: 'clang.Cursor') -> Optional[str]: - """Extract method name from Objective-C message expression.""" - try: - # Get the selector/method name from the message expression - # This is a simplified extraction - could be enhanced - for child in cursor.get_children(): - if child.kind == CursorKind.OBJC_MESSAGE_EXPR: - return child.spelling - elif child.spelling and len(child.spelling) > 0: - # Try to get method name from any meaningful child - return child.spelling - - # Fallback: use the cursor's own spelling if available - return cursor.spelling if cursor.spelling else None - - except Exception as e: - logger.error(f"Error extracting method from message expression: {e}") - return None - - def _create_scip_relationships(self, relationships: List[tuple]) -> List['scip_pb2.Relationship']: - """Convert internal relationships to SCIP relationships.""" - scip_relationships = [] - - for target_symbol, relationship_type in relationships: - try: - relationship = scip_pb2.Relationship() - relationship.symbol = target_symbol - - # Map relationship type to SCIP flags - if relationship_type == InternalRelationshipType.CALLS: - relationship.is_reference = True - elif relationship_type == InternalRelationshipType.INHERITS: - relationship.is_reference = True - elif relationship_type == InternalRelationshipType.IMPLEMENTS: - relationship.is_implementation = True - else: - relationship.is_reference = True # Default fallback - - scip_relationships.append(relationship) - - except Exception as e: - logger.error(f"Error creating SCIP relationship: {e}") - continue - - return scip_relationships - - def _get_document_language(self, file_path: str) -> str: - """Get the document language identifier.""" - if file_path.endswith('.mm'): - return 'objcpp' - return 'objc' - - # Utility methods from base strategy - def _read_file_content(self, file_path: str) -> Optional[str]: - """Read file content safely.""" - try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - return f.read() - except Exception as e: - logger.warning(f"Failed to read file {file_path}: {e}") - return None - - def _get_relative_path(self, file_path: str, project_path: str) -> str: - """Get relative path from project root.""" - return os.path.relpath(file_path, project_path).replace(os.sep, '/') - - def get_supported_languages(self) -> List[str]: - """Return list of supported language identifiers.""" - return ["objective-c", "objc", "objective-c-header"] - - -class StrategyError(Exception): - """Exception raised when a strategy cannot process files.""" - pass \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/python_strategy.py b/src/code_index_mcp/scip/strategies/python_strategy.py deleted file mode 100644 index b14da42..0000000 --- a/src/code_index_mcp/scip/strategies/python_strategy.py +++ /dev/null @@ -1,413 +0,0 @@ -"""Python SCIP indexing strategy - SCIP standard compliant.""" - -import ast -import logging -import os -from typing import List, Optional, Dict, Any, Set -from pathlib import Path - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - - -logger = logging.getLogger(__name__) - - -class PythonStrategy(SCIPIndexerStrategy): - """SCIP-compliant Python indexing strategy using AST analysis.""" - - SUPPORTED_EXTENSIONS = {'.py', '.pyw'} - - def __init__(self, priority: int = 90): - """Initialize the Python strategy.""" - super().__init__(priority) - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - return extension.lower() in self.SUPPORTED_EXTENSIONS - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "python" - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from Python files.""" - logger.debug(f"PythonStrategy Phase 1: Processing {len(files)} files for symbol collection") - processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - self._collect_symbols_from_file(file_path, project_path) - processed_count += 1 - - if i % 10 == 0 or i == len(files): # Progress every 10 files or at end - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - - except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") - - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" - documents = [] - logger.debug(f"PythonStrategy Phase 2: Generating documents for {len(files)} files") - processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - document = self._analyze_python_file(file_path, project_path, relationships) - if document: - documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) - processed_count += 1 - - if i % 10 == 0 or i == len(files): # Progress every 10 files or at end - logger.debug(f"Phase 2 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - - except Exception as e: - error_count += 1 - logger.error(f"Phase 2 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 2 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between Python symbols. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"PythonStrategy: Building symbol relationships for {len(files)} files") - - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"PythonStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single Python file.""" - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {os.path.relpath(file_path, project_path)}") - return - - # Parse AST - try: - tree = ast.parse(content, filename=file_path) - except SyntaxError as e: - logger.warning(f"Syntax error in {os.path.relpath(file_path, project_path)}: {e}") - return - - # Collect symbols using integrated visitor - relative_path = self._get_relative_path(file_path, project_path) - self._collect_symbols_from_ast(tree, relative_path, content) - logger.debug(f"Symbol collection - {relative_path}") - - def _analyze_python_file(self, file_path: str, project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> Optional[scip_pb2.Document]: - """Analyze a single Python file and generate complete SCIP document.""" - relative_path = self._get_relative_path(file_path, project_path) - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {relative_path}") - return None - - # Parse AST - try: - tree = ast.parse(content, filename=file_path) - except SyntaxError as e: - logger.warning(f"Syntax error in {relative_path}: {e}") - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = relative_path - document.language = self.get_language_name() - - # Analyze AST and generate occurrences - self.position_calculator = PositionCalculator(content) - - occurrences, symbols = self._analyze_ast_for_document(tree, relative_path, content, relationships) - - # Add results to document - document.occurrences.extend(occurrences) - document.symbols.extend(symbols) - - logger.debug(f"Document analysis - {relative_path}: " - f"-> {len(document.occurrences)} occurrences, {len(document.symbols)} symbols") - - return document - - def _extract_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """ - Extract relationships from a single Python file. - - Args: - file_path: File to analyze - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - content = self._read_file_content(file_path) - if not content: - return {} - - try: - tree = ast.parse(content) - except SyntaxError as e: - logger.warning(f"Syntax error in {file_path}: {e}") - return {} - - return self._extract_relationships_from_ast(tree, file_path, project_path) - - def _collect_symbols_from_ast(self, tree: ast.AST, file_path: str, content: str) -> None: - """Collect symbols from AST using integrated visitor.""" - scope_stack = [] - - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef): - self._register_function_symbol(node, node.name, file_path, scope_stack) - elif isinstance(node, ast.ClassDef): - self._register_class_symbol(node, node.name, file_path, scope_stack) - - def _analyze_ast_for_document(self, tree: ast.AST, file_path: str, content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[List[scip_pb2.Occurrence], List[scip_pb2.SymbolInformation]]: - """Analyze AST to generate occurrences and symbols for SCIP document.""" - occurrences = [] - symbols = [] - scope_stack = [] - - # Simple implementation - can be enhanced later - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef): - symbol_id = self._create_function_symbol_id(node.name, file_path, scope_stack) - occurrence = self._create_function_occurrence(node, symbol_id) - # Get relationships for this symbol - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_function_symbol_info(node, symbol_id, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - elif isinstance(node, ast.ClassDef): - symbol_id = self._create_class_symbol_id(node.name, file_path, scope_stack) - occurrence = self._create_class_occurrence(node, symbol_id) - # Get relationships for this symbol - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_class_symbol_info(node, symbol_id, scip_relationships) - - if occurrence: - occurrences.append(occurrence) - if symbol_info: - symbols.append(symbol_info) - - return occurrences, symbols - - def _extract_relationships_from_ast(self, tree: ast.AST, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract relationships from AST.""" - relationships = {} - scope_stack = [] - - for node in ast.walk(tree): - if isinstance(node, ast.ClassDef): - # Extract inheritance relationships - relative_path = self._get_relative_path(file_path, project_path) - class_symbol_id = self._create_class_symbol_id(node.name, relative_path, scope_stack) - - for base in node.bases: - if isinstance(base, ast.Name): - parent_symbol_id = self._create_class_symbol_id(base.id, relative_path, scope_stack) - if class_symbol_id not in relationships: - relationships[class_symbol_id] = [] - relationships[class_symbol_id].append((parent_symbol_id, InternalRelationshipType.INHERITS)) - - elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef): - # Extract function call relationships - relative_path = self._get_relative_path(file_path, project_path) - function_symbol_id = self._create_function_symbol_id(node.name, relative_path, scope_stack) - - for child in ast.walk(node): - if isinstance(child, ast.Call): - if isinstance(child.func, ast.Name): - target_symbol_id = self._create_function_symbol_id(child.func.id, relative_path, scope_stack) - if function_symbol_id not in relationships: - relationships[function_symbol_id] = [] - relationships[function_symbol_id].append((target_symbol_id, InternalRelationshipType.CALLS)) - - return relationships - - # Helper methods - def _register_function_symbol(self, node: ast.AST, name: str, file_path: str, scope_stack: List[str]) -> None: - """Register a function symbol definition.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="()." - ) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Function, - display_name=name, - documentation=["Python function"] - ) - - def _register_class_symbol(self, node: ast.AST, name: str, file_path: str, scope_stack: List[str]) -> None: - """Register a class symbol definition.""" - symbol_id = self.symbol_manager.create_local_symbol( - language="python", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="#" - ) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Class, - display_name=name, - documentation=["Python class"] - ) - - def _create_function_symbol_id(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for function.""" - return self.symbol_manager.create_local_symbol( - language="python", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="()." - ) - - def _create_class_symbol_id(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for class.""" - return self.symbol_manager.create_local_symbol( - language="python", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="#" - ) - - def _create_function_occurrence(self, node: ast.AST, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for function.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierFunction - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_class_occurrence(self, node: ast.AST, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for class.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.ast_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierType - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_function_symbol_info(self, node: ast.AST, symbol_id: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for function.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = node.name - symbol_info.kind = scip_pb2.Function - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - symbol_info.documentation.append(docstring) - - # Add relationships if provided - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_class_symbol_info(self, node: ast.AST, symbol_id: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for class.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = node.name - symbol_info.kind = scip_pb2.Class - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - symbol_info.documentation.append(docstring) - - # Add relationships if provided - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info \ No newline at end of file diff --git a/src/code_index_mcp/scip/strategies/zig_strategy.py b/src/code_index_mcp/scip/strategies/zig_strategy.py deleted file mode 100644 index 4889454..0000000 --- a/src/code_index_mcp/scip/strategies/zig_strategy.py +++ /dev/null @@ -1,1086 +0,0 @@ -"""Zig SCIP indexing strategy - SCIP standard compliant.""" - -import logging -import os -import re -from typing import List, Optional, Dict, Any, Set -from pathlib import Path - -import tree_sitter -from tree_sitter_zig import language as zig_language - -from .base_strategy import SCIPIndexerStrategy, StrategyError -from ..proto import scip_pb2 -from ..core.position_calculator import PositionCalculator -from ..core.relationship_types import InternalRelationshipType - - -logger = logging.getLogger(__name__) - - -class ZigStrategy(SCIPIndexerStrategy): - """SCIP-compliant Zig indexing strategy.""" - - SUPPORTED_EXTENSIONS = {'.zig', '.zon'} - - def __init__(self, priority: int = 95): - """Initialize the Zig strategy.""" - super().__init__(priority) - - # Initialize parser - lang = tree_sitter.Language(zig_language()) - self.parser = tree_sitter.Parser(lang) - self.use_tree_sitter = True - - # Initialize dependency tracking - self.dependencies = { - 'imports': { - 'standard_library': [], - 'third_party': [], - 'local': [] - } - } - - def can_handle(self, extension: str, file_path: str) -> bool: - """Check if this strategy can handle the file type.""" - return extension.lower() in self.SUPPORTED_EXTENSIONS - - def get_language_name(self) -> str: - """Get the language name for SCIP symbol generation.""" - return "zig" - - def is_available(self) -> bool: - """Check if this strategy is available.""" - return self.use_tree_sitter and self.parser is not None - - def _collect_symbol_definitions(self, files: List[str], project_path: str) -> None: - """Phase 1: Collect all symbol definitions from Zig files.""" - logger.debug(f"ZigStrategy Phase 1: Processing {len(files)} files for symbol collection") - processed_count = 0 - error_count = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - self._collect_symbols_from_file(file_path, project_path) - processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 1 progress: {i}/{len(files)} files, last file: {relative_path}") - - except Exception as e: - error_count += 1 - logger.warning(f"Phase 1 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 1 summary: {processed_count} files processed, {error_count} errors") - - def _generate_documents_with_references(self, files: List[str], project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> List[scip_pb2.Document]: - """Phase 2: Generate complete SCIP documents with resolved references.""" - documents = [] - logger.debug(f"ZigStrategy Phase 2: Generating documents for {len(files)} files") - processed_count = 0 - error_count = 0 - total_occurrences = 0 - total_symbols = 0 - - for i, file_path in enumerate(files, 1): - relative_path = os.path.relpath(file_path, project_path) - - try: - document = self._analyze_zig_file(file_path, project_path, relationships) - if document: - documents.append(document) - total_occurrences += len(document.occurrences) - total_symbols += len(document.symbols) - processed_count += 1 - - if i % 10 == 0 or i == len(files): - logger.debug(f"Phase 2 progress: {i}/{len(files)} files, " - f"last file: {relative_path}, " - f"{len(document.occurrences) if document else 0} occurrences") - - except Exception as e: - error_count += 1 - logger.error(f"Phase 2 failed for {relative_path}: {e}") - continue - - logger.info(f"Phase 2 summary: {processed_count} documents generated, {error_count} errors, " - f"{total_occurrences} total occurrences, {total_symbols} total symbols") - - return documents - - def _collect_symbols_from_file(self, file_path: str, project_path: str) -> None: - """Collect symbol definitions from a single Zig file.""" - # Reset dependencies for this file - self._reset_dependencies() - - # Read file content - content = self._read_file_content(file_path) - if not content: - logger.debug(f"Empty file skipped: {os.path.relpath(file_path, project_path)}") - return - - relative_path = self._get_relative_path(file_path, project_path) - - if self.use_tree_sitter and self.parser: - # Parse with Tree-sitter - tree = self._parse_content(content) - if tree: - self._collect_symbols_from_tree_sitter(tree, relative_path, content) - # Register dependencies with symbol manager - self._register_dependencies_with_symbol_manager() - logger.debug(f"Tree-sitter symbol collection - {relative_path}, deps: {self._count_dependencies()}") - return - - raise StrategyError(f"Failed to parse {relative_path} with tree-sitter for symbol collection") - - def _analyze_zig_file(self, file_path: str, project_path: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> Optional[scip_pb2.Document]: - """Analyze a single Zig file and generate complete SCIP document.""" - # Read file content - content = self._read_file_content(file_path) - if not content: - return None - - # Create SCIP document - document = scip_pb2.Document() - document.relative_path = self._get_relative_path(file_path, project_path) - document.language = "zig" - - # Initialize position calculator - self.position_calculator = PositionCalculator(content) - - # Reset dependencies for this file - self._reset_dependencies() - - if self.use_tree_sitter and self.parser: - # Parse with Tree-sitter - tree = self._parse_content(content) - if tree: - occurrences, symbols = self._analyze_tree_sitter_for_document(tree, document.relative_path, content, relationships) - document.occurrences.extend(occurrences) - document.symbols.extend(symbols) - - # Add dependency information to symbols - self._add_dependency_info_to_symbols(document, content) - - logger.debug(f"Analyzed Zig file {document.relative_path}: " - f"{len(document.occurrences)} occurrences, {len(document.symbols)} symbols, " - f"dependencies: {self._count_dependencies()}") - return document - - raise StrategyError(f"Failed to parse {document.relative_path} with tree-sitter for document analysis") - - def _parse_content(self, content: str) -> Optional[tree_sitter.Tree]: - """Parse content with tree-sitter parser.""" - if not self.parser: - return None - - try: - content_bytes = content.encode('utf-8') - return self.parser.parse(content_bytes) - except Exception as e: - logger.error(f"Failed to parse content with tree-sitter: {e}") - return None - - def _build_symbol_relationships(self, files: List[str], project_path: str) -> Dict[str, List[tuple]]: - """ - Build relationships between Zig symbols. - - Args: - files: List of file paths to process - project_path: Project root path - - Returns: - Dictionary mapping symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - logger.debug(f"ZigStrategy: Building symbol relationships for {len(files)} files") - - all_relationships = {} - - for file_path in files: - try: - file_relationships = self._extract_relationships_from_file(file_path, project_path) - all_relationships.update(file_relationships) - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - - total_symbols_with_relationships = len(all_relationships) - total_relationships = sum(len(rels) for rels in all_relationships.values()) - - logger.debug(f"ZigStrategy: Built {total_relationships} relationships for {total_symbols_with_relationships} symbols") - return all_relationships - - def _extract_relationships_from_file(self, file_path: str, project_path: str) -> Dict[str, List[tuple]]: - """Extract relationships from a single Zig file.""" - content = self._read_file_content(file_path) - if not content: - return {} - - relative_path = self._get_relative_path(file_path, project_path) - - if self.use_tree_sitter and self.parser: - tree = self._parse_content(content) - if tree: - return self._extract_relationships_from_tree_sitter(tree, relative_path, content) - - raise StrategyError(f"Failed to parse {relative_path} with tree-sitter for relationship extraction") - - # Tree-sitter based methods - def _collect_symbols_from_tree_sitter(self, tree, file_path: str, content: str) -> None: - """Collect symbols using Tree-sitter AST.""" - scope_stack = [] - - def visit_node(node): - node_type = node.type - - # Function declarations - if node_type == 'function_declaration': - self._register_function_symbol_ts(node, file_path, scope_stack, content) - # Struct declarations - elif node_type == 'struct_declaration': - self._register_struct_symbol_ts(node, file_path, scope_stack, content) - # Enum declarations - elif node_type == 'enum_declaration': - self._register_enum_symbol_ts(node, file_path, scope_stack, content) - # Variable declarations (const/var) - elif node_type == 'variable_declaration': - self._register_variable_symbol_ts(node, file_path, scope_stack, content) - # Check if it contains an @import call - self._check_for_import_in_variable(node, file_path, scope_stack, content) - # Test declarations - elif node_type == 'test_declaration': - self._register_test_symbol_ts(node, file_path, scope_stack, content) - - # Recursively analyze child nodes - for child in node.children: - visit_node(child) - - visit_node(tree.root_node) - - def _analyze_tree_sitter_for_document(self, tree, file_path: str, content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[List[scip_pb2.Occurrence], List[scip_pb2.SymbolInformation]]: - """Analyze Tree-sitter AST to generate SCIP occurrences and symbols.""" - occurrences = [] - symbols = [] - scope_stack = [] - - def visit_node(node): - node_type = node.type - - # Process different node types - if node_type == 'function_declaration': - occ, sym = self._process_function_ts(node, file_path, scope_stack, content, relationships) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - elif node_type == 'struct_declaration': - occ, sym = self._process_struct_ts(node, file_path, scope_stack, content, relationships) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - elif node_type == 'enum_declaration': - occ, sym = self._process_enum_ts(node, file_path, scope_stack, content, relationships) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - elif node_type == 'variable_declaration': - occ, sym = self._process_variable_ts(node, file_path, scope_stack, content, relationships) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - elif node_type == 'test_declaration': - occ, sym = self._process_test_ts(node, file_path, scope_stack, content, relationships) - if occ: occurrences.append(occ) - if sym: symbols.append(sym) - elif node_type == 'builtin_function_call' and self._is_import_call(node): - # Handle @import() calls - self._handle_import_declaration(node, file_path, scope_stack, content) - elif node_type == 'identifier': - occ = self._process_identifier_ts(node, file_path, scope_stack, content) - if occ: occurrences.append(occ) - - # Recursively analyze child nodes - for child in node.children: - visit_node(child) - - visit_node(tree.root_node) - return occurrences, symbols - - def _extract_relationships_from_tree_sitter(self, tree, file_path: str, content: str) -> Dict[str, List[tuple]]: - """Extract relationships from Tree-sitter AST.""" - relationships = {} - scope_stack = [] - - def visit_node(node): - node_type = node.type - - if node_type in ['function_declaration', 'test_declaration']: - # Extract function call relationships within this function - function_name = self._get_function_name_ts(node, content) - if function_name: - function_symbol_id = self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [function_name], - descriptor="()." - ) - - # Find call expressions within this function - self._extract_calls_from_node_ts(node, function_symbol_id, relationships, file_path, scope_stack, content) - - # Recursively visit children - for child in node.children: - visit_node(child) - - visit_node(tree.root_node) - return relationships - - # Tree-sitter node processing methods (missing implementations) - def _register_function_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Register a function symbol definition.""" - name = self._get_function_name_ts(node, content) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="()." - ) - - # Create a dummy range for registration - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Function, - display_name=name, - documentation=["Zig function"] - ) - - def _register_struct_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Register a struct symbol definition.""" - name = self._get_struct_name_ts(node, content) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="#" - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Struct, - display_name=name, - documentation=["Zig struct"] - ) - - def _register_enum_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Register an enum symbol definition.""" - name = self._get_enum_name_ts(node, content) - if not name: - return - - symbol_id = self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="#" - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Enum, - display_name=name, - documentation=["Zig enum"] - ) - - def _register_variable_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Register a variable/constant symbol definition.""" - name = self._get_variable_name_ts(node, content) - if not name: - return - - # Determine if it's const or var - is_const = self._is_const_declaration(node) - symbol_kind = scip_pb2.Constant if is_const else scip_pb2.Variable - descriptor = "." - - symbol_id = self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor=descriptor - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=symbol_kind, - display_name=name, - documentation=["Zig constant" if is_const else "Zig variable"] - ) - - def _register_test_symbol_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Register a test symbol definition.""" - name = self._get_test_name_ts(node, content) - if not name: - name = "test" # Default name for unnamed tests - - symbol_id = self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="()." - ) - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Function, - display_name=name, - documentation=["Zig test"] - ) - - # Process methods for document generation - def _process_function_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: - """Process function for document generation.""" - name = self._get_function_name_ts(node, content) - if not name: - return None, None - - symbol_id = self._create_function_symbol_id_ts(name, file_path, scope_stack) - occurrence = self._create_function_occurrence_ts(node, symbol_id) - - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_function_symbol_info_ts(node, symbol_id, name, scip_relationships) - - return occurrence, symbol_info - - def _process_struct_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: - """Process struct for document generation.""" - name = self._get_struct_name_ts(node, content) - if not name: - return None, None - - symbol_id = self._create_struct_symbol_id_ts(name, file_path, scope_stack) - occurrence = self._create_struct_occurrence_ts(node, symbol_id) - - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_struct_symbol_info_ts(node, symbol_id, name, scip_relationships) - - return occurrence, symbol_info - - def _process_enum_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: - """Process enum for document generation.""" - name = self._get_enum_name_ts(node, content) - if not name: - return None, None - - symbol_id = self._create_enum_symbol_id_ts(name, file_path, scope_stack) - occurrence = self._create_enum_occurrence_ts(node, symbol_id) - - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_enum_symbol_info_ts(node, symbol_id, name, scip_relationships) - - return occurrence, symbol_info - - def _process_variable_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: - """Process variable/constant for document generation.""" - name = self._get_variable_name_ts(node, content) - if not name: - return None, None - - symbol_id = self._create_variable_symbol_id_ts(name, file_path, scope_stack, node) - occurrence = self._create_variable_occurrence_ts(node, symbol_id) - - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_variable_symbol_info_ts(node, symbol_id, name, scip_relationships) - - return occurrence, symbol_info - - def _process_test_ts(self, node, file_path: str, scope_stack: List[str], content: str, relationships: Optional[Dict[str, List[tuple]]] = None) -> tuple[Optional[scip_pb2.Occurrence], Optional[scip_pb2.SymbolInformation]]: - """Process test for document generation.""" - name = self._get_test_name_ts(node, content) or "test" - - symbol_id = self._create_test_symbol_id_ts(name, file_path, scope_stack) - occurrence = self._create_test_occurrence_ts(node, symbol_id) - - symbol_relationships = relationships.get(symbol_id, []) if relationships else [] - scip_relationships = self._create_scip_relationships(symbol_relationships) if symbol_relationships else [] - - symbol_info = self._create_test_symbol_info_ts(node, symbol_id, name, scip_relationships) - - return occurrence, symbol_info - - def _process_identifier_ts(self, node, file_path: str, scope_stack: List[str], content: str) -> Optional[scip_pb2.Occurrence]: - """Process identifier for references.""" - name = self._get_node_text_ts(node) - if not name: - return None - - # Create a reference occurrence - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = f"local {name}" # Simple reference - occurrence.symbol_roles = scip_pb2.ReadAccess - occurrence.syntax_kind = scip_pb2.IdentifierLocal - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - # Helper methods for extracting names from Tree-sitter nodes - def _get_function_name_ts(self, node, content: str) -> Optional[str]: - """Extract function name from function node.""" - for child in node.children: - if child.type == "identifier": - return self._get_node_text_ts(child) - return None - - def _get_struct_name_ts(self, node, content: str) -> Optional[str]: - """Extract struct name from struct node.""" - for child in node.children: - if child.type == "identifier": - return self._get_node_text_ts(child) - return None - - def _get_enum_name_ts(self, node, content: str) -> Optional[str]: - """Extract enum name from enum node.""" - for child in node.children: - if child.type == "identifier": - return self._get_node_text_ts(child) - return None - - def _get_variable_name_ts(self, node, content: str) -> Optional[str]: - """Extract variable name from variable declaration node.""" - for child in node.children: - if child.type == "identifier": - return self._get_node_text_ts(child) - return None - - def _get_test_name_ts(self, node, content: str) -> Optional[str]: - """Extract test name from test node.""" - for child in node.children: - if child.type == "string_literal": - # Test with string name: test "my test" {} - text = self._get_node_text_ts(child) - if text: - return text.strip('"') - elif child.type == "identifier": - # Test with identifier: test my_test {} - return self._get_node_text_ts(child) - return None - - def _get_node_text_ts(self, node) -> Optional[str]: - """Get text content of a Tree-sitter node.""" - if hasattr(node, 'text'): - try: - return node.text.decode('utf-8') - except: - pass - return None - - def _is_const_declaration(self, node) -> bool: - """Check if a declaration is const.""" - return node.type == "const_declaration" - - # Symbol ID creation methods - def _create_function_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for function.""" - return self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="()." - ) - - def _create_struct_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for struct.""" - return self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="#" - ) - - def _create_enum_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for enum.""" - return self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="#" - ) - - def _create_variable_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str], node) -> str: - """Create symbol ID for variable/constant.""" - descriptor = "." - return self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor=descriptor - ) - - def _create_test_symbol_id_ts(self, name: str, file_path: str, scope_stack: List[str]) -> str: - """Create symbol ID for test.""" - return self.symbol_manager.create_local_symbol( - language="zig", - file_path=file_path, - symbol_path=scope_stack + [name], - descriptor="()." - ) - - # Occurrence creation methods - def _create_function_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for function.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierFunctionDefinition - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_struct_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for struct.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierType - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_enum_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for enum.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierType - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_variable_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for variable/constant.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - - # Check if this variable is an import by examining the node for @import - is_import = self._is_variable_import(node) - - if is_import: - occurrence.symbol_roles = scip_pb2.Import # Mark as Import role - occurrence.syntax_kind = scip_pb2.IdentifierNamespace - else: - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierConstant - - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - def _create_test_occurrence_ts(self, node, symbol_id: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for test.""" - if not self.position_calculator: - return None - - try: - range_obj = self.position_calculator.tree_sitter_node_to_range(node) - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = scip_pb2.Definition - occurrence.syntax_kind = scip_pb2.IdentifierFunctionDefinition - occurrence.range.CopyFrom(range_obj) - return occurrence - except: - return None - - # Symbol information creation methods - def _create_function_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for function.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - symbol_info.kind = scip_pb2.Function - - symbol_info.documentation.append("Zig function") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_struct_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for struct.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - symbol_info.kind = scip_pb2.Struct - - symbol_info.documentation.append("Zig struct") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_enum_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for enum.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - symbol_info.kind = scip_pb2.Enum - - symbol_info.documentation.append("Zig enum") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_variable_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for variable/constant.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - - # Determine if it's const or var - is_const = self._is_const_declaration(node) - symbol_info.kind = scip_pb2.Constant if is_const else scip_pb2.Variable - symbol_info.documentation.append("Zig constant" if is_const else "Zig variable") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_test_symbol_info_ts(self, node, symbol_id: str, name: str, relationships: Optional[List[scip_pb2.Relationship]] = None) -> scip_pb2.SymbolInformation: - """Create SCIP symbol information for test.""" - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = name - symbol_info.kind = scip_pb2.Function - - symbol_info.documentation.append("Zig test") - - if relationships and self.relationship_manager: - self.relationship_manager.add_relationships_to_symbol(symbol_info, relationships) - - return symbol_info - - def _create_scip_relationships(self, symbol_relationships: List[tuple]) -> List[scip_pb2.Relationship]: - """Convert internal relationships to SCIP relationships.""" - scip_relationships = [] - for target_symbol_id, relationship_type in symbol_relationships: - relationship = scip_pb2.Relationship() - relationship.symbol = target_symbol_id - relationship.is_reference = True - scip_relationships.append(relationship) - return scip_relationships - - # Dependency handling methods (Zig-specific) - def _is_import_call(self, node) -> bool: - """Check if a builtin function call is an @import call.""" - if node.type != "builtin_function_call": - return False - - for child in node.children: - if child.type == "builtin_identifier": - name = self._get_node_text_ts(child) - return name == "@import" - return False - - def _handle_import_declaration(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Handle @import() declarations.""" - import_path = self._extract_import_path_from_node(node) - if not import_path: - return - - # Classify dependency type - dependency_type = self._classify_zig_dependency(import_path) - - # Store dependency - if import_path not in self.dependencies['imports'][dependency_type]: - self.dependencies['imports'][dependency_type].append(import_path) - - # Create SCIP symbol for import - var_name = f"import_{import_path.replace('.', '_').replace('/', '_')}" - local_id = ".".join(scope_stack + [var_name]) if scope_stack else var_name - symbol_id = f"local {local_id}(import)" - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Namespace, - display_name=var_name, - documentation=[f"Zig import from {import_path}"] - ) - - def _extract_import_path_from_node(self, node) -> Optional[str]: - """Extract import path from @import() call.""" - # Look for string in arguments based on actual AST structure - for child in node.children: - if child.type == "arguments": - for arg in child.children: - if arg.type == "string": - # Extract from string_content child - for string_child in arg.children: - if string_child.type == "string_content": - path = self._get_node_text_ts(string_child) - if path: - return path - return None - - def _classify_zig_dependency(self, import_path: str) -> str: - """Classify Zig dependency based on import path.""" - # Zig standard library modules - zig_std_modules = { - 'std', 'builtin', 'root', 'testing', 'math', 'mem', 'fs', 'net', - 'json', 'fmt', 'log', 'crypto', 'hash', 'sort', 'thread', 'atomic', - 'os', 'process', 'time', 'random', 'debug', 'meta', 'ascii', 'unicode' - } - - if import_path in zig_std_modules: - return 'standard_library' - elif import_path.startswith('./') or import_path.startswith('../') or import_path.endswith('.zig'): - return 'local' - else: - return 'third_party' - - def _extract_calls_from_node_ts(self, node, source_symbol_id: str, relationships: Dict, file_path: str, scope_stack: List[str], content: str) -> None: - """Extract function calls from a Tree-sitter node.""" - def visit_for_calls(n): - if n.type == 'call_expression': - # Get the function being called - function_node = n.children[0] if n.children else None - if function_node and function_node.type == 'identifier': - target_name = self._get_node_text_ts(function_node) - if target_name: - target_symbol_id = self._create_function_symbol_id_ts(target_name, file_path, scope_stack) - if source_symbol_id not in relationships: - relationships[source_symbol_id] = [] - relationships[source_symbol_id].append((target_symbol_id, InternalRelationshipType.CALLS)) - - for child in n.children: - visit_for_calls(child) - - visit_for_calls(node) - - def _check_for_import_in_variable(self, node, file_path: str, scope_stack: List[str], content: str) -> None: - """Check if a variable declaration contains an @import call.""" - for child in node.children: - if child.type == 'builtin_function': - # Check if it's @import - builtin_id = None - for grandchild in child.children: - if grandchild.type == 'builtin_identifier': - builtin_id = self._get_node_text_ts(grandchild) - break - - if builtin_id == '@import': - # Extract import path - import_path = self._extract_import_path_from_node(child) - if import_path: - # Classify and store dependency - dependency_type = self._classify_zig_dependency(import_path) - if import_path not in self.dependencies['imports'][dependency_type]: - self.dependencies['imports'][dependency_type].append(import_path) - - # Create SCIP symbol for import - var_name = self._get_variable_name_ts(node, content) - if var_name: - local_id = ".".join(scope_stack + [var_name]) if scope_stack else var_name - symbol_id = f"local {local_id}(import)" - - dummy_range = scip_pb2.Range() - dummy_range.start.extend([0, 0]) - dummy_range.end.extend([0, 1]) - - self.reference_resolver.register_symbol_definition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=dummy_range, - symbol_kind=scip_pb2.Namespace, - display_name=var_name, - documentation=[f"Zig import from {import_path}"] - ) - - def get_dependencies(self) -> Dict[str, Any]: - """Get collected dependencies for MCP response.""" - return self.dependencies - - def _reset_dependencies(self) -> None: - """Reset dependency tracking for new file analysis.""" - self.dependencies = { - 'imports': { - 'standard_library': [], - 'third_party': [], - 'local': [] - } - } - - def _add_dependency_info_to_symbols(self, document: scip_pb2.Document, content: str) -> None: - """Add dependency classification information to SCIP symbols.""" - if not self.dependencies['imports']: - return - - # Update existing import symbols with dependency classification - for symbol_info in document.symbols: - symbol_name = self._extract_symbol_name_from_id(symbol_info.symbol) - - # Check if this symbol is an import - if self._is_import_symbol(symbol_name, symbol_info): - # Find which dependency category this import belongs to - dependency_type = self._find_dependency_type(symbol_name) - if dependency_type: - # Update symbol documentation with dependency type - symbol_info.documentation.append(f"Dependency type: {dependency_type}") - # Mark as import role - if hasattr(symbol_info, 'symbol_roles'): - symbol_info.symbol_roles |= 2 # SymbolRole.Import = 2 - - def _count_dependencies(self) -> str: - """Get dependency count summary for logging.""" - total = (len(self.dependencies['imports']['standard_library']) + - len(self.dependencies['imports']['third_party']) + - len(self.dependencies['imports']['local'])) - return f"{total} total ({len(self.dependencies['imports']['standard_library'])} std, " \ - f"{len(self.dependencies['imports']['third_party'])} 3rd, " \ - f"{len(self.dependencies['imports']['local'])} local)" - - def _extract_symbol_name_from_id(self, symbol_id: str) -> str: - """Extract symbol name from SCIP symbol ID.""" - # Symbol ID format: "scip-zig local code-index-mcp .../filename/symbol_name." - parts = symbol_id.split('/') - if parts: - last_part = parts[-1] - # Remove trailing descriptor (., (), #) - if last_part.endswith('.'): - return last_part[:-1] - elif last_part.endswith('().'): - return last_part[:-3] - elif last_part.endswith('#'): - return last_part[:-1] - return "" - - def _is_import_symbol(self, symbol_name: str, symbol_info: scip_pb2.SymbolInformation) -> bool: - """Check if a symbol represents an import.""" - # Check if symbol documentation mentions import - for doc in symbol_info.documentation: - if "import" in doc.lower(): - return True - return False - - def _find_dependency_type(self, symbol_name: str) -> str: - """Find which dependency type category a symbol belongs to.""" - for dep_type, imports in self.dependencies['imports'].items(): - if symbol_name in imports: - return dep_type - return "" - - def _register_dependencies_with_symbol_manager(self) -> None: - """Register collected dependencies with the symbol manager.""" - if not self.symbol_manager or not self.dependencies['imports']: - return - - for dep_type, imports in self.dependencies['imports'].items(): - for import_path in imports: - try: - # Register with symbol manager for global dependency tracking - symbol_id = self.symbol_manager.moniker_manager.register_import( - package_name=import_path, - symbol_name=import_path, # Use import path as symbol name - module_path="", - alias=None, - import_kind="namespace", # Zig imports are namespace-like - version="" # Zig doesn't use version in @import() - ) - logger.debug(f"Registered dependency: {import_path} ({dep_type}) -> {symbol_id}") - except Exception as e: - logger.warning(f"Failed to register dependency {import_path}: {e}") - - def _is_variable_import(self, node) -> bool: - """Check if a variable declaration contains an @import call.""" - for child in node.children: - if child.type == 'builtin_function': - # Check if it's @import - builtin_id = None - for grandchild in child.children: - if grandchild.type == 'builtin_identifier': - builtin_id = self._get_node_text_ts(grandchild) - break - - if builtin_id == '@import': - return True - return False diff --git a/src/code_index_mcp/services/project_management_service.py b/src/code_index_mcp/services/project_management_service.py index ac3013b..c18e1a9 100644 --- a/src/code_index_mcp/services/project_management_service.py +++ b/src/code_index_mcp/services/project_management_service.py @@ -177,6 +177,22 @@ def _initialize_index_manager(self, project_path: str) -> Dict[str, Any]: Dictionary with initialization results """ with self._noop_operation(): + # Check if index needs rebuild before initialization + needs_rebuild = not self.helper.settings.is_latest_index() + + if needs_rebuild: + # Clean up legacy files + self.helper.settings.cleanup_legacy_files() + + # Force rebuild by ensuring fresh start + try: + from ..services.index_management_service import IndexManagementService + index_service = IndexManagementService(self._context) + index_service.rebuild_index() + except Exception: + # If rebuild fails, continue with normal initialization + pass + # Create unified index manager index_manager = UnifiedIndexManager(project_path, self.helper.settings) diff --git a/src/code_index_mcp/tools/config/project_config_tool.py b/src/code_index_mcp/tools/config/project_config_tool.py index 812dd93..304b974 100644 --- a/src/code_index_mcp/tools/config/project_config_tool.py +++ b/src/code_index_mcp/tools/config/project_config_tool.py @@ -96,12 +96,12 @@ def save_index_data(self, index_data: Dict[str, Any]) -> None: self._settings.save_index(index_data) - def check_index_version(self) -> Optional[str]: + def check_index_version(self) -> bool: """ - Check the version of existing index. + Check if index is the latest version. Returns: - Version string or None if no index exists + True if latest SCIP index exists, False if needs rebuild Raises: RuntimeError: If settings not initialized @@ -109,14 +109,11 @@ def check_index_version(self) -> Optional[str]: if not self._settings: raise RuntimeError("Settings not initialized") - return self._settings.detect_index_version() + return self._settings.is_latest_index() - def migrate_legacy_index(self) -> bool: + def cleanup_legacy_files(self) -> None: """ - Migrate legacy index format if needed. - - Returns: - True if migration successful or not needed, False if manual rebuild required + Clean up legacy index files. Raises: RuntimeError: If settings not initialized @@ -124,7 +121,7 @@ def migrate_legacy_index(self) -> bool: if not self._settings: raise RuntimeError("Settings not initialized") - return self._settings.migrate_legacy_index() + self._settings.cleanup_legacy_files() def get_search_tool_info(self) -> Dict[str, Any]: """ diff --git a/src/code_index_mcp/tools/scip/relationship_info.py b/src/code_index_mcp/tools/scip/relationship_info.py index 8076f4a..ed640b0 100644 --- a/src/code_index_mcp/tools/scip/relationship_info.py +++ b/src/code_index_mcp/tools/scip/relationship_info.py @@ -12,24 +12,24 @@ class RelationshipType(Enum): """Unified relationship types for all programming languages""" - + # Function relationships FUNCTION_CALL = "function_call" METHOD_CALL = "method_call" - + # Type relationships INHERITANCE = "inheritance" INTERFACE_IMPLEMENTATION = "interface_implementation" TYPE_REFERENCE = "type_reference" - + # Variable relationships VARIABLE_REFERENCE = "variable_reference" VARIABLE_ASSIGNMENT = "variable_assignment" - + # Module relationships MODULE_IMPORT = "module_import" MODULE_EXPORT = "module_export" - + # Generic relationships (fallback) REFERENCE = "reference" DEFINITION = "definition" @@ -38,114 +38,104 @@ class RelationshipType(Enum): @dataclass class RelationshipInfo: """Complete information about a single relationship""" - + target: str # Target symbol name target_symbol_id: str # Complete SCIP symbol ID - line: int # Line where relationship occurs - column: int # Column where relationship occurs relationship_type: RelationshipType # Type of relationship source: Optional[str] = None # Source symbol name (for reverse relationships) source_symbol_id: Optional[str] = None # Source symbol ID (for reverse relationships) - + def to_dict(self) -> Dict[str, Any]: """Convert to dictionary format for JSON output""" result = { "target": self.target, "target_symbol_id": self.target_symbol_id, - "line": self.line, - "column": self.column, "relationship_type": self.relationship_type.value } - + if self.source: result["source"] = self.source if self.source_symbol_id: result["source_symbol_id"] = self.source_symbol_id - + return result @dataclass class SymbolRelationships: """Container for all relationships of a symbol""" - + # Active relationships (this symbol to others) calls: List[RelationshipInfo] = field(default_factory=list) inherits_from: List[RelationshipInfo] = field(default_factory=list) implements: List[RelationshipInfo] = field(default_factory=list) references: List[RelationshipInfo] = field(default_factory=list) - + # Passive relationships (others to this symbol) called_by: List[RelationshipInfo] = field(default_factory=list) inherited_by: List[RelationshipInfo] = field(default_factory=list) implemented_by: List[RelationshipInfo] = field(default_factory=list) referenced_by: List[RelationshipInfo] = field(default_factory=list) - + def add_relationship(self, relationship: RelationshipInfo, is_reverse: bool = False): - """Add a relationship to the appropriate category""" + """Add a relationship to the appropriate category with deduplication""" rel_type = relationship.relationship_type - + if is_reverse: # This is a reverse relationship (others -> this symbol) if rel_type in [RelationshipType.FUNCTION_CALL, RelationshipType.METHOD_CALL]: - self.called_by.append(relationship) + self._add_unique_relationship(self.called_by, relationship) elif rel_type == RelationshipType.INHERITANCE: - self.inherited_by.append(relationship) + self._add_unique_relationship(self.inherited_by, relationship) elif rel_type == RelationshipType.INTERFACE_IMPLEMENTATION: - self.implemented_by.append(relationship) + self._add_unique_relationship(self.implemented_by, relationship) else: - self.referenced_by.append(relationship) + self._add_unique_relationship(self.referenced_by, relationship) else: # This is a forward relationship (this symbol -> others) if rel_type in [RelationshipType.FUNCTION_CALL, RelationshipType.METHOD_CALL]: - self.calls.append(relationship) + self._add_unique_relationship(self.calls, relationship) elif rel_type == RelationshipType.INHERITANCE: - self.inherits_from.append(relationship) + self._add_unique_relationship(self.inherits_from, relationship) elif rel_type == RelationshipType.INTERFACE_IMPLEMENTATION: - self.implements.append(relationship) + self._add_unique_relationship(self.implements, relationship) else: - self.references.append(relationship) - + self._add_unique_relationship(self.references, relationship) + + def _add_unique_relationship(self, relationship_list: List[RelationshipInfo], new_relationship: RelationshipInfo): + """Add relationship only if it doesn't already exist""" + for existing in relationship_list: + if (existing.target_symbol_id == new_relationship.target_symbol_id and + existing.relationship_type == new_relationship.relationship_type): + return # Skip duplicate + relationship_list.append(new_relationship) + def get_total_count(self) -> int: """Get total number of relationships""" - return (len(self.calls) + len(self.called_by) + + return (len(self.calls) + len(self.called_by) + len(self.inherits_from) + len(self.inherited_by) + len(self.implements) + len(self.implemented_by) + len(self.references) + len(self.referenced_by)) - + def to_dict(self) -> Dict[str, List[Dict[str, Any]]]: - """Convert to dictionary format for JSON output""" + """Convert to dictionary format for JSON output - simplified for token efficiency""" result = {} - - # Only include non-empty relationship categories - if self.calls: - result["calls"] = [rel.to_dict() for rel in self.calls] + + # Only include called_by relationships if self.called_by: result["called_by"] = [rel.to_dict() for rel in self.called_by] - if self.inherits_from: - result["inherits_from"] = [rel.to_dict() for rel in self.inherits_from] - if self.inherited_by: - result["inherited_by"] = [rel.to_dict() for rel in self.inherited_by] - if self.implements: - result["implements"] = [rel.to_dict() for rel in self.implements] - if self.implemented_by: - result["implemented_by"] = [rel.to_dict() for rel in self.implemented_by] - if self.references: - result["references"] = [rel.to_dict() for rel in self.references] - if self.referenced_by: - result["referenced_by"] = [rel.to_dict() for rel in self.referenced_by] - + return result @dataclass class RelationshipsSummary: """Summary statistics for all relationships in a file""" - + total_relationships: int by_type: Dict[str, int] cross_file_relationships: int - + def to_dict(self) -> Dict[str, Any]: """Convert to dictionary format for JSON output""" return { @@ -157,88 +147,290 @@ def to_dict(self) -> Dict[str, Any]: class SCIPRelationshipReader: """Reads and parses relationships from SCIP index""" - + def __init__(self): """Initialize the relationship reader""" - pass - - def extract_relationships_from_document(self, document) -> Dict[str, SymbolRelationships]: + self._symbol_kinds = {} # symbol_id -> SymbolKind mapping + + def extract_relationships_from_document(self, document, scip_index=None) -> Dict[str, SymbolRelationships]: """ - Extract all relationships from a SCIP document - + Enhanced relationship extraction from both symbol.relationships and occurrences. + + This dual-source approach dramatically improves relationship coverage: + - symbol.relationships: Explicit relationships (inheritance, implements) + - occurrences: Implicit relationships (function calls, references) + - Cross-document analysis: Enables called_by relationships across files + Args: document: SCIP document containing symbols and relationships - + scip_index: Optional full SCIP index for cross-document analysis + Returns: Dictionary mapping symbol_id -> SymbolRelationships """ all_relationships = {} + + # Step 0: Build global symbol registry for cross-document analysis + self._build_global_symbol_registry(document, scip_index) + + # Step 1: Extract from explicit symbol relationships (existing logic) + self._extract_from_symbol_relationships(document, all_relationships) + + # Step 2: Extract from occurrences with cross-document support + self._extract_from_occurrences(document, all_relationships, scip_index) + + # Step 3: Build reverse relationships with cross-document support + self._build_reverse_relationships(all_relationships, document, scip_index) + + return all_relationships + + def _build_global_symbol_registry(self, document, scip_index=None): + """Build comprehensive symbol registry supporting cross-document analysis.""" + # Clear previous state + self._symbol_kinds.clear() + + # Build registry from current document + self._add_document_to_registry(document) - # Process each symbol in the document + # If full index provided, build global registry for cross-document analysis + if scip_index: + for doc in scip_index.documents: + if doc != document: # Avoid duplicate processing + self._add_document_to_registry(doc) + + def _add_document_to_registry(self, document): + """Add document symbols to the global registry.""" for symbol_info in document.symbols: symbol_id = symbol_info.symbol - symbol_name = symbol_info.display_name + self._symbol_kinds[symbol_id] = symbol_info.kind + # For function symbols, also map the occurrence format (without ().suffix) + if symbol_info.kind == 11: # SymbolKind.Function + if symbol_id.endswith('().'): + base_id = symbol_id[:-3] # Remove '().' + self._symbol_kinds[base_id] = symbol_info.kind + + def _extract_from_symbol_relationships(self, document, all_relationships: Dict[str, SymbolRelationships]): + """ + Extract relationships from explicit symbol.relationships (original logic). + + Args: + document: SCIP document + all_relationships: Dictionary to populate with relationships + """ + for symbol_info in document.symbols: + symbol_id = symbol_info.symbol + symbol_name = symbol_info.display_name + if not symbol_info.relationships: continue - - # Create relationships container for this symbol - symbol_rels = SymbolRelationships() - - # Process each relationship + + # Create or get existing relationships container + if symbol_id not in all_relationships: + all_relationships[symbol_id] = SymbolRelationships() + + symbol_rels = all_relationships[symbol_id] + + # Process each explicit relationship for scip_relationship in symbol_info.relationships: rel_info = self._parse_scip_relationship( scip_relationship, symbol_name, symbol_id, document ) if rel_info: symbol_rels.add_relationship(rel_info) - - if symbol_rels.get_total_count() > 0: - all_relationships[symbol_id] = symbol_rels - - # Build reverse relationships - self._build_reverse_relationships(all_relationships, document) - - return all_relationships - - def _parse_scip_relationship(self, scip_relationship, source_name: str, + + def _extract_from_occurrences(self, document, all_relationships: Dict[str, SymbolRelationships], scip_index=None): + """ + Extract relationships from document occurrences (major new functionality). + + This extracts the majority of missing relationships, especially function calls. + + Args: + document: SCIP document containing occurrences + all_relationships: Dictionary to populate with relationships + """ + # Process each occurrence to find relationships + for occurrence in document.occurrences: + try: + # Skip if no symbol or range information + if not occurrence.symbol or not hasattr(occurrence, 'range'): + continue + + target_symbol_id = occurrence.symbol + roles = getattr(occurrence, 'symbol_roles', 0) + + # Skip definitions and imports - these aren't "uses" of other symbols + if roles & 1: # Definition role - skip + continue + if roles & 2: # Import role - skip + continue + + # Find which symbol contains this occurrence (context analysis) + source_symbol_id = self._find_containing_symbol(occurrence, document) + if not source_symbol_id or source_symbol_id == target_symbol_id: + continue # Self-reference or no container found + + # Determine relationship type based on roles and symbol characteristics + rel_type = self._determine_occurrence_relationship_type(roles, target_symbol_id, source_symbol_id) + if not rel_type: + continue + + + # Create relationship info + rel_info = RelationshipInfo( + target=self._extract_symbol_name(target_symbol_id), + target_symbol_id=target_symbol_id, + relationship_type=rel_type + ) + + # Add to source symbol's relationships + if source_symbol_id not in all_relationships: + all_relationships[source_symbol_id] = SymbolRelationships() + + all_relationships[source_symbol_id].add_relationship(rel_info) + + # For function calls, also create reverse "called_by" relationship + # This is the key to cross-document relationship building + if (rel_type == RelationshipType.FUNCTION_CALL or rel_type == RelationshipType.METHOD_CALL): + self._add_cross_document_called_by( + all_relationships, target_symbol_id, source_symbol_id, scip_index + ) + + except Exception as e: + # Log but continue processing other occurrences + continue + + def _find_containing_symbol(self, occurrence, document) -> Optional[str]: + """ + Find which symbol definition contains this occurrence. + + This is crucial for establishing "X calls Y" relationships. + """ + if not hasattr(occurrence, 'range') or not occurrence.range: + return None + + try: + occ_line = occurrence.range.start[0] if occurrence.range.start else 0 + except (AttributeError, IndexError): + return None + + # Find symbol definitions that could contain this occurrence + containing_symbols = [] + + for other_occurrence in document.occurrences: + try: + # Only consider definitions + roles = getattr(other_occurrence, 'symbol_roles', 0) + if not (roles & 1): # Must be definition + continue + + if not hasattr(other_occurrence, 'range') or not other_occurrence.range: + continue + + def_line = other_occurrence.range.start[0] if other_occurrence.range.start else 0 + + # Simple heuristic: find the closest preceding definition + if def_line <= occ_line: + containing_symbols.append((other_occurrence.symbol, def_line)) + + except Exception: + continue + + # Return the symbol with the closest line number to the occurrence + if containing_symbols: + containing_symbols.sort(key=lambda x: x[1], reverse=True) # Closest first + return containing_symbols[0][0] + + # If no containing symbol found, use file-level context for cross-file relationships + # This handles cases like run.py calling server.py functions + if hasattr(document, 'relative_path') and document.relative_path: + file_name = document.relative_path.replace('\\', '/').split('/')[-1] + return f"local file:{file_name}" + + return None + + def _determine_occurrence_relationship_type(self, roles: int, target_symbol_id: str, + source_symbol_id: str) -> Optional[RelationshipType]: + """ + Determine relationship type from occurrence roles and symbol characteristics. + + Args: + roles: SCIP symbol roles (bit flags) + target_symbol_id: Symbol being referenced + source_symbol_id: Symbol doing the referencing + + Returns: + RelationshipType or None if not a relevant relationship + """ + # Write access (assignment/modification) + if roles & 4: # Write role + return RelationshipType.VARIABLE_ASSIGNMENT + + # Read access - determine specific type + if roles == 0 or roles & 8: # Read role or unspecified + if self._is_function_symbol(target_symbol_id): + return RelationshipType.FUNCTION_CALL if not self._is_method_symbol(target_symbol_id) else RelationshipType.METHOD_CALL + elif self._is_class_symbol(target_symbol_id): + return RelationshipType.TYPE_REFERENCE + else: + return RelationshipType.VARIABLE_REFERENCE + + # Type role + if roles & 64: # Type role + return RelationshipType.TYPE_REFERENCE + + # Default to generic reference + return RelationshipType.REFERENCE + + def _is_function_symbol(self, symbol_id: str) -> bool: + """Check if symbol represents a function using SymbolKind.""" + # Check our symbol kinds cache + symbol_kind = self._symbol_kinds.get(symbol_id) + return symbol_kind == 11 # SymbolKind.Function + + def _is_method_symbol(self, symbol_id: str) -> bool: + """Check if symbol represents a method (function within a class).""" + return '#' in symbol_id and self._is_function_symbol(symbol_id) + + def _is_class_symbol(self, symbol_id: str) -> bool: + """Check if symbol represents a class using SymbolKind.""" + # Check our symbol kinds cache + symbol_kind = self._symbol_kinds.get(symbol_id) + return symbol_kind == 3 # SymbolKind.Class + + + def _parse_scip_relationship(self, scip_relationship, source_name: str, source_symbol_id: str, document) -> Optional[RelationshipInfo]: """ Parse a single SCIP relationship into RelationshipInfo - + Args: scip_relationship: SCIP Relationship object source_name: Name of the source symbol source_symbol_id: SCIP ID of the source symbol document: SCIP document for context - + Returns: RelationshipInfo object or None if parsing fails """ target_symbol_id = scip_relationship.symbol - + # Extract target symbol name from symbol ID target_name = self._extract_symbol_name(target_symbol_id) - + # Determine relationship type from SCIP flags rel_type = self._determine_relationship_type(scip_relationship, target_symbol_id) - - # Find the location where this relationship occurs - line, column = self._find_relationship_location( - source_symbol_id, target_symbol_id, document - ) - + + return RelationshipInfo( target=target_name, target_symbol_id=target_symbol_id, - line=line, - column=column, relationship_type=rel_type ) - + def _determine_relationship_type(self, scip_relationship, target_symbol_id: str) -> RelationshipType: """Determine the relationship type from SCIP flags and symbol ID""" - + # Check SCIP relationship flags if scip_relationship.is_implementation: return RelationshipType.INTERFACE_IMPLEMENTATION @@ -260,10 +452,14 @@ def _determine_relationship_type(self, scip_relationship, target_symbol_id: str) else: # Fallback return RelationshipType.REFERENCE - + def _extract_symbol_name(self, symbol_id: str) -> str: """Extract the symbol name from SCIP symbol ID""" try: + # Handle file-level symbols + if symbol_id.startswith("local file:"): + return symbol_id[11:] # Remove "local file:" prefix + # SCIP symbol format: scip- / if "/" in symbol_id: symbol_part = symbol_id.split("/")[-1] @@ -277,82 +473,139 @@ def _extract_symbol_name(self, symbol_id: str) -> str: return symbol_id except: return symbol_id - - def _find_relationship_location(self, source_symbol_id: str, target_symbol_id: str, - document) -> tuple[int, int]: - """Find the line and column where the relationship occurs""" + + + def _add_cross_document_called_by(self, all_relationships: Dict[str, SymbolRelationships], + target_symbol_id: str, source_symbol_id: str, + scip_index=None): + """ + Add cross-document called_by relationship. - # Look for occurrences that reference the target symbol - for occurrence in document.occurrences: - if occurrence.symbol == target_symbol_id: - if hasattr(occurrence, 'range') and occurrence.range: - start = occurrence.range.start - if len(start) >= 2: - return start[0] + 1, start[1] + 1 # Convert to 1-based indexing + This creates the reverse relationship that enables cross-file function call tracking. + For example, when run.py calls server.main(), we add main as called_by run. - # Fallback: look for the source symbol definition - for occurrence in document.occurrences: - if occurrence.symbol == source_symbol_id: - if hasattr(occurrence, 'range') and occurrence.range: - start = occurrence.range.start - if len(start) >= 2: - return start[0] + 1, start[1] + 1 # Convert to 1-based indexing + Args: + all_relationships: Current document's relationships + target_symbol_id: Function being called (e.g., 'local main') + source_symbol_id: Function making the call (e.g., 'local ') + scip_index: Full SCIP index for cross-document lookup + """ + # Find the definition format symbol ID for the target function + definition_symbol_id = self._find_definition_symbol_id(target_symbol_id, scip_index) + if not definition_symbol_id: + return + + # Create called_by relationship + source_name = self._extract_symbol_name(source_symbol_id) + called_by_rel = RelationshipInfo( + target=source_name, + target_symbol_id=source_symbol_id, + relationship_type=RelationshipType.FUNCTION_CALL + ) - # Default fallback - return 0, 0 - - def _build_reverse_relationships(self, all_relationships: Dict[str, SymbolRelationships], - document): - """Build reverse relationships (called_by, inherited_by, etc.)""" + # Add to target function's called_by relationships (with deduplication) + if definition_symbol_id not in all_relationships: + all_relationships[definition_symbol_id] = SymbolRelationships() - # Create a mapping of all symbols for reverse lookup + # Check if this called_by relationship already exists to avoid duplicates + existing_called_by = all_relationships[definition_symbol_id].called_by + for existing_rel in existing_called_by: + if (existing_rel.target_symbol_id == called_by_rel.target_symbol_id and + existing_rel.relationship_type == called_by_rel.relationship_type): + return # Skip duplicate + + all_relationships[definition_symbol_id].called_by.append(called_by_rel) + + def _find_definition_symbol_id(self, occurrence_symbol_id: str, scip_index=None) -> Optional[str]: + """ + Find the definition format symbol ID from occurrence format. + + SCIP uses different formats: + - Occurrences: 'local main' + - Definitions: 'local main().' + + This method maps from occurrence to definition format using SymbolKind. + """ + if not scip_index: + return None + + # If already in definition format, return as-is + if occurrence_symbol_id.endswith('().'): + return occurrence_symbol_id + + # Search all documents for function symbol with this base name + for doc in scip_index.documents: + for symbol_info in doc.symbols: + if symbol_info.kind == 11: # SymbolKind.Function + symbol_id = symbol_info.symbol + if symbol_id.endswith('().'): + # Extract base name from definition format + base_name = symbol_id[:-3] # Remove '().' + if base_name == occurrence_symbol_id: + return symbol_id + + return None + + def _build_reverse_relationships(self, all_relationships: Dict[str, SymbolRelationships], + document, scip_index=None): + """Build reverse relationships (called_by, inherited_by, etc.) with cross-document support""" + + # Create a comprehensive mapping of all symbols for reverse lookup symbol_names = {} + + # Add symbols from current document for symbol_info in document.symbols: symbol_names[symbol_info.symbol] = symbol_info.display_name + # Add symbols from all other documents if full index provided + if scip_index: + for doc in scip_index.documents: + if doc != document: # Avoid duplicate processing + for symbol_info in doc.symbols: + if symbol_info.symbol not in symbol_names: # Avoid overriding + symbol_names[symbol_info.symbol] = symbol_info.display_name + # Build reverse relationships (iterate over a copy to avoid modification during iteration) for source_symbol_id, source_rels in list(all_relationships.items()): source_name = symbol_names.get(source_symbol_id, "unknown") - + # Process each forward relationship to create reverse relationships for rel in source_rels.calls: self._add_reverse_relationship( all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id ) - + for rel in source_rels.inherits_from: self._add_reverse_relationship( all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id ) - + for rel in source_rels.implements: self._add_reverse_relationship( all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id ) - + for rel in source_rels.references: self._add_reverse_relationship( all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id ) - + def _add_reverse_relationship(self, all_relationships: Dict[str, SymbolRelationships], target_symbol_id: str, original_rel: RelationshipInfo, source_name: str, source_symbol_id: str): """Add a reverse relationship to the target symbol""" - + if target_symbol_id not in all_relationships: all_relationships[target_symbol_id] = SymbolRelationships() - + # Create reverse relationship reverse_rel = RelationshipInfo( target=source_name, target_symbol_id=source_symbol_id, - line=original_rel.line, - column=original_rel.column, relationship_type=original_rel.relationship_type, source=original_rel.target, source_symbol_id=original_rel.target_symbol_id ) - + # Add as reverse relationship all_relationships[target_symbol_id].add_relationship(reverse_rel, is_reverse=True) \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py index 9b96cc4..d357dc3 100644 --- a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py +++ b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py @@ -15,8 +15,9 @@ SymbolDefinition, FileAnalysis, ImportGroup, LocationInfo, SymbolLocationError, SymbolResolutionError ) -from .relationship_info import SCIPRelationshipReader +# Removed SCIPRelationshipReader - relationships now read directly from SCIP index from ...scip.core.symbol_manager import SCIPSymbolManager +from .relationship_info import SymbolRelationships, RelationshipInfo, RelationshipType logger = logging.getLogger(__name__) @@ -47,7 +48,7 @@ def __init__(self): self._symbol_kind_cache: Dict[int, str] = {} self._scip_symbol_cache: Dict[str, Dict[str, Any]] = {} self._symbol_parser: Optional[SCIPSymbolManager] = None - self._relationship_reader = SCIPRelationshipReader() + # Removed relationship reader - relationships now read directly from SCIP index # Initialize SCIP symbol kind mapping self._init_symbol_kind_mapping() @@ -538,25 +539,143 @@ def _enrich_symbol_metadata(self, symbol: SymbolDefinition, symbol_info, documen def _extract_call_relationships(self, document, symbols: Dict[str, SymbolDefinition], scip_index): """ - Extract all relationships from SCIP document using the new relationship reader. + Extract relationships from SCIP index and build correct called_by relationships. Args: document: SCIP document containing symbols and relationships symbols: Dictionary of extracted symbols - scip_index: Full SCIP index for cross-file resolution + scip_index: Full SCIP index """ - logger.debug("Starting relationship extraction using SCIP relationship reader") - - # Use the new relationship reader to extract all relationships - all_relationships = self._relationship_reader.extract_relationships_from_document(document) + logger.debug("Building called_by relationships from SCIP index") - # Assign relationships to symbols - for symbol_id, symbol_def in symbols.items(): - if symbol_id in all_relationships: - symbol_def.relationships = all_relationships[symbol_id] - logger.debug(f"Assigned {symbol_def.relationships.get_total_count()} relationships to {symbol_def.name}") + # Step 1: Collect all call relationships from the document + call_relationships = [] # List of (caller_id, target_id) tuples + + for symbol_info in document.symbols: + caller_id = symbol_info.symbol + + # Process each relationship of this symbol + for scip_rel in symbol_info.relationships: + if scip_rel.is_reference: # This indicates a call/reference relationship + target_id = scip_rel.symbol + call_relationships.append((caller_id, target_id)) + + # Step 2: Build called_by relationships by reversing the direction + for caller_id, target_id in call_relationships: + # Find the target symbol and add the caller to its called_by list + if target_id in symbols: + target_symbol = symbols[target_id] + caller_name = self._extract_symbol_name(caller_id) + + # Create RelationshipInfo for called_by + rel_info = RelationshipInfo( + target=caller_name, + target_symbol_id=caller_id, + relationship_type=RelationshipType.FUNCTION_CALL + ) + + # Add to target symbol's called_by relationships with deduplication + target_symbol.relationships.add_relationship(rel_info, is_reverse=True) logger.debug(f"Relationship extraction completed for {len(symbols)} symbols") + + def _convert_scip_relationships(self, scip_relationships, document): + """ + Convert SCIP Relationship objects to our SymbolRelationships format. + + Args: + scip_relationships: List of SCIP Relationship objects + document: SCIP document for context + + Returns: + SymbolRelationships object or None + """ + if not scip_relationships: + return None + + symbol_rels = SymbolRelationships() + + for scip_rel in scip_relationships: + # Extract symbol name from the relationship + target_name = self._extract_symbol_name(scip_rel.symbol) + + + # Create RelationshipInfo + rel_info = RelationshipInfo( + target=target_name, + target_symbol_id=scip_rel.symbol, + relationship_type=RelationshipType.FUNCTION_CALL if scip_rel.is_reference else RelationshipType.REFERENCE + ) + + # Add to appropriate category based on relationship type with deduplication + if scip_rel.is_reference: + # This is a "called_by" relationship (the symbol calls us) + symbol_rels.add_relationship(rel_info, is_reverse=True) + elif scip_rel.is_implementation: + symbol_rels.add_relationship(rel_info, is_reverse=True) # implements + elif scip_rel.is_type_definition: + symbol_rels.add_relationship(rel_info, is_reverse=False) # references + else: + symbol_rels.add_relationship(rel_info, is_reverse=False) # references + + return symbol_rels + + def _find_call_occurrence_position(self, caller_id: str, target_id: str, document) -> tuple[int, int]: + """ + Find the position where caller calls the target by looking up call occurrences. + + Args: + caller_id: The symbol ID of the calling function + target_id: The symbol ID of the called function + document: SCIP document containing occurrences + + Returns: + Tuple of (line, column) of the call or (0, 0) if not found + """ + try: + # Look through document occurrences to find where target_id is referenced + call_positions = [] + + for occurrence in document.occurrences: + if occurrence.symbol == target_id: + # Debug log the occurrence details + logger.debug(f"Found occurrence for {target_id}: roles={occurrence.symbol_roles}, range={occurrence.range}") + + # Only include reference/call occurrences, not definitions + # SCIP SymbolRole: 1=Definition, 8=Read/Reference + if occurrence.symbol_roles != 1: # Not a definition + # Extract line and column from the occurrence range + if occurrence.range and occurrence.range.start: + # SCIP uses 0-based indexing, convert to 1-based for display + line = occurrence.range.start[0] + 1 if len(occurrence.range.start) > 0 else 1 + column = occurrence.range.start[1] + 1 if len(occurrence.range.start) > 1 else 1 + call_positions.append((line, column)) + logger.debug(f"Added call position: line={line}, column={column}") + + # Return the first call position found (we can improve this later to be more specific) + if call_positions: + return call_positions[0] + + # Fallback: if not found in occurrences, return default + return 0, 0 + + except (AttributeError, IndexError, TypeError) as e: + # Handle any issues with accessing the occurrence data + logger.debug(f"Error in _find_call_occurrence_position: {e}") + return 0, 0 + + def _extract_symbol_name(self, symbol_id: str) -> str: + """Extract readable name from symbol ID.""" + if symbol_id.startswith('local '): + # Remove 'local ' prefix and any suffix + name = symbol_id[6:] + # Remove common suffixes + for suffix in ['().', '#', '.', '()']: + if name.endswith(suffix): + name = name[:-len(suffix)] + break + return name + return symbol_id def _organize_results(self, document, symbols: Dict[str, SymbolDefinition], scip_index=None) -> FileAnalysis: """ diff --git a/src/code_index_mcp/tools/scip/symbol_definitions.py b/src/code_index_mcp/tools/scip/symbol_definitions.py index 2ef957b..4bfecd5 100644 --- a/src/code_index_mcp/tools/scip/symbol_definitions.py +++ b/src/code_index_mcp/tools/scip/symbol_definitions.py @@ -227,7 +227,7 @@ def get_class_by_name(self, name: str) -> Optional[SymbolDefinition]: def to_dict(self) -> Dict[str, Any]: - """Convert to final JSON output format - EXACT specification.""" + """Convert to final JSON output format - simplified for token efficiency.""" return { "file_path": self.file_path, "language": self.language, @@ -240,9 +240,6 @@ def to_dict(self) -> Dict[str, Any]: "variables": [var.to_variable_dict() for var in self.variables], "constants": [const.to_constant_dict() for const in self.constants] }, - "dependencies": { - "imports": self.imports.to_dict() - }, "status": "success" } From 96c716472fac8b4448ba7f7e0704f6736441a963 Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Fri, 22 Aug 2025 17:34:58 +0800 Subject: [PATCH 6/8] Refactor dependencies in uv.lock: remove libclang and protobuf, add msgpack; update pathspec version --- LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md | 345 ++++ pyproject.toml | 3 +- src/code_index_mcp/constants.py | 5 +- src/code_index_mcp/indexing/__init__.py | 15 +- .../indexing/json_index_builder.py | 312 ++++ .../indexing/json_index_manager.py | 355 ++++ .../indexing/models/__init__.py | 8 + .../indexing/models/file_info.py | 24 + .../indexing/models/symbol_info.py | 23 + src/code_index_mcp/indexing/scip_builder.py | 260 --- .../indexing/strategies/__init__.py | 8 + .../indexing/strategies/base_strategy.py | 91 + .../indexing/strategies/fallback_strategy.py | 47 + .../indexing/strategies/go_strategy.py | 162 ++ .../indexing/strategies/java_strategy.py | 222 +++ .../strategies/javascript_strategy.py | 353 ++++ .../strategies/objective_c_strategy.py | 157 ++ .../indexing/strategies/python_strategy.py | 203 +++ .../indexing/strategies/strategy_factory.py | 180 ++ .../strategies/typescript_strategy.py | 376 ++++ .../indexing/strategies/zig_strategy.py | 179 ++ .../indexing/unified_index_manager.py | 433 ----- src/code_index_mcp/project_settings.py | 143 +- src/code_index_mcp/scip/__init__.py | 10 - src/code_index_mcp/scip/core/__init__.py | 1 - .../scip/core/local_reference_resolver.py | 470 ----- .../scip/core/moniker_manager.py | 375 ---- .../scip/core/position_calculator.py | 306 ---- .../scip/core/relationship_manager.py | 286 --- .../scip/core/relationship_types.py | 389 ---- .../scip/core/symbol_manager.py | 323 ---- src/code_index_mcp/scip/framework/__init__.py | 157 -- .../scip/framework/base/__init__.py | 13 - .../scip/framework/base/enum_mapper.py | 38 - .../scip/framework/base/index_factory.py | 206 --- .../scip/framework/base/language_analyzer.py | 77 - .../framework/base/relationship_extractor.py | 41 - .../scip/framework/caching_system.py | 346 ---- .../scip/framework/compliance_validator.py | 319 ---- .../scip/framework/fallback/__init__.py | 14 - .../scip/framework/fallback/basic_analyzer.py | 156 -- .../scip/framework/fallback/enum_mapper.py | 102 -- .../scip/framework/fallback/factory.py | 153 -- .../fallback/relationship_extractor.py | 85 - .../scip/framework/index_factory.py | 337 ---- .../scip/framework/java/__init__.py | 14 - .../scip/framework/java/enum_mapper.py | 200 --- .../scip/framework/java/factory.py | 399 ----- .../framework/java/relationship_extractor.py | 295 ---- .../framework/java/tree_sitter_analyzer.py | 327 ---- .../scip/framework/javascript/__init__.py | 14 - .../scip/framework/javascript/enum_mapper.py | 237 --- .../scip/framework/javascript/factory.py | 376 ---- .../javascript/relationship_extractor.py | 281 --- .../framework/javascript/syntax_analyzer.py | 418 ----- .../scip/framework/objective_c/__init__.py | 14 - .../framework/objective_c/clang_analyzer.py | 338 ---- .../scip/framework/objective_c/enum_mapper.py | 228 --- .../scip/framework/objective_c/factory.py | 500 ------ .../objective_c/relationship_extractor.py | 276 --- .../scip/framework/position_calculator.py | 225 --- .../scip/framework/python/__init__.py | 14 - .../scip/framework/python/ast_analyzer.py | 312 ---- .../scip/framework/python/enum_mapper.py | 181 -- .../scip/framework/python/factory.py | 583 ------ .../python/relationship_extractor.py | 205 --- .../scip/framework/relationship_manager.py | 406 ----- .../scip/framework/standard_framework.py | 354 ---- .../scip/framework/streaming_indexer.py | 429 ----- .../scip/framework/symbol_generator.py | 144 -- src/code_index_mcp/scip/framework/types.py | 79 - .../scip/framework/unified_api.py | 456 ----- .../scip/framework/zig/__init__.py | 14 - .../scip/framework/zig/enum_mapper.py | 217 --- .../scip/framework/zig/factory.py | 388 ---- .../framework/zig/relationship_extractor.py | 322 ---- .../framework/zig/tree_sitter_analyzer.py | 357 ---- src/code_index_mcp/scip/language_manager.py | 522 ------ src/code_index_mcp/scip/proto/__init__.py | 1 - src/code_index_mcp/scip/proto/scip.proto | 265 --- src/code_index_mcp/scip/proto/scip_pb2.py | 69 - src/code_index_mcp/server.py | 1 - src/code_index_mcp/services/base_service.py | 4 +- .../services/code_intelligence_service.py | 94 +- .../services/file_discovery_service.py | 249 +-- .../services/index_management_service.py | 161 +- .../services/project_management_service.py | 186 +- .../services/settings_service.py | 22 +- src/code_index_mcp/tools/__init__.py | 3 - .../tools/config/project_config_tool.py | 16 +- .../tools/filesystem/file_matching_tool.py | 9 +- src/code_index_mcp/tools/scip/__init__.py | 8 - .../tools/scip/analyzers/__init__.py | 61 - .../tools/scip/analyzers/base.py | 324 ---- .../tools/scip/analyzers/factory.py | 383 ---- .../scip/analyzers/javascript_analyzer.py | 410 ----- .../tools/scip/analyzers/objc_analyzer.py | 366 ---- .../tools/scip/analyzers/python_analyzer.py | 400 ----- .../tools/scip/analyzers/zig_analyzer.py | 300 ---- .../tools/scip/dependencies/__init__.py | 33 - .../tools/scip/dependencies/classifier.py | 361 ---- .../scip/dependencies/configs/__init__.py | 74 - .../tools/scip/dependencies/configs/base.py | 236 --- .../scip/dependencies/configs/javascript.py | 283 --- .../tools/scip/dependencies/configs/objc.py | 346 ---- .../tools/scip/dependencies/configs/python.py | 355 ---- .../tools/scip/dependencies/configs/zig.py | 266 --- .../tools/scip/dependencies/normalizer.py | 354 ---- .../tools/scip/dependencies/registry.py | 371 ---- .../tools/scip/position/__init__.py | 46 - .../tools/scip/position/calculator.py | 394 ----- .../tools/scip/position/confidence.py | 317 ---- .../tools/scip/position/resolver.py | 436 ----- .../scip/position/strategies/__init__.py | 18 - .../tools/scip/position/strategies/base.py | 185 -- .../scip/position/strategies/heuristic.py | 568 ------ .../position/strategies/scip_occurrence.py | 236 --- .../strategies/tree_sitter_strategy.py | 523 ------ .../tools/scip/relationship_info.py | 611 ------- .../tools/scip/scip_index_tool.py | 230 --- .../tools/scip/scip_symbol_analyzer.py | 1565 ----------------- .../tools/scip/symbol_definitions.py | 291 --- uv.lock | 83 +- 123 files changed, 3284 insertions(+), 25463 deletions(-) create mode 100644 LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md create mode 100644 src/code_index_mcp/indexing/json_index_builder.py create mode 100644 src/code_index_mcp/indexing/json_index_manager.py create mode 100644 src/code_index_mcp/indexing/models/__init__.py create mode 100644 src/code_index_mcp/indexing/models/file_info.py create mode 100644 src/code_index_mcp/indexing/models/symbol_info.py delete mode 100644 src/code_index_mcp/indexing/scip_builder.py create mode 100644 src/code_index_mcp/indexing/strategies/__init__.py create mode 100644 src/code_index_mcp/indexing/strategies/base_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/fallback_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/go_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/java_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/javascript_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/objective_c_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/python_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/strategy_factory.py create mode 100644 src/code_index_mcp/indexing/strategies/typescript_strategy.py create mode 100644 src/code_index_mcp/indexing/strategies/zig_strategy.py delete mode 100644 src/code_index_mcp/indexing/unified_index_manager.py delete mode 100644 src/code_index_mcp/scip/__init__.py delete mode 100644 src/code_index_mcp/scip/core/__init__.py delete mode 100644 src/code_index_mcp/scip/core/local_reference_resolver.py delete mode 100644 src/code_index_mcp/scip/core/moniker_manager.py delete mode 100644 src/code_index_mcp/scip/core/position_calculator.py delete mode 100644 src/code_index_mcp/scip/core/relationship_manager.py delete mode 100644 src/code_index_mcp/scip/core/relationship_types.py delete mode 100644 src/code_index_mcp/scip/core/symbol_manager.py delete mode 100644 src/code_index_mcp/scip/framework/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/base/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/base/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/base/index_factory.py delete mode 100644 src/code_index_mcp/scip/framework/base/language_analyzer.py delete mode 100644 src/code_index_mcp/scip/framework/base/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/caching_system.py delete mode 100644 src/code_index_mcp/scip/framework/compliance_validator.py delete mode 100644 src/code_index_mcp/scip/framework/fallback/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/fallback/basic_analyzer.py delete mode 100644 src/code_index_mcp/scip/framework/fallback/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/fallback/factory.py delete mode 100644 src/code_index_mcp/scip/framework/fallback/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/index_factory.py delete mode 100644 src/code_index_mcp/scip/framework/java/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/java/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/java/factory.py delete mode 100644 src/code_index_mcp/scip/framework/java/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py delete mode 100644 src/code_index_mcp/scip/framework/javascript/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/javascript/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/javascript/factory.py delete mode 100644 src/code_index_mcp/scip/framework/javascript/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py delete mode 100644 src/code_index_mcp/scip/framework/objective_c/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py delete mode 100644 src/code_index_mcp/scip/framework/objective_c/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/objective_c/factory.py delete mode 100644 src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/position_calculator.py delete mode 100644 src/code_index_mcp/scip/framework/python/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/python/ast_analyzer.py delete mode 100644 src/code_index_mcp/scip/framework/python/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/python/factory.py delete mode 100644 src/code_index_mcp/scip/framework/python/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/relationship_manager.py delete mode 100644 src/code_index_mcp/scip/framework/standard_framework.py delete mode 100644 src/code_index_mcp/scip/framework/streaming_indexer.py delete mode 100644 src/code_index_mcp/scip/framework/symbol_generator.py delete mode 100644 src/code_index_mcp/scip/framework/types.py delete mode 100644 src/code_index_mcp/scip/framework/unified_api.py delete mode 100644 src/code_index_mcp/scip/framework/zig/__init__.py delete mode 100644 src/code_index_mcp/scip/framework/zig/enum_mapper.py delete mode 100644 src/code_index_mcp/scip/framework/zig/factory.py delete mode 100644 src/code_index_mcp/scip/framework/zig/relationship_extractor.py delete mode 100644 src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py delete mode 100644 src/code_index_mcp/scip/language_manager.py delete mode 100644 src/code_index_mcp/scip/proto/__init__.py delete mode 100644 src/code_index_mcp/scip/proto/scip.proto delete mode 100644 src/code_index_mcp/scip/proto/scip_pb2.py delete mode 100644 src/code_index_mcp/tools/scip/__init__.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/__init__.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/base.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/factory.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/python_analyzer.py delete mode 100644 src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/__init__.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/classifier.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/__init__.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/base.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/javascript.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/objc.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/python.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/configs/zig.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/normalizer.py delete mode 100644 src/code_index_mcp/tools/scip/dependencies/registry.py delete mode 100644 src/code_index_mcp/tools/scip/position/__init__.py delete mode 100644 src/code_index_mcp/tools/scip/position/calculator.py delete mode 100644 src/code_index_mcp/tools/scip/position/confidence.py delete mode 100644 src/code_index_mcp/tools/scip/position/resolver.py delete mode 100644 src/code_index_mcp/tools/scip/position/strategies/__init__.py delete mode 100644 src/code_index_mcp/tools/scip/position/strategies/base.py delete mode 100644 src/code_index_mcp/tools/scip/position/strategies/heuristic.py delete mode 100644 src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py delete mode 100644 src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py delete mode 100644 src/code_index_mcp/tools/scip/relationship_info.py delete mode 100644 src/code_index_mcp/tools/scip/scip_index_tool.py delete mode 100644 src/code_index_mcp/tools/scip/scip_symbol_analyzer.py delete mode 100644 src/code_index_mcp/tools/scip/symbol_definitions.py diff --git a/LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md b/LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md new file mode 100644 index 0000000..7710532 --- /dev/null +++ b/LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md @@ -0,0 +1,345 @@ +# LLM-Optimized Index Replacement Plan + +## Current Architecture Analysis + +### Actual Implementation Process +1. **Project Initialization**: LLM calls `set_project_path()` to establish project root +2. **File Watcher Activation**: Automatic file monitoring starts with debounced re-indexing +3. **Codebase Traversal**: System scans all files using extension whitelist (SUPPORTED_EXTENSIONS) +4. **Language-Specific Processing**: Different strategies for each language's unique characteristics +5. **Dual Storage**: Index stored in temporary path + in-memory for fast access +6. **Query Tools**: LLMs call analysis tools that use the built index + +### SCIP-Based System Issues +- **Complex Protocol**: SCIP protobuf format designed for IDEs, not LLM consumption +- **Over-Engineering**: Multi-layer abstraction (strategies/factories) creates complexity +- **Token Inefficiency**: Verbose SCIP format wastes LLM context tokens +- **Parsing Overhead**: Complex symbol ID generation and validation +- **Cross-Document Complexity**: Relationship building adds minimal LLM value + +### Current Flow Analysis +``` +set_project_path() → File Watcher Activation → Codebase Traversal (Extension Whitelist) → +Language-Specific Strategies → SCIP Builder → Index Storage (Temp + Memory) → +Query Tools Access Index +``` + +### Reusable Components +- **Extension Whitelist**: SUPPORTED_EXTENSIONS constant defining indexable file types +- **File Watcher Service**: Robust debounced file monitoring with auto re-indexing +- **Language Strategy System**: Multi-language support with unique characteristics per language +- **Dual Storage Pattern**: Temporary file storage + in-memory caching for performance +- **Service Architecture**: Clean 3-layer pattern (MCP → Services → Tools) +- **Tree-sitter Parsing**: High-quality AST parsing for supported languages + +## Replacement Architecture + +### Core Principle +Clean slate approach: Delete all SCIP components and build simple, LLM-optimized JSON indexing system from scratch. Preserve three-layer architecture by only replacing the tool layer. + +### New Index Format Design + +#### Design Rationale +The index should optimize for **LLM query patterns** rather than IDE features: + +1. **Function Tracing Focus**: LLMs primarily need to understand "what calls what" +2. **Fast Lookups**: Hash-based access for instant symbol resolution +3. **Minimal Redundancy**: Avoid duplicate data that wastes tokens +4. **Query-Friendly Structure**: Organize data how LLMs will actually access it +5. **Incremental Updates**: Support efficient file-by-file rebuilds + +#### Multi-Language Index Format +```json +{ + "metadata": { + "project_path": "/absolute/path/to/project", + "indexed_files": 275, + "index_version": "1.0.0", + "timestamp": "2025-01-15T10:30:00Z", + "languages": ["python", "javascript", "java", "objective-c"] + }, + + "symbols": { + "src/main.py::process_data": { + "type": "function", + "file": "src/main.py", + "line": 42, + "signature": "def process_data(items: List[str]) -> None:", + "called_by": ["src/main.py::main"] + }, + "src/main.py::MyClass": { + "type": "class", + "file": "src/main.py", + "line": 10 + }, + "src/main.py::MyClass.process": { + "type": "method", + "file": "src/main.py", + "line": 20, + "signature": "def process(self, data: str) -> bool:", + "called_by": ["src/main.py::process_data"] + }, + "src/MyClass.java::com.example.MyClass": { + "type": "class", + "file": "src/MyClass.java", + "line": 5, + "package": "com.example" + }, + "src/MyClass.java::com.example.MyClass.process": { + "type": "method", + "file": "src/MyClass.java", + "line": 10, + "signature": "public void process(String data)", + "called_by": ["src/Main.java::com.example.Main.main"] + }, + "src/main.js::regularFunction": { + "type": "function", + "file": "src/main.js", + "line": 5, + "signature": "function regularFunction(data)", + "called_by": ["src/main.js::main"] + }, + "src/main.js::MyClass.method": { + "type": "method", + "file": "src/main.js", + "line": 15, + "signature": "method(data)", + "called_by": ["src/main.js::regularFunction"] + } + }, + + "files": { + "src/main.py": { + "language": "python", + "line_count": 150, + "symbols": { + "functions": ["process_data", "helper"], + "classes": ["MyClass"] + }, + "imports": ["os", "json", "typing"] + }, + "src/MyClass.java": { + "language": "java", + "line_count": 80, + "symbols": { + "classes": ["MyClass"] + }, + "package": "com.example", + "imports": ["java.util.List", "java.io.File"] + }, + "src/main.js": { + "language": "javascript", + "line_count": 120, + "symbols": { + "functions": ["regularFunction", "helperFunction"], + "classes": ["MyClass"] + }, + "imports": ["fs", "path"], + "exports": ["regularFunction", "MyClass"] + } + } +} +``` + +#### Key Design Decisions + +**1. Universal Qualified Symbol Names** +- Use `"file::symbol"` for standalone symbols, `"file::scope.symbol"` for nested +- **Why**: Eliminates name collisions across all languages, consistent naming +- **LLM Benefit**: Unambiguous symbol identification with clear hierarchy + +**2. Multi-Language Consistency** +- Same symbol format for Python classes, Java packages, JavaScript exports +- **Why**: Single query pattern works across all languages +- **LLM Benefit**: Learn once, query any language the same way + +**3. Called-By Only Relationships** +- Track only `called_by` arrays, not `calls` +- **Why**: Simpler implementation, linear build performance, focuses on usage +- **LLM Benefit**: Direct answers to "where is function X used?" queries + +**4. Language-Specific Fields** +- Java: `package` field, JavaScript: `exports` array, etc. +- **Why**: Preserve important language semantics without complexity +- **LLM Benefit**: Access language-specific information when needed + +**5. Simplified File Structure** +- Organized `symbols` object with arrays by type (functions, classes) +- **Why**: Fast file-level queries, clear organization +- **LLM Benefit**: Immediate file overview showing what symbols exist + +**6. Scope Resolution Strategy** +- Python: `MyClass.method`, Java: `com.example.MyClass.method` +- **Why**: Natural language patterns, includes necessary context +- **LLM Benefit**: Symbol names match how developers think about code + +### Simplified Flow +``` +set_project_path() → File Watcher Activation → Extension Whitelist Traversal → +Language-Specific Simple Parsers → JSON Index Update → Dual Storage (Temp + Memory) → +Query Tools Access Optimized Index +``` + +## Implementation Plan + +### Phase 1: Clean Slate - Remove SCIP System +- **Delete all SCIP tools**: Remove `src/code_index_mcp/scip/` directory completely +- **Remove protobuf dependencies**: Clean up `scip_pb2.py` and related imports +- **Strip SCIP from services**: Remove SCIP references from business logic layers +- **Clean constants**: Remove `SCIP_INDEX_FILE` and related SCIP constants +- **Update dependencies**: Remove protobuf from `pyproject.toml` + +### Phase 2: Tool Layer Replacement +- **Keep three-layer architecture**: Only modify the tool layer, preserve services/MCP layers +- **New simple index format**: Implement lightweight JSON-based indexing tools +- **Language parsers**: Create simple parsers in tool layer (Python `ast`, simplified tree-sitter) +- **Storage tools**: Implement dual storage tools (temp + memory) for new format +- **Query tools**: Build fast lookup tools for the new index structure + +### Phase 3: Service Layer Integration +- **Minimal service changes**: Services delegate to new tools instead of SCIP tools +- **Preserve business logic**: Keep existing service workflows and validation +- **Maintain interfaces**: Services still expose same functionality to MCP layer +- **File watcher integration**: Connect file watcher to new index rebuild tools + +### Phase 4: MCP Layer Compatibility +- **Zero MCP changes**: Existing `@mcp.tool` functions unchanged +- **Same interfaces**: Tools return data in expected formats +- **Backward compatibility**: Existing LLM workflows continue working +- **Performance gains**: Faster responses with same functionality + +### Phase 5: Build from Scratch Mentality +- **New index design**: Simple, LLM-optimized format built fresh +- **Clean codebase**: Remove all SCIP complexity and start simple +- **Fresh dependencies**: Only essential libraries (no protobuf, simplified tree-sitter) +- **Focused scope**: Build only what's needed for LLM use cases + +## Technical Specifications + +### Index Storage +- **Dual Storage**: Temporary path (`%TEMP%/code_indexer//`) + in-memory caching +- **Format**: JSON with msgpack binary serialization for performance +- **Location**: Follow existing pattern (discoverable via constants.py) +- **Extension Filtering**: Use existing SUPPORTED_EXTENSIONS whitelist +- **Size**: ~10-50KB for typical projects vs ~1-5MB SCIP +- **Access**: Direct dict lookups vs protobuf traversal +- **File Watcher Integration**: Automatic updates when files change + +### Language Support +- **Python**: Built-in `ast` module for optimal performance and accuracy +- **JavaScript/TypeScript**: Existing tree-sitter parsers (proven reliability) +- **Other Languages**: Reuse existing tree-sitter implementations +- **Simplify**: Remove SCIP-specific symbol generation overhead +- **Focus**: Extract symbols and `called_by` relationships only + +### Query Performance +- **Target**: <100ms for any query operation +- **Method**: Hash-based lookups vs linear SCIP traversal +- **Caching**: In-memory symbol registry for instant access + +### File Watching +- **Keep**: Existing watchdog-based file monitoring +- **Optimize**: Batch incremental updates vs full rebuilds +- **Debounce**: Maintain 4-6 second debounce for change batching + +## Migration Strategy + +### Backward Compatibility +- **Zero breaking changes**: Same MCP tool interfaces and return formats +- **Preserve workflows**: File watcher, project setup, and query patterns unchanged +- **Service contracts**: Business logic layer contracts remain stable +- **LLM experience**: Existing LLM usage patterns continue working + +### Rollback Plan +- **Git branch strategy**: Preserve SCIP implementation in separate branch +- **Incremental deployment**: Can revert individual components if needed +- **Performance monitoring**: Compare old vs new system metrics +- **Fallback mechanism**: Quick switch back to SCIP if issues arise + +### Testing Strategy +- Compare output accuracy between SCIP and simple index +- Benchmark query performance improvements +- Validate function tracing completeness +- Test incremental update correctness + +## Expected Benefits + +### Performance Improvements +- **Index Build**: 5-10x faster (no protobuf, no complex call analysis) +- **Query Speed**: 10-100x faster (direct hash lookups) +- **Memory Usage**: 80% reduction (simple JSON vs protobuf) +- **Build Complexity**: Linear O(n) vs complex relationship resolution + +### Maintenance Benefits +- **Code Complexity**: 70% reduction (remove entire SCIP system) +- **Dependencies**: Remove protobuf, simplify tree-sitter usage +- **Debugging**: Human-readable JSON vs binary protobuf +- **Call Analysis**: Simple `called_by` tracking vs complex call graph building + +### LLM Integration Benefits +- **Fast Responses**: Sub-100ms query times for any symbol lookup +- **Token Efficiency**: Qualified names eliminate ambiguity +- **Simple Format**: Direct JSON access patterns +- **Focused Data**: Only essential information for code understanding + +## Risk Mitigation + +### Functionality Loss +- **Risk**: Missing advanced SCIP features +- **Mitigation**: Focus on core LLM use cases (function tracing) +- **Validation**: Compare query completeness with existing system + +### Performance Regression +- **Risk**: New implementation slower than expected +- **Mitigation**: Benchmark against SCIP at each phase +- **Fallback**: Maintain SCIP implementation as backup + +### Migration Complexity +- **Risk**: Difficult transition from SCIP +- **Mitigation**: Phased rollout with feature flags +- **Safety**: Comprehensive testing before production use + +## Success Metrics + +### Performance Targets +- Index build time: <5 seconds for 1000 files +- Query response time: <100ms for any operation +- Memory usage: <50MB for typical projects +- Token efficiency: 90% reduction in LLM context usage + +### Quality Targets +- Function detection accuracy: >95% vs SCIP +- Call chain completeness: >90% vs SCIP +- Incremental update correctness: 100% +- File watcher reliability: Zero missed changes + +## Implementation Timeline + +### Week 1-2: Foundation +- Core index structure and storage +- Basic JSON schema implementation +- Simple parser extraction from existing code + +### Week 3-4: Language Integration +- Tree-sitter parser simplification +- Multi-language symbol extraction +- Function call relationship building + +### Week 5-6: MCP Tools +- LLM-optimized tool implementation +- Performance optimization +- Query response formatting + +### Week 7-8: Integration and Testing +- File watcher integration +- Comprehensive testing +- Migration tooling + +### Week 9-10: Production Deployment +- Feature flag rollout +- Performance monitoring +- SCIP deprecation planning + +## Conclusion + +This replacement plan transforms the code-index-mcp from a complex SCIP-based system into a lean, LLM-optimized indexing solution. By focusing on the core use case of function tracing and rapid codebase understanding, we achieve significant performance improvements while maintaining all essential functionality. The simplified architecture reduces maintenance burden and enables faster iteration on LLM-specific features. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2c0d989..9ce51bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,14 +15,13 @@ authors = [ dependencies = [ "mcp>=0.3.0", "watchdog>=3.0.0", - "protobuf>=4.21.0", "tree-sitter>=0.20.0", "tree-sitter-javascript>=0.20.0", "tree-sitter-typescript>=0.20.0", "tree-sitter-java>=0.20.0", "tree-sitter-zig>=0.20.0", "pathspec>=0.12.1", - "libclang>=16.0.0", + "msgpack>=1.0.0", ] [project.urls] diff --git a/src/code_index_mcp/constants.py b/src/code_index_mcp/constants.py index 97713d1..81b3d9b 100644 --- a/src/code_index_mcp/constants.py +++ b/src/code_index_mcp/constants.py @@ -5,10 +5,7 @@ # Directory and file names SETTINGS_DIR = "code_indexer" CONFIG_FILE = "config.json" -SCIP_INDEX_FILE = "index.scip" # SCIP protobuf binary file -# Legacy files -INDEX_FILE = "index.json" # Legacy JSON index file (to be removed) -# CACHE_FILE removed - no longer needed with new indexing system +INDEX_FILE = "index.json" # JSON index file # Supported file extensions for code analysis # This is the authoritative list used by both old and new indexing systems diff --git a/src/code_index_mcp/indexing/__init__.py b/src/code_index_mcp/indexing/__init__.py index edbcf50..51259ee 100644 --- a/src/code_index_mcp/indexing/__init__.py +++ b/src/code_index_mcp/indexing/__init__.py @@ -1,8 +1,7 @@ """ Code indexing utilities for the MCP server. -This module provides utility functions for duplicate detection and -qualified name generation used by the SCIP indexing system. +This module provides simple JSON-based indexing optimized for LLM consumption. """ # Import utility functions that are still used @@ -11,11 +10,17 @@ normalize_file_path ) -# SCIP builder is still used by the new architecture -from .scip_builder import SCIPIndexBuilder +# New JSON-based indexing system +from .json_index_builder import JSONIndexBuilder, SymbolInfo, FileInfo, IndexMetadata +from .json_index_manager import JSONIndexManager, get_index_manager __all__ = [ 'generate_qualified_name', 'normalize_file_path', - 'SCIPIndexBuilder' + 'JSONIndexBuilder', + 'JSONIndexManager', + 'get_index_manager', + 'SymbolInfo', + 'FileInfo', + 'IndexMetadata' ] \ No newline at end of file diff --git a/src/code_index_mcp/indexing/json_index_builder.py b/src/code_index_mcp/indexing/json_index_builder.py new file mode 100644 index 0000000..8e4ddec --- /dev/null +++ b/src/code_index_mcp/indexing/json_index_builder.py @@ -0,0 +1,312 @@ +""" +JSON Index Builder - Clean implementation using Strategy pattern. + +This replaces the monolithic parser implementation with a clean, +maintainable Strategy pattern architecture. +""" + +import logging +import os +import time +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Dict, List, Optional, Any + +from .strategies import StrategyFactory +from .models import SymbolInfo, FileInfo +from ..constants import SUPPORTED_EXTENSIONS + +logger = logging.getLogger(__name__) + + +@dataclass +class IndexMetadata: + """Metadata for the JSON index.""" + project_path: str + indexed_files: int + index_version: str + timestamp: str + languages: List[str] + total_symbols: int = 0 + specialized_parsers: int = 0 + fallback_files: int = 0 + + +class JSONIndexBuilder: + """ + Main index builder using Strategy pattern for language parsing. + + This class orchestrates the index building process by: + 1. Discovering files in the project + 2. Using StrategyFactory to get appropriate parsers + 3. Extracting symbols and metadata + 4. Assembling the final JSON index + """ + + def __init__(self, project_path: str): + self.project_path = project_path + self.in_memory_index: Optional[Dict[str, Any]] = None + self.strategy_factory = StrategyFactory() + + logger.info(f"Initialized JSON index builder for {project_path}") + strategy_info = self.strategy_factory.get_strategy_info() + logger.info(f"Available parsing strategies: {len(strategy_info)} types") + + # Log specialized vs fallback coverage + specialized = len(self.strategy_factory.get_specialized_extensions()) + fallback = len(self.strategy_factory.get_fallback_extensions()) + logger.info(f"Specialized parsers: {specialized} extensions, Fallback coverage: {fallback} extensions") + + def build_index(self) -> Dict[str, Any]: + """ + Build the complete index using Strategy pattern. + + Returns: + Complete JSON index with metadata, symbols, and file information + """ + logger.info("Building JSON index using Strategy pattern...") + start_time = time.time() + + all_symbols = {} + all_files = {} + languages = set() + specialized_count = 0 + fallback_count = 0 + + # Get specialized extensions for tracking + specialized_extensions = set(self.strategy_factory.get_specialized_extensions()) + + # Traverse project files + for file_path in self._get_supported_files(): + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + ext = Path(file_path).suffix.lower() + + # Convert to relative path first + rel_path = os.path.relpath(file_path, self.project_path).replace('\\', '/') + + # Get appropriate strategy + strategy = self.strategy_factory.get_strategy(ext) + + # Track strategy usage + if ext in specialized_extensions: + specialized_count += 1 + else: + fallback_count += 1 + + # Parse file using strategy with relative path + symbols, file_info = strategy.parse_file(rel_path, content) + + # Add to index + all_symbols.update(symbols) + all_files[rel_path] = file_info + languages.add(file_info.language) + + logger.debug(f"Parsed {rel_path}: {len(symbols)} symbols ({file_info.language})") + + except Exception as e: + logger.warning(f"Error processing {file_path}: {e}") + + # Build index metadata + metadata = IndexMetadata( + project_path=self.project_path, + indexed_files=len(all_files), + index_version="2.0.0-strategy", + timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + languages=sorted(list(languages)), + total_symbols=len(all_symbols), + specialized_parsers=specialized_count, + fallback_files=fallback_count + ) + + # Assemble final index + index = { + "metadata": asdict(metadata), + "symbols": {k: asdict(v) for k, v in all_symbols.items()}, + "files": {k: asdict(v) for k, v in all_files.items()} + } + + # Cache in memory + self.in_memory_index = index + + elapsed = time.time() - start_time + logger.info(f"Built index with {len(all_symbols)} symbols from {len(all_files)} files in {elapsed:.2f}s") + logger.info(f"Languages detected: {sorted(languages)}") + logger.info(f"Strategy usage: {specialized_count} specialized, {fallback_count} fallback") + + return index + + def get_index(self) -> Optional[Dict[str, Any]]: + """Get the current in-memory index.""" + return self.in_memory_index + + def clear_index(self): + """Clear the in-memory index.""" + self.in_memory_index = None + logger.debug("Cleared in-memory index") + + def _get_supported_files(self) -> List[str]: + """ + Get all supported files in the project. + + Returns: + List of file paths that can be parsed + """ + supported_files = [] + supported_extensions = set(SUPPORTED_EXTENSIONS) + + try: + for root, dirs, files in os.walk(self.project_path): + # Skip hidden directories and common ignore patterns + dirs[:] = [d for d in dirs if not d.startswith('.') and d not in { + '__pycache__', 'node_modules', '.git', '.svn', '.hg', + '.vscode', '.idea', 'target', 'build', 'dist' + }] + + for file in files: + if file.startswith('.'): + continue + + file_path = os.path.join(root, file) + ext = Path(file_path).suffix.lower() + + if ext in supported_extensions: + supported_files.append(file_path) + + except Exception as e: + logger.error(f"Error scanning directory {self.project_path}: {e}") + + logger.debug(f"Found {len(supported_files)} supported files") + return supported_files + + def save_index(self, index: Dict[str, Any], index_path: str) -> bool: + """ + Save index to disk. + + Args: + index: Index data to save + index_path: Path where to save the index + + Returns: + True if successful, False otherwise + """ + try: + import json + with open(index_path, 'w', encoding='utf-8') as f: + json.dump(index, f, indent=2, ensure_ascii=False) + logger.info(f"Saved index to {index_path}") + return True + except Exception as e: + logger.error(f"Failed to save index to {index_path}: {e}") + return False + + def load_index(self, index_path: str) -> Optional[Dict[str, Any]]: + """ + Load index from disk. + + Args: + index_path: Path to the index file + + Returns: + Index data if successful, None otherwise + """ + try: + if not os.path.exists(index_path): + logger.debug(f"Index file not found: {index_path}") + return None + + import json + with open(index_path, 'r', encoding='utf-8') as f: + index = json.load(f) + + # Cache in memory + self.in_memory_index = index + logger.info(f"Loaded index from {index_path}") + return index + + except Exception as e: + logger.error(f"Failed to load index from {index_path}: {e}") + return None + + def get_parsing_statistics(self) -> Dict[str, Any]: + """ + Get detailed statistics about parsing capabilities. + + Returns: + Dictionary with parsing statistics and strategy information + """ + strategy_info = self.strategy_factory.get_strategy_info() + + return { + "total_strategies": len(strategy_info), + "specialized_languages": [lang for lang in strategy_info.keys() if not lang.startswith('fallback_')], + "fallback_languages": [lang.replace('fallback_', '') for lang in strategy_info.keys() if lang.startswith('fallback_')], + "total_extensions": len(self.strategy_factory.get_all_supported_extensions()), + "specialized_extensions": len(self.strategy_factory.get_specialized_extensions()), + "fallback_extensions": len(self.strategy_factory.get_fallback_extensions()), + "strategy_details": strategy_info + } + + def get_file_symbols(self, file_path: str) -> List[Dict[str, Any]]: + """ + Get symbols for a specific file. + + Args: + file_path: Relative path to the file + + Returns: + List of symbols in the file + """ + if not self.in_memory_index: + logger.warning("Index not loaded") + return [] + + try: + # Normalize file path + file_path = file_path.replace('\\', '/') + if file_path.startswith('./'): + file_path = file_path[2:] + + # Get file info + file_info = self.in_memory_index["files"].get(file_path) + if not file_info: + logger.warning(f"File not found in index: {file_path}") + return [] + + # Work directly with global symbols for this file + global_symbols = self.in_memory_index.get("symbols", {}) + result = [] + + # Find all symbols for this file directly from global symbols + for symbol_id, symbol_data in global_symbols.items(): + symbol_file = symbol_data.get("file", "").replace("\\", "/") + + # Check if this symbol belongs to our file + if symbol_file == file_path: + symbol_type = symbol_data.get("type", "unknown") + symbol_name = symbol_id.split("::")[-1] # Extract symbol name from ID + + # Create symbol info + symbol_info = { + "name": symbol_name, + "called_by": symbol_data.get("called_by", []), + "line": symbol_data.get("line"), + "signature": symbol_data.get("signature") + } + + # Categorize by type + if symbol_type in ["function", "method"]: + result.append(symbol_info) + elif symbol_type == "class": + result.append(symbol_info) + + # Sort by line number for consistent ordering + result.sort(key=lambda x: x.get("line", 0)) + + return result + + except Exception as e: + logger.error(f"Error getting file symbols for {file_path}: {e}") + return [] diff --git a/src/code_index_mcp/indexing/json_index_manager.py b/src/code_index_mcp/indexing/json_index_manager.py new file mode 100644 index 0000000..d24eb03 --- /dev/null +++ b/src/code_index_mcp/indexing/json_index_manager.py @@ -0,0 +1,355 @@ +""" +JSON Index Manager - Manages the lifecycle of the JSON-based index. + +This replaces the SCIP unified_index_manager with a simpler approach +focused on fast JSON-based indexing and querying. +""" + +import hashlib +import json +import logging +import os +import tempfile +import threading +from pathlib import Path +from typing import Dict, List, Optional, Any + +from .json_index_builder import JSONIndexBuilder +from ..constants import SETTINGS_DIR, INDEX_FILE + +logger = logging.getLogger(__name__) + + +class JSONIndexManager: + """Manages JSON-based code index lifecycle and storage.""" + + def __init__(self): + self.project_path: Optional[str] = None + self.index_builder: Optional[JSONIndexBuilder] = None + self.temp_dir: Optional[str] = None + self.index_path: Optional[str] = None + self._lock = threading.RLock() + logger.info("Initialized JSON Index Manager") + + def set_project_path(self, project_path: str) -> bool: + """Set the project path and initialize index storage.""" + with self._lock: + try: + if not os.path.isdir(project_path): + logger.error(f"Project path does not exist: {project_path}") + return False + + self.project_path = project_path + self.index_builder = JSONIndexBuilder(project_path) + + # Create temp directory for index storage + project_hash = hashlib.md5(project_path.encode()).hexdigest()[:12] + self.temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash) + os.makedirs(self.temp_dir, exist_ok=True) + + self.index_path = os.path.join(self.temp_dir, INDEX_FILE) + + logger.info(f"Set project path: {project_path}") + logger.info(f"Index storage: {self.index_path}") + return True + + except Exception as e: + logger.error(f"Failed to set project path: {e}") + return False + + def build_index(self, force_rebuild: bool = False) -> bool: + """Build or rebuild the index.""" + with self._lock: + if not self.index_builder or not self.project_path: + logger.error("Index builder not initialized") + return False + + try: + # Check if we need to rebuild + if not force_rebuild and self._is_index_fresh(): + logger.info("Index is fresh, skipping rebuild") + return True + + logger.info("Building JSON index...") + index = self.index_builder.build_index() + + # Save to disk + self.index_builder.save_index(index, self.index_path) + + logger.info(f"Successfully built index with {len(index['symbols'])} symbols") + return True + + except Exception as e: + logger.error(f"Failed to build index: {e}") + return False + + def load_index(self) -> bool: + """Load existing index from disk.""" + with self._lock: + if not self.index_builder or not self.index_path: + logger.error("Index manager not initialized") + return False + + try: + index = self.index_builder.load_index(self.index_path) + if index: + logger.info(f"Loaded index with {len(index['symbols'])} symbols") + return True + else: + logger.warning("No existing index found") + return False + + except Exception as e: + logger.error(f"Failed to load index: {e}") + return False + + def refresh_index(self) -> bool: + """Refresh the index (rebuild and reload).""" + with self._lock: + logger.info("Refreshing index...") + if self.build_index(force_rebuild=True): + return self.load_index() + return False + + def find_files(self, pattern: str = "*") -> List[str]: + """Find files matching a pattern.""" + with self._lock: + if not self.index_builder or not self.index_builder.in_memory_index: + logger.warning("Index not loaded") + return [] + + try: + files = list(self.index_builder.in_memory_index["files"].keys()) + + if pattern == "*": + return files + + # Simple pattern matching + import fnmatch + return [f for f in files if fnmatch.fnmatch(f, pattern)] + + except Exception as e: + logger.error(f"Error finding files: {e}") + return [] + + def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: + """Get summary information for a file.""" + with self._lock: + # Auto-initialize if not ready but project path can be inferred + if not self.index_builder or not self.index_builder.in_memory_index: + if not self._auto_initialize_from_context(): + logger.warning("Index not loaded and cannot auto-initialize") + return None + + try: + # Normalize file path + file_path = file_path.replace('\\', '/') + if file_path.startswith('./'): + file_path = file_path[2:] + + # Get file info + file_info = self.index_builder.in_memory_index["files"].get(file_path) + if not file_info: + logger.warning(f"File not found in index: {file_path}") + return None + + # Get symbols in file + symbols = self.index_builder.get_file_symbols(file_path) + + # Categorize symbols by signature + functions = [] + classes = [] + methods = [] + + for s in symbols: + signature = s.get("signature", "") + if signature: + if signature.startswith("def ") and "::" in signature: + # Method: contains class context + methods.append(s) + elif signature.startswith("def "): + # Function: starts with def but no class context + functions.append(s) + elif signature.startswith("class ") or signature is None: + # Class: starts with class or has no signature + classes.append(s) + else: + # Default to function for unknown signatures + functions.append(s) + else: + # No signature - try to infer from name patterns or default to function + name = s.get("name", "") + if name and name[0].isupper(): + # Capitalized names are likely classes + classes.append(s) + else: + # Default to function + functions.append(s) + + return { + "file_path": file_path, + "language": file_info["language"], + "line_count": file_info["line_count"], + "symbol_count": len(symbols), + "functions": functions, + "classes": classes, + "methods": methods, + "imports": file_info.get("imports", []), + "exports": file_info.get("exports", []) + } + + except Exception as e: + logger.error(f"Error getting file summary: {e}") + return None + + def search_symbols(self, query: str, symbol_type: Optional[str] = None) -> List[Dict[str, Any]]: + """Search for symbols by name.""" + with self._lock: + if not self.index_builder or not self.index_builder.in_memory_index: + logger.warning("Index not loaded") + return [] + + try: + results = [] + query_lower = query.lower() + + for symbol_id, symbol_data in self.index_builder.in_memory_index["symbols"].items(): + # Filter by type if specified + if symbol_type and symbol_data.get("type") != symbol_type: + continue + + # Check if query matches symbol name + if query_lower in symbol_id.lower(): + results.append({ + "id": symbol_id, + **symbol_data + }) + + return results[:50] # Limit results + + except Exception as e: + logger.error(f"Error searching symbols: {e}") + return [] + + def get_symbol_callers(self, symbol_name: str) -> List[str]: + """Get all symbols that call the given symbol.""" + with self._lock: + if not self.index_builder: + return [] + + return self.index_builder.get_callers(symbol_name) + + def get_index_stats(self) -> Dict[str, Any]: + """Get statistics about the current index.""" + with self._lock: + if not self.index_builder or not self.index_builder.in_memory_index: + return {"status": "not_loaded"} + + try: + index = self.index_builder.in_memory_index + metadata = index["metadata"] + + symbol_counts = {} + for symbol_data in index["symbols"].values(): + symbol_type = symbol_data.get("type", "unknown") + symbol_counts[symbol_type] = symbol_counts.get(symbol_type, 0) + 1 + + return { + "status": "loaded", + "project_path": metadata["project_path"], + "indexed_files": metadata["indexed_files"], + "total_symbols": len(index["symbols"]), + "symbol_types": symbol_counts, + "languages": metadata["languages"], + "index_version": metadata["index_version"], + "timestamp": metadata["timestamp"] + } + + except Exception as e: + logger.error(f"Error getting index stats: {e}") + return {"status": "error", "error": str(e)} + + def _is_index_fresh(self) -> bool: + """Check if the current index is fresh.""" + if not self.index_path or not os.path.exists(self.index_path): + return False + + try: + # Simple freshness check - index exists and is recent + index_mtime = os.path.getmtime(self.index_path) + + # Check if any source files are newer than index + for root, dirs, files in os.walk(self.project_path): + # Skip excluded directories + dirs[:] = [d for d in dirs if d not in {'.git', '__pycache__', 'node_modules', '.venv'}] + + for file in files: + if any(file.endswith(ext) for ext in ['.py', '.js', '.ts', '.java']): + file_path = os.path.join(root, file) + if os.path.getmtime(file_path) > index_mtime: + return False + + return True + + except Exception as e: + logger.warning(f"Error checking index freshness: {e}") + return False + + def _auto_initialize_from_context(self) -> bool: + """ + Auto-initialize from the most recent project context. + This handles the case where MCP tools run in separate processes. + """ + try: + import glob + import tempfile + + # Find the most recent index file + pattern = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, "*", INDEX_FILE) + index_files = glob.glob(pattern) + + if not index_files: + logger.debug("No index files found for auto-initialization") + return False + + # Get the most recently modified index + latest_file = max(index_files, key=os.path.getmtime) + logger.info(f"Auto-initializing from latest index: {latest_file}") + + # Extract project path from the index + with open(latest_file, 'r', encoding='utf-8') as f: + import json + index_data = json.load(f) + project_path = index_data.get('metadata', {}).get('project_path') + + if not project_path or not os.path.exists(project_path): + logger.warning(f"Invalid project path in index: {project_path}") + return False + + # Initialize with this project path + if self.set_project_path(project_path): + return self.load_index() + + return False + + except Exception as e: + logger.warning(f"Auto-initialization failed: {e}") + return False + + def cleanup(self): + """Clean up resources.""" + with self._lock: + self.project_path = None + self.index_builder = None + self.temp_dir = None + self.index_path = None + logger.info("Cleaned up JSON Index Manager") + + +# Global instance +_index_manager = JSONIndexManager() + + +def get_index_manager() -> JSONIndexManager: + """Get the global index manager instance.""" + return _index_manager \ No newline at end of file diff --git a/src/code_index_mcp/indexing/models/__init__.py b/src/code_index_mcp/indexing/models/__init__.py new file mode 100644 index 0000000..b120a34 --- /dev/null +++ b/src/code_index_mcp/indexing/models/__init__.py @@ -0,0 +1,8 @@ +""" +Model classes for the indexing system. +""" + +from .symbol_info import SymbolInfo +from .file_info import FileInfo + +__all__ = ['SymbolInfo', 'FileInfo'] \ No newline at end of file diff --git a/src/code_index_mcp/indexing/models/file_info.py b/src/code_index_mcp/indexing/models/file_info.py new file mode 100644 index 0000000..0678774 --- /dev/null +++ b/src/code_index_mcp/indexing/models/file_info.py @@ -0,0 +1,24 @@ +""" +FileInfo model for representing file metadata. +""" + +from dataclasses import dataclass +from typing import Dict, List, Optional, Any + + +@dataclass +class FileInfo: + """Information about a source code file.""" + + language: str # programming language + line_count: int # total lines in file + symbols: Dict[str, List[str]] # symbol categories (functions, classes, etc.) + imports: List[str] # imported modules/packages + exports: Optional[List[str]] = None # exported symbols (for JS/TS modules) + package: Optional[str] = None # package name (for Java, Go, etc.) + docstring: Optional[str] = None # file-level documentation + + def __post_init__(self): + """Initialize mutable defaults.""" + if self.exports is None: + self.exports = [] \ No newline at end of file diff --git a/src/code_index_mcp/indexing/models/symbol_info.py b/src/code_index_mcp/indexing/models/symbol_info.py new file mode 100644 index 0000000..1659330 --- /dev/null +++ b/src/code_index_mcp/indexing/models/symbol_info.py @@ -0,0 +1,23 @@ +""" +SymbolInfo model for representing code symbols. +""" + +from dataclasses import dataclass +from typing import Optional, List + + +@dataclass +class SymbolInfo: + """Information about a code symbol (function, class, method, etc.).""" + + type: str # function, class, method, interface, etc. + file: str # file path where symbol is defined + line: int # line number where symbol starts + signature: Optional[str] = None # function/method signature + docstring: Optional[str] = None # documentation string + called_by: Optional[List[str]] = None # list of symbols that call this symbol + + def __post_init__(self): + """Initialize mutable defaults.""" + if self.called_by is None: + self.called_by = [] \ No newline at end of file diff --git a/src/code_index_mcp/indexing/scip_builder.py b/src/code_index_mcp/indexing/scip_builder.py deleted file mode 100644 index 0a58e13..0000000 --- a/src/code_index_mcp/indexing/scip_builder.py +++ /dev/null @@ -1,260 +0,0 @@ -"""SCIP Index Builder - main orchestrator for SCIP-based indexing.""" - -import os -import fnmatch -import pathspec -import logging -from pathlib import Path -from datetime import datetime -from typing import List, Dict, Any, Optional, Tuple -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass, field - -from ..scip.language_manager import SCIPLanguageManager, LanguageNotSupportedException -from ..scip.proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -@dataclass -class ValidationResult: - """Result of SCIP index validation.""" - is_valid: bool - errors: List[str] = field(default_factory=list) - warnings: List[str] = field(default_factory=list) - - -@dataclass -class ScanResult: - """Result of a project scan.""" - file_list: List[Dict[str, Any]] - project_metadata: Dict[str, Any] - - -class SCIPIndexBuilder: - """Main builder class that orchestrates SCIP-based indexing with new language manager.""" - - def __init__(self, max_workers: Optional[int] = None): - self.max_workers = max_workers - self.language_manager: Optional[SCIPLanguageManager] = None - self.project_path = "" - - def build_scip_index(self, project_path: str) -> scip_pb2.Index: - """Build complete SCIP index for a project.""" - start_time = datetime.now() - self.project_path = project_path - - # Initialize language manager for this project - self.language_manager = SCIPLanguageManager(project_path) - - logger.info("🚀 Starting SCIP index build for project: %s", project_path) - logger.debug("Build configuration: max_workers=%s", self.max_workers) - - try: - logger.info("📁 Phase 1: Scanning project files...") - # Phase 1: scan files - scan_result = self._scan_project_files(project_path) - total_files_considered = len(scan_result.file_list) - logger.info("✅ File scan completed, found %d valid files", total_files_considered) - - logger.info("🏷️ Phase 2: Analyzing language distribution...") - file_paths = [str(f['path']) for f in scan_result.file_list] - language_stats = self.language_manager.get_language_statistics(file_paths) - - for language, count in language_stats.items(): - logger.info(" 📋 %s: %d files", language, count) - logger.debug("Language analysis completed") - - logger.info("⚙️ Phase 3: Processing files with language manager...") - # Use the new language manager to create the complete index directly - scip_index = self.language_manager.create_complete_index(file_paths) - logger.info("✅ File processing completed, generated %d documents", len(scip_index.documents)) - - logger.info("🔗 Phase 4: Adding metadata...") - self._add_build_metadata(scip_index, scan_result, start_time) - logger.debug("Metadata addition completed") - - logger.info("🎉 SCIP index build completed successfully") - - logger.info("🔍 Phase 5: Validating SCIP index...") - validation_result = self._validate_scip_index(scip_index) - if not validation_result.is_valid: - logger.warning("⚠️ Index validation found issues: %s", validation_result.errors) - else: - logger.info("✅ Index validation passed") - - return scip_index - except Exception as e: - logger.error("❌ SCIP index build failed: %s", e, exc_info=True) - return self._create_fallback_scip_index(project_path, str(e)) - - def _add_build_metadata(self, scip_index: scip_pb2.Index, scan_result: ScanResult, start_time: datetime) -> None: - """Add build metadata to the SCIP index.""" - build_time = datetime.now() - start_time - - # Add tool info to metadata if not already present - if not scip_index.metadata.tool_info.name: - scip_index.metadata.tool_info.name = "code-index-mcp" - scip_index.metadata.tool_info.version = "2.1.0" # Version with new architecture - - # Add project info - if not scip_index.metadata.project_root: - scip_index.metadata.project_root = self.project_path - - logger.debug(f"Added build metadata: {len(scip_index.documents)} documents, build time: {build_time}") - - def _create_fallback_scip_index(self, project_path: str, error_message: str) -> scip_pb2.Index: - """Create a minimal fallback SCIP index when build fails.""" - logger.warning("Creating fallback SCIP index due to error: %s", error_message) - - try: - # Use fallback language manager - fallback_manager = SCIPLanguageManager(project_path) - fallback_factory = fallback_manager.get_factory('fallback') - - # Create minimal index with just metadata - index = scip_pb2.Index() - index.metadata.CopyFrom(fallback_factory.create_metadata(project_path)) - - # Add error document - error_doc = scip_pb2.Document() - error_doc.relative_path = "BUILD_ERROR.md" - error_doc.language = "markdown" - error_doc.text = f"# Build Error\n\nSCIP indexing failed: {error_message}\n" - index.documents.append(error_doc) - - logger.info("Created fallback SCIP index with basic metadata") - return index - - except Exception as e: - logger.error(f"Failed to create fallback index: {e}") - # Return completely empty index as last resort - return scip_pb2.Index() - - def _scan_project_files(self, project_path: str) -> ScanResult: - """Scan project directory to get a list of files and metadata.""" - logger.debug("📂 Starting file system scan of: %s", project_path) - files = [] - - # Use project settings for exclude patterns - logger.debug("🚫 Loading exclude patterns...") - default_exclude = self._get_default_exclude_patterns() - gitignore_spec = self._load_gitignore_patterns(project_path) - - total_scanned = 0 - excluded_count = 0 - included_count = 0 - - try: - for root, dirs, filenames in os.walk(project_path): - total_scanned += len(filenames) - - # Filter directories to skip excluded ones - dirs[:] = [d for d in dirs if not any(pattern in d for pattern in default_exclude)] - - for filename in filenames: - file_path = os.path.join(root, filename) - - # Check default exclude patterns - if any(pattern in file_path for pattern in default_exclude): - excluded_count += 1 - continue - - # Check gitignore patterns - if self._is_gitignored(file_path, project_path, gitignore_spec): - excluded_count += 1 - continue - - # Include file - file_info = { - 'path': Path(file_path), - 'relative_path': os.path.relpath(file_path, project_path), - 'size': os.path.getsize(file_path), - 'extension': os.path.splitext(filename)[1].lower() - } - files.append(file_info) - included_count += 1 - - except Exception as e: - logger.error("❌ File scan failed: %s", e) - raise - - logger.debug("📊 File scan results: %d total, %d included, %d excluded", - total_scanned, included_count, excluded_count) - - project_metadata = { - 'project_path': project_path, - 'project_name': os.path.basename(project_path), - 'total_files_scanned': total_scanned, - 'files_included': included_count, - 'files_excluded': excluded_count, - 'scan_timestamp': datetime.now().isoformat() - } - - return ScanResult(file_list=files, project_metadata=project_metadata) - - def _get_default_exclude_patterns(self) -> set: - """Get default patterns to exclude from indexing.""" - return {'.git', '.svn', '.hg', '__pycache__', 'node_modules', '.venv', 'venv', - 'build', 'dist', 'target', '.idea', '.vscode'} - - def _load_gitignore_patterns(self, project_path: str): - """Load patterns from .gitignore file using pathspec.""" - gitignore_path = os.path.join(project_path, '.gitignore') - - if os.path.exists(gitignore_path): - try: - with open(gitignore_path, 'r', encoding='utf-8') as f: - spec = pathspec.PathSpec.from_lines('gitignorestyle', f) - return spec - except Exception: - logger.debug("Failed to load .gitignore via pathspec") - return None - - return None - - def _is_gitignored(self, file_path: str, project_path: str, gitignore_spec) -> bool: - """Check if a file or directory is ignored by .gitignore patterns using pathspec.""" - if not gitignore_spec: - return False - - try: - # Get relative path from project root - rel_path = os.path.relpath(file_path, project_path) - # Normalize path separators for cross-platform compatibility - rel_path = rel_path.replace('\\', '/') - - return gitignore_spec.match_file(rel_path) - except Exception: - return False - - def _validate_scip_index(self, scip_index: scip_pb2.Index) -> ValidationResult: - """Validate the completed SCIP index.""" - errors, warnings = [], [] - - if not scip_index.metadata.project_root: - errors.append("Missing project_root in metadata") - if not scip_index.documents: - warnings.append("No documents in SCIP index") - - for i, doc in enumerate(scip_index.documents): - if not doc.relative_path: - errors.append(f"Document {i} missing relative_path") - if not doc.language: - warnings.append(f"Document {i} ({doc.relative_path}) missing language") - - if not scip_index.metadata.tool_info.name: - warnings.append("Missing tool name in metadata") - - return ValidationResult(is_valid=not errors, errors=errors, warnings=warnings) - - def get_language_summary(self) -> Dict[str, Any]: - """Get a summary of available languages.""" - if not self.language_manager: - return {"error": "Language manager not initialized"} - - return { - 'supported_languages': list(self.language_manager.get_supported_languages()), - 'project_path': self.project_path - } \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/__init__.py b/src/code_index_mcp/indexing/strategies/__init__.py new file mode 100644 index 0000000..0f51274 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/__init__.py @@ -0,0 +1,8 @@ +""" +Parsing strategies for different programming languages. +""" + +from .base_strategy import ParsingStrategy +from .strategy_factory import StrategyFactory + +__all__ = ['ParsingStrategy', 'StrategyFactory'] \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/base_strategy.py b/src/code_index_mcp/indexing/strategies/base_strategy.py new file mode 100644 index 0000000..531478c --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/base_strategy.py @@ -0,0 +1,91 @@ +""" +Abstract base class for language parsing strategies. +""" + +import os +from abc import ABC, abstractmethod +from typing import Dict, List, Tuple, Optional +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + + +class ParsingStrategy(ABC): + """Abstract base class for language parsing strategies.""" + + @abstractmethod + def get_language_name(self) -> str: + """Return the language name this strategy handles.""" + pass + + @abstractmethod + def get_supported_extensions(self) -> List[str]: + """Return list of file extensions this strategy supports.""" + pass + + @abstractmethod + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """ + Parse file content and extract symbols. + + Args: + file_path: Path to the file being parsed + content: File content as string + + Returns: + Tuple of (symbols_dict, file_info) + - symbols_dict: Maps symbol_id -> SymbolInfo + - file_info: FileInfo with metadata about the file + """ + pass + + def _create_symbol_id(self, file_path: str, symbol_name: str) -> str: + """ + Create a unique symbol ID. + + Args: + file_path: Path to the file containing the symbol + symbol_name: Name of the symbol + + Returns: + Unique symbol identifier in format "relative_path::symbol_name" + """ + relative_path = self._get_relative_path(file_path) + return f"{relative_path}::{symbol_name}" + + def _get_relative_path(self, file_path: str) -> str: + """Convert absolute file path to relative path.""" + parts = file_path.replace('\\', '/').split('/') + + # Priority order: test > src (outermost project roots first) + for root_dir in ['test', 'src']: + if root_dir in parts: + root_index = parts.index(root_dir) + relative_parts = parts[root_index:] + return '/'.join(relative_parts) + + # Fallback: use just filename + return os.path.basename(file_path) + + def _extract_line_number(self, content: str, symbol_position: int) -> int: + """ + Extract line number from character position in content. + + Args: + content: File content + symbol_position: Character position in content + + Returns: + Line number (1-based) + """ + return content[:symbol_position].count('\n') + 1 + + def _get_file_name(self, file_path: str) -> str: + """Get just the filename from a full path.""" + return os.path.basename(file_path) + + def _safe_extract_text(self, content: str, start: int, end: int) -> str: + """Safely extract text from content, handling bounds.""" + try: + return content[start:end].strip() + except (IndexError, TypeError): + return "" \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/fallback_strategy.py b/src/code_index_mcp/indexing/strategies/fallback_strategy.py new file mode 100644 index 0000000..01d7135 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/fallback_strategy.py @@ -0,0 +1,47 @@ +""" +Fallback parsing strategy for unsupported languages and file types. +""" + +import os +from typing import Dict, List, Tuple +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + + +class FallbackParsingStrategy(ParsingStrategy): + """Fallback parser for unsupported languages and file types.""" + + def __init__(self, language_name: str = "unknown"): + self.language_name = language_name + + def get_language_name(self) -> str: + return self.language_name + + def get_supported_extensions(self) -> List[str]: + return [] # Fallback supports any extension + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Basic parsing: extract file information without symbol parsing.""" + symbols = {} + + # For document files, we can at least index their existence + file_info = FileInfo( + language=self.language_name, + line_count=len(content.splitlines()), + symbols={"functions": [], "classes": []}, + imports=[] + ) + + # For document files (e.g. .md, .txt, .json), we can add a symbol representing the file itself + if self.language_name in ['markdown', 'text', 'json', 'yaml', 'xml', 'config', 'css', 'html']: + filename = os.path.basename(file_path) + symbol_id = self._create_symbol_id(file_path, f"file:{filename}") + symbols[symbol_id] = SymbolInfo( + type="file", + file=file_path, + line=1, + signature=f"{self.language_name} file: {filename}" + ) + + return symbols, file_info \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/go_strategy.py b/src/code_index_mcp/indexing/strategies/go_strategy.py new file mode 100644 index 0000000..2116ee7 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/go_strategy.py @@ -0,0 +1,162 @@ +""" +Go parsing strategy using regex patterns. +""" + +import re +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + + +class GoParsingStrategy(ParsingStrategy): + """Go-specific parsing strategy using regex patterns.""" + + def get_language_name(self) -> str: + return "go" + + def get_supported_extensions(self) -> List[str]: + return ['.go'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Go file using regex patterns.""" + symbols = {} + functions = [] + classes = [] # Go doesn't have classes, but we'll track structs/interfaces + imports = [] + package = None + + lines = content.splitlines() + + for i, line in enumerate(lines): + line = line.strip() + + # Package declaration + if line.startswith('package '): + package = line.split('package ')[1].strip() + + # Import statements + elif line.startswith('import '): + import_match = re.search(r'import\s+"([^"]+)"', line) + if import_match: + imports.append(import_match.group(1)) + + # Function declarations + elif line.startswith('func '): + func_match = re.match(r'func\s+(\w+)\s*\(', line) + if func_match: + func_name = func_match.group(1) + symbol_id = self._create_symbol_id(file_path, func_name) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(func_name) + + # Method declarations (func (receiver) methodName) + method_match = re.match(r'func\s+\([^)]+\)\s+(\w+)\s*\(', line) + if method_match: + method_name = method_match.group(1) + symbol_id = self._create_symbol_id(file_path, method_name) + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(method_name) + + # Struct declarations + elif re.match(r'type\s+\w+\s+struct\s*\{', line): + struct_match = re.match(r'type\s+(\w+)\s+struct', line) + if struct_match: + struct_name = struct_match.group(1) + symbol_id = self._create_symbol_id(file_path, struct_name) + symbols[symbol_id] = SymbolInfo( + type="struct", + file=file_path, + line=i + 1 + ) + classes.append(struct_name) + + # Interface declarations + elif re.match(r'type\s+\w+\s+interface\s*\{', line): + interface_match = re.match(r'type\s+(\w+)\s+interface', line) + if interface_match: + interface_name = interface_match.group(1) + symbol_id = self._create_symbol_id(file_path, interface_name) + symbols[symbol_id] = SymbolInfo( + type="interface", + file=file_path, + line=i + 1 + ) + classes.append(interface_name) + + # Phase 2: Add call relationship analysis + self._analyze_go_calls(content, symbols, file_path) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(lines), + symbols={"functions": functions, "classes": classes}, + imports=imports, + package=package + ) + + return symbols, file_info + + def _analyze_go_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze Go function calls for relationships.""" + lines = content.splitlines() + current_function = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Track current function context + if line.startswith('func '): + func_name = self._extract_go_function_name(line) + if func_name: + current_function = self._create_symbol_id(file_path, func_name) + + # Find function calls: functionName() or obj.methodName() + if current_function and ('(' in line and ')' in line): + called_functions = self._extract_go_called_functions(line) + for called_func in called_functions: + # Find the called function in symbols and add relationship + for symbol_id, symbol_info in symbols.items(): + if called_func in symbol_id.split("::")[-1]: + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + + def _extract_go_function_name(self, line: str) -> Optional[str]: + """Extract function name from Go function declaration.""" + try: + # func functionName(...) or func (receiver) methodName(...) + import re + match = re.match(r'func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(', line) + if match: + return match.group(1) + except: + pass + return None + + def _extract_go_called_functions(self, line: str) -> List[str]: + """Extract function names that are being called in this line.""" + import re + called_functions = [] + + # Find patterns like: functionName( or obj.methodName( + patterns = [ + r'(\w+)\s*\(', # functionName( + r'\.(\w+)\s*\(', # .methodName( + ] + + for pattern in patterns: + matches = re.findall(pattern, line) + called_functions.extend(matches) + + return called_functions \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/java_strategy.py b/src/code_index_mcp/indexing/strategies/java_strategy.py new file mode 100644 index 0000000..b258862 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/java_strategy.py @@ -0,0 +1,222 @@ +""" +Java parsing strategy using tree-sitter with regex fallback. +""" + +import logging +import re +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + +logger = logging.getLogger(__name__) + +try: + import tree_sitter + import tree_sitter_java + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + logger.warning("tree-sitter-java not available, using regex fallback") + + +class JavaParsingStrategy(ParsingStrategy): + """Java-specific parsing strategy.""" + + def __init__(self): + if TREE_SITTER_AVAILABLE: + self.java_language = tree_sitter.Language(tree_sitter_java.language()) + else: + self.java_language = None + + def get_language_name(self) -> str: + return "java" + + def get_supported_extensions(self) -> List[str]: + return ['.java'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Java file using tree-sitter or regex fallback.""" + if TREE_SITTER_AVAILABLE and self.java_language: + return self._tree_sitter_parse(file_path, content) + else: + return self._regex_parse(file_path, content) + + def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse using tree-sitter.""" + symbols = {} + functions = [] + classes = [] + imports = [] + package = None + + parser = tree_sitter.Parser(self.java_language) + + try: + tree = parser.parse(content.encode('utf8')) + # Phase 1: Extract symbol definitions + self._traverse_java_node(tree.root_node, content, file_path, symbols, functions, classes, imports) + # Phase 2: Analyze method calls and build relationships + self._analyze_java_calls(tree, content, symbols, file_path) + + # Extract package info + for node in tree.root_node.children: + if node.type == 'package_declaration': + package = self._extract_java_package(node, content) + break + except Exception as e: + logger.warning(f"Error parsing Java file {file_path}: {e}") + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(content.splitlines()), + symbols={"functions": functions, "classes": classes}, + imports=imports, + package=package + ) + + return symbols, file_info + + def _regex_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse using regex patterns.""" + symbols = {} + functions = [] + classes = [] + imports = [] + package = None + + lines = content.splitlines() + + for i, line in enumerate(lines): + line = line.strip() + + # Package declaration + if line.startswith('package '): + package = line.split('package ')[1].split(';')[0].strip() + + # Import statements + elif line.startswith('import '): + import_name = line.split('import ')[1].split(';')[0].strip() + imports.append(import_name) + + # Class declarations + elif re.match(r'(public\s+|private\s+|protected\s+)?(class|interface|enum)\s+\w+', line): + class_match = re.search(r'(class|interface|enum)\s+(\w+)', line) + if class_match: + class_name = class_match.group(2) + symbol_id = self._create_symbol_id(file_path, class_name) + symbols[symbol_id] = SymbolInfo( + type=class_match.group(1), # class, interface, or enum + file=file_path, + line=i + 1 + ) + classes.append(class_name) + + # Method declarations + elif re.match(r'\s*(public|private|protected).*\s+\w+\s*\(.*\)\s*\{?', line): + method_match = re.search(r'\s+(\w+)\s*\(', line) + if method_match: + method_name = method_match.group(1) + # Skip keywords like 'if', 'for', etc. + if method_name not in ['if', 'for', 'while', 'switch', 'try', 'catch']: + symbol_id = self._create_symbol_id(file_path, method_name) + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=i + 1, + signature=line.strip() + ) + functions.append(method_name) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(lines), + symbols={"functions": functions, "classes": classes}, + imports=imports, + package=package + ) + + return symbols, file_info + + def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + functions: List[str], classes: List[str], imports: List[str]): + """Traverse Java AST node.""" + if node.type == 'class_declaration': + name = self._get_java_class_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=node.start_point[0] + 1 + ) + classes.append(name) + + elif node.type == 'method_declaration': + name = self._get_java_method_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=node.start_point[0] + 1, + signature=self._get_java_method_signature(node, content) + ) + functions.append(name) + + # Continue traversing children + for child in node.children: + self._traverse_java_node(child, content, file_path, symbols, functions, classes, imports) + + def _get_java_class_name(self, node, content: str) -> Optional[str]: + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_java_method_name(self, node, content: str) -> Optional[str]: + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_java_method_signature(self, node, content: str) -> str: + return content[node.start_byte:node.end_byte].split('\n')[0].strip() + + def _extract_java_package(self, node, content: str) -> Optional[str]: + for child in node.children: + if child.type == 'scoped_identifier': + return content[child.start_byte:child.end_byte] + return None + + def _analyze_java_calls(self, tree, content: str, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze Java method calls for relationships.""" + self._find_java_calls(tree.root_node, content, symbols, file_path) + + def _find_java_calls(self, node, content: str, symbols: Dict[str, SymbolInfo], file_path: str, current_method: str = None): + """Recursively find Java method calls.""" + if node.type == 'method_declaration': + method_name = self._get_java_method_name(node, content) + if method_name: + current_method = self._create_symbol_id(file_path, method_name) + + elif node.type == 'method_invocation': + if current_method: + called_method = self._get_called_method_name(node, content) + if called_method: + # Find the called method in symbols and add relationship + for symbol_id, symbol_info in symbols.items(): + if called_method in symbol_id.split("::")[-1]: + if current_method not in symbol_info.called_by: + symbol_info.called_by.append(current_method) + + # Continue traversing children + for child in node.children: + self._find_java_calls(child, content, symbols, file_path, current_method) + + def _get_called_method_name(self, node, content: str) -> Optional[str]: + """Extract called method name from method invocation node.""" + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/javascript_strategy.py b/src/code_index_mcp/indexing/strategies/javascript_strategy.py new file mode 100644 index 0000000..dcdc970 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/javascript_strategy.py @@ -0,0 +1,353 @@ +""" +JavaScript parsing strategy using tree-sitter. +""" + +import logging +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + +logger = logging.getLogger(__name__) + +try: + import tree_sitter + import tree_sitter_javascript + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + logger.warning("tree-sitter not available, JavaScript parsing will be limited") + + +class JavaScriptParsingStrategy(ParsingStrategy): + """JavaScript-specific parsing strategy using tree-sitter.""" + + def __init__(self): + if TREE_SITTER_AVAILABLE: + self.js_language = tree_sitter.Language(tree_sitter_javascript.language()) + else: + self.js_language = None + + def get_language_name(self) -> str: + return "javascript" + + def get_supported_extensions(self) -> List[str]: + return ['.js', '.jsx', '.mjs', '.cjs'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse JavaScript file using tree-sitter.""" + symbols = {} + functions = [] + classes = [] + imports = [] + exports = [] + + if not TREE_SITTER_AVAILABLE or not self.js_language: + logger.info(f"Tree-sitter not available, using fallback for {file_path}") + return self._fallback_parse(file_path, content) + + try: + parser = tree_sitter.Parser(self.js_language) + tree = parser.parse(content.encode('utf8')) + self._traverse_js_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) + except Exception as e: + logger.warning(f"Error parsing JavaScript file {file_path}: {e}, falling back to regex parsing") + return self._fallback_parse(file_path, content) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(content.splitlines()), + symbols={"functions": functions, "classes": classes}, + imports=imports, + exports=exports + ) + + return symbols, file_info + + def _traverse_js_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + functions: List[str], classes: List[str], imports: List[str], exports: List[str]): + """Traverse JavaScript AST node.""" + if node.type == 'function_declaration': + name = self._get_function_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + signature = self._get_js_function_signature(node, content) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=node.start_point[0] + 1, + signature=signature + ) + functions.append(name) + + elif node.type == 'class_declaration': + name = self._get_class_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=node.start_point[0] + 1 + ) + classes.append(name) + + elif node.type == 'method_definition': + method_name = self._get_method_name(node, content) + class_name = self._find_parent_class(node, content) + if method_name and class_name: + symbol_id = self._create_symbol_id(file_path, f"{class_name}.{method_name}") + signature = self._get_js_function_signature(node, content) + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=node.start_point[0] + 1, + signature=signature + ) + + # Continue traversing children + for child in node.children: + self._traverse_js_node(child, content, file_path, symbols, functions, classes, imports, exports) + + def _get_function_name(self, node, content: str) -> Optional[str]: + """Extract function name from tree-sitter node.""" + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_class_name(self, node, content: str) -> Optional[str]: + """Extract class name from tree-sitter node.""" + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_method_name(self, node, content: str) -> Optional[str]: + """Extract method name from tree-sitter node.""" + for child in node.children: + if child.type == 'property_identifier': + return content[child.start_byte:child.end_byte] + return None + + def _find_parent_class(self, node, content: str) -> Optional[str]: + """Find the parent class of a method.""" + parent = node.parent + while parent: + if parent.type == 'class_declaration': + return self._get_class_name(parent, content) + parent = parent.parent + return None + + def _get_js_function_signature(self, node, content: str) -> str: + """Extract JavaScript function signature.""" + return content[node.start_byte:node.end_byte].split('\n')[0].strip() + + def _fallback_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Enhanced fallback parsing when tree-sitter is not available.""" + symbols = {} + functions = [] + classes = [] + imports = [] + + # Phase 1: Extract symbols using enhanced regex-based parsing + lines = content.splitlines() + current_class = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Import/require statements + if line.startswith('const ') and 'require(' in line: + import_name = self._extract_js_require(line) + if import_name: + imports.append(import_name) + elif line.startswith('import ') and ' from ' in line: + import_name = self._extract_js_import(line) + if import_name: + imports.append(import_name) + + # Class declarations + elif line.startswith('class '): + class_name = self._extract_js_class_name(line) + if class_name: + current_class = class_name + symbol_id = self._create_symbol_id(file_path, class_name) + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=i + 1 + ) + classes.append(class_name) + + # Function declarations (standalone) + elif line.startswith('function '): + func_name = self._extract_js_function_name(line) + if func_name: + symbol_id = self._create_symbol_id(file_path, func_name) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(func_name) + + # Method declarations (inside classes) - async method() { or method() { + elif current_class and (line.endswith('{') or '{' in line) and '(' in line and ')' in line: + method_name = self._extract_js_method_name(line) + if method_name and not line.startswith('//') and 'function' not in line: + symbol_id = self._create_symbol_id(file_path, f"{current_class}.{method_name}") + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=i + 1, + signature=line.replace('{', '').strip() + ) + functions.append(method_name) # Add to functions list for summary + + # Reset class context on closing brace (simplified) + elif line == '}' and current_class: + # Very basic heuristic - this could be improved + if original_line.strip() == '}' and i < len(lines) - 1: + current_class = None + + # Phase 2: Add call relationship analysis + self._analyze_js_calls(content, symbols, file_path) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(lines), + symbols={"functions": functions, "classes": classes}, + imports=imports + ) + + return symbols, file_info + + def _extract_js_function_name(self, line: str) -> Optional[str]: + """Extract function name from JavaScript function declaration.""" + try: + # function functionName(...) or function functionName(...) + parts = line.split('(')[0].split() + if len(parts) >= 2 and parts[0] == 'function': + return parts[1] + except: + pass + return None + + def _extract_js_class_name(self, line: str) -> Optional[str]: + """Extract class name from JavaScript class declaration.""" + try: + # class ClassName { or class ClassName extends ... + parts = line.split() + if len(parts) >= 2 and parts[0] == 'class': + class_name = parts[1] + # Remove any trailing characters like { or extends + if '{' in class_name: + class_name = class_name.split('{')[0] + if 'extends' in class_name: + class_name = class_name.split('extends')[0] + return class_name.strip() + except: + pass + return None + + def _extract_js_method_name(self, line: str) -> Optional[str]: + """Extract method name from JavaScript method declaration.""" + try: + # async methodName(params) { or methodName(params) { + line = line.strip() + if line.startswith('async '): + line = line[6:].strip() + + if '(' in line: + method_name = line.split('(')[0].strip() + # Remove access modifiers and keywords + for modifier in ['static', 'get', 'set']: + if method_name.startswith(modifier + ' '): + method_name = method_name[len(modifier):].strip() + + return method_name if method_name and method_name.replace('_', '').isalnum() else None + except: + pass + return None + + def _extract_js_require(self, line: str) -> Optional[str]: + """Extract module name from require statement.""" + try: + # const something = require('module') or require('module') + if 'require(' in line: + start = line.find("require('") + 9 + if start == 8: # require(" format + start = line.find('require("') + 9 + if start > 8: + end = line.find("'", start) + if end == -1: + end = line.find('"', start) + if end > start: + return line[start:end] + except: + pass + return None + + def _extract_js_import(self, line: str) -> Optional[str]: + """Extract module name from ES6 import statement.""" + try: + # import { something } from 'module' or import something from 'module' + if ' from ' in line: + module_part = line.split(' from ')[-1].strip() + module_name = module_part.strip('\'"').replace("'", "").replace('"', '').replace(';', '') + return module_name + except: + pass + return None + + def _analyze_js_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze JavaScript function calls for relationships.""" + lines = content.splitlines() + current_function = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Track current function context + if 'function ' in line or (line.endswith('{') and '(' in line and ')' in line): + func_name = self._extract_function_from_line(line) + if func_name: + current_function = self._create_symbol_id(file_path, func_name) + + # Find function calls: functionName() or obj.methodName() + if current_function and ('(' in line and ')' in line): + called_functions = self._extract_called_functions(line) + for called_func in called_functions: + # Find the called function in symbols and add relationship + for symbol_id, symbol_info in symbols.items(): + if called_func in symbol_id.split("::")[-1]: + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + + def _extract_function_from_line(self, line: str) -> Optional[str]: + """Extract function name from a line that defines a function.""" + if 'function ' in line: + return self._extract_js_function_name(line) + elif line.endswith('{') and '(' in line: + return self._extract_js_method_name(line) + return None + + def _extract_called_functions(self, line: str) -> List[str]: + """Extract function names that are being called in this line.""" + import re + called_functions = [] + + # Find patterns like: functionName( or obj.methodName( + patterns = [ + r'(\w+)\s*\(', # functionName( + r'\.(\w+)\s*\(', # .methodName( + ] + + for pattern in patterns: + matches = re.findall(pattern, line) + called_functions.extend(matches) + + return called_functions \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/objective_c_strategy.py b/src/code_index_mcp/indexing/strategies/objective_c_strategy.py new file mode 100644 index 0000000..c3e4a64 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/objective_c_strategy.py @@ -0,0 +1,157 @@ +""" +Objective-C parsing strategy using regex patterns. +""" + +import re +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + + +class ObjectiveCParsingStrategy(ParsingStrategy): + """Objective-C parsing strategy using regex patterns.""" + + def get_language_name(self) -> str: + return "objective-c" + + def get_supported_extensions(self) -> List[str]: + return ['.m', '.mm'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Objective-C file using regex patterns.""" + symbols = {} + functions = [] + classes = [] + imports = [] + + lines = content.splitlines() + current_class = None + + for i, line in enumerate(lines): + line = line.strip() + + # Import statements + if line.startswith('#import ') or line.startswith('#include '): + import_match = re.search(r'#(?:import|include)\s+[<"]([^>"]+)[>"]', line) + if import_match: + imports.append(import_match.group(1)) + + # Interface declarations + elif line.startswith('@interface '): + interface_match = re.match(r'@interface\s+(\w+)', line) + if interface_match: + class_name = interface_match.group(1) + current_class = class_name + symbol_id = self._create_symbol_id(file_path, class_name) + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=i + 1 + ) + classes.append(class_name) + + # Implementation declarations + elif line.startswith('@implementation '): + impl_match = re.match(r'@implementation\s+(\w+)', line) + if impl_match: + current_class = impl_match.group(1) + + # Method declarations + elif line.startswith(('- (', '+ (')): + method_match = re.search(r'[+-]\s*\([^)]+\)\s*(\w+)', line) + if method_match: + method_name = method_match.group(1) + full_name = f"{current_class}.{method_name}" if current_class else method_name + symbol_id = self._create_symbol_id(file_path, full_name) + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(full_name) + + # C function declarations + elif re.match(r'\w+.*\s+\w+\s*\([^)]*\)\s*\{?', line) and not line.startswith(('if', 'for', 'while')): + func_match = re.search(r'\s(\w+)\s*\([^)]*\)', line) + if func_match: + func_name = func_match.group(1) + symbol_id = self._create_symbol_id(file_path, func_name) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(func_name) + + # End of class + elif line == '@end': + current_class = None + + # Phase 2: Add call relationship analysis + self._analyze_objc_calls(content, symbols, file_path) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(lines), + symbols={"functions": functions, "classes": classes}, + imports=imports + ) + + return symbols, file_info + + def _analyze_objc_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze Objective-C method calls for relationships.""" + lines = content.splitlines() + current_function = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Track current method context + if line.startswith('- (') or line.startswith('+ ('): + func_name = self._extract_objc_method_name(line) + if func_name: + current_function = self._create_symbol_id(file_path, func_name) + + # Find method calls: [obj methodName] or functionName() + if current_function and ('[' in line and ']' in line or ('(' in line and ')' in line)): + called_functions = self._extract_objc_called_functions(line) + for called_func in called_functions: + # Find the called function in symbols and add relationship + for symbol_id, symbol_info in symbols.items(): + if called_func in symbol_id.split("::")[-1]: + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + + def _extract_objc_method_name(self, line: str) -> Optional[str]: + """Extract method name from Objective-C method declaration.""" + try: + # - (returnType)methodName:(params) or + (returnType)methodName + import re + match = re.search(r'[+-]\s*\([^)]*\)\s*(\w+)', line) + if match: + return match.group(1) + except: + pass + return None + + def _extract_objc_called_functions(self, line: str) -> List[str]: + """Extract method names that are being called in this line.""" + import re + called_functions = [] + + # Find patterns like: [obj methodName] or functionName( + patterns = [ + r'\[\s*\w+\s+(\w+)\s*[\]:]', # [obj methodName] + r'(\w+)\s*\(', # functionName( + ] + + for pattern in patterns: + matches = re.findall(pattern, line) + called_functions.extend(matches) + + return called_functions \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/python_strategy.py b/src/code_index_mcp/indexing/strategies/python_strategy.py new file mode 100644 index 0000000..89062bd --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/python_strategy.py @@ -0,0 +1,203 @@ +""" +Python parsing strategy using AST. +""" + +import ast +import logging +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + +logger = logging.getLogger(__name__) + + +class PythonParsingStrategy(ParsingStrategy): + """Python-specific parsing strategy using Python's built-in AST.""" + + def get_language_name(self) -> str: + return "python" + + def get_supported_extensions(self) -> List[str]: + return ['.py', '.pyw'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Python file using AST.""" + symbols = {} + functions = [] + classes = [] + imports = [] + + try: + tree = ast.parse(content) + # Phase 1: Extract symbol definitions + self._visit_ast_node(tree, symbols, functions, classes, imports, file_path, content) + # Phase 2: Analyze function calls and build relationships + self._analyze_calls(tree, symbols, file_path) + except SyntaxError as e: + logger.warning(f"Syntax error in Python file {file_path}: {e}") + except Exception as e: + logger.warning(f"Error parsing Python file {file_path}: {e}") + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(content.splitlines()), + symbols={"functions": functions, "classes": classes}, + imports=imports + ) + + return symbols, file_info + + def _visit_ast_node(self, node: ast.AST, symbols: Dict, functions: List, + classes: List, imports: List, file_path: str, content: str): + """Visit AST nodes and extract symbols.""" + for child in ast.walk(node): + if isinstance(child, ast.FunctionDef): + self._handle_function(child, symbols, functions, file_path) + elif isinstance(child, ast.ClassDef): + self._handle_class(child, symbols, classes, file_path) + elif isinstance(child, (ast.Import, ast.ImportFrom)): + self._handle_import(child, imports) + + def _handle_function(self, node: ast.FunctionDef, symbols: Dict, functions: List, file_path: str): + """Handle function definition.""" + func_name = node.name + symbol_id = self._create_symbol_id(file_path, func_name) + + # Extract function signature + signature = self._extract_function_signature(node) + + # Extract docstring + docstring = ast.get_docstring(node) + + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=node.lineno, + signature=signature, + docstring=docstring + ) + functions.append(func_name) + + def _handle_class(self, node: ast.ClassDef, symbols: Dict, classes: List, file_path: str): + """Handle class definition.""" + class_name = node.name + symbol_id = self._create_symbol_id(file_path, class_name) + + # Extract docstring + docstring = ast.get_docstring(node) + + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=node.lineno, + docstring=docstring + ) + classes.append(class_name) + + # Handle methods within the class + for child in node.body: + if isinstance(child, ast.FunctionDef): + method_name = f"{class_name}.{child.name}" + method_symbol_id = self._create_symbol_id(file_path, method_name) + + method_signature = self._extract_function_signature(child) + method_docstring = ast.get_docstring(child) + + symbols[method_symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=child.lineno, + signature=method_signature, + docstring=method_docstring + ) + + def _handle_import(self, node, imports: List): + """Handle import statements.""" + if isinstance(node, ast.Import): + for alias in node.names: + imports.append(alias.name) + elif isinstance(node, ast.ImportFrom): + if node.module: + for alias in node.names: + imports.append(f"{node.module}.{alias.name}") + + def _extract_function_signature(self, node: ast.FunctionDef) -> str: + """Extract function signature from AST node.""" + # Build basic signature + args = [] + + # Regular arguments + for arg in node.args.args: + args.append(arg.arg) + + # Varargs (*args) + if node.args.vararg: + args.append(f"*{node.args.vararg.arg}") + + # Keyword arguments (**kwargs) + if node.args.kwarg: + args.append(f"**{node.args.kwarg.arg}") + + signature = f"def {node.name}({', '.join(args)}):" + return signature + + def _analyze_calls(self, tree: ast.AST, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze function calls and build caller-callee relationships.""" + visitor = CallAnalysisVisitor(symbols, file_path) + visitor.visit(tree) + + +class CallAnalysisVisitor(ast.NodeVisitor): + """AST visitor to analyze function calls and build caller-callee relationships.""" + + def __init__(self, symbols: Dict[str, SymbolInfo], file_path: str): + self.symbols = symbols + self.file_path = file_path + self.current_function_stack = [] + + def visit_FunctionDef(self, node: ast.FunctionDef): + """Visit function definition and track context.""" + # Create symbol ID for this function using relative path + relative_path = self._get_relative_path(self.file_path) + function_id = f"{relative_path}::{node.name}" + self.current_function_stack.append(function_id) + + # Visit all child nodes within this function + self.generic_visit(node) + + # Pop the function from stack when done + self.current_function_stack.pop() + + def visit_Call(self, node: ast.Call): + """Visit function call and record relationship.""" + try: + # Get the function name being called + called_function = None + + if isinstance(node.func, ast.Name): + # Direct function call: function_name() + called_function = node.func.id + elif isinstance(node.func, ast.Attribute): + # Method call: obj.method() or module.function() + called_function = node.func.attr + + if called_function and self.current_function_stack: + # Get the current calling function + caller_function = self.current_function_stack[-1] + + # Look for the called function in our symbols and add relationship + for symbol_id, symbol_info in self.symbols.items(): + if (symbol_info.type in ["function", "method"] and + called_function in symbol_id.split("::")[-1]): # Match function name part + # Add caller to the called function's called_by list + if caller_function not in symbol_info.called_by: + symbol_info.called_by.append(caller_function) + break + except Exception: + # Silently handle parsing errors for complex call patterns + pass + + # Continue visiting child nodes + self.generic_visit(node) + diff --git a/src/code_index_mcp/indexing/strategies/strategy_factory.py b/src/code_index_mcp/indexing/strategies/strategy_factory.py new file mode 100644 index 0000000..4564138 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/strategy_factory.py @@ -0,0 +1,180 @@ +""" +Strategy factory for creating appropriate parsing strategies. +""" + +from typing import Dict, List +from .base_strategy import ParsingStrategy +from .python_strategy import PythonParsingStrategy +from .javascript_strategy import JavaScriptParsingStrategy +from .typescript_strategy import TypeScriptParsingStrategy +from .java_strategy import JavaParsingStrategy +from .go_strategy import GoParsingStrategy +from .objective_c_strategy import ObjectiveCParsingStrategy +from .zig_strategy import ZigParsingStrategy +from .fallback_strategy import FallbackParsingStrategy + + +class StrategyFactory: + """Factory for creating appropriate parsing strategies.""" + + def __init__(self): + # Initialize all strategies + self._strategies: Dict[str, ParsingStrategy] = {} + self._initialize_strategies() + + # File type mappings for fallback parser + self._file_type_mappings = { + # Web and markup + '.html': 'html', '.htm': 'html', + '.css': 'css', '.scss': 'css', '.sass': 'css', + '.less': 'css', '.stylus': 'css', '.styl': 'css', + '.md': 'markdown', '.mdx': 'markdown', + '.json': 'json', '.jsonc': 'json', + '.xml': 'xml', + '.yml': 'yaml', '.yaml': 'yaml', + + # Frontend frameworks + '.vue': 'vue', + '.svelte': 'svelte', + '.astro': 'astro', + + # Template engines + '.hbs': 'handlebars', '.handlebars': 'handlebars', + '.ejs': 'ejs', + '.pug': 'pug', + + # Database and SQL + '.sql': 'sql', '.ddl': 'sql', '.dml': 'sql', + '.mysql': 'sql', '.postgresql': 'sql', '.psql': 'sql', + '.sqlite': 'sql', '.mssql': 'sql', '.oracle': 'sql', + '.ora': 'sql', '.db2': 'sql', + '.proc': 'sql', '.procedure': 'sql', + '.func': 'sql', '.function': 'sql', + '.view': 'sql', '.trigger': 'sql', '.index': 'sql', + '.migration': 'sql', '.seed': 'sql', '.fixture': 'sql', + '.schema': 'sql', + '.cql': 'sql', '.cypher': 'sql', '.sparql': 'sql', + '.gql': 'graphql', + '.liquibase': 'sql', '.flyway': 'sql', + + # Config and text files + '.txt': 'text', + '.ini': 'config', '.cfg': 'config', '.conf': 'config', + '.toml': 'config', + '.properties': 'config', + '.env': 'config', + '.gitignore': 'config', + '.dockerignore': 'config', + '.editorconfig': 'config', + + # Other programming languages (will use fallback) + '.c': 'c', '.cpp': 'cpp', '.h': 'h', '.hpp': 'hpp', + '.cxx': 'cpp', '.cc': 'cpp', '.hxx': 'hpp', '.hh': 'hpp', + '.cs': 'csharp', + '.rb': 'ruby', + '.php': 'php', + '.swift': 'swift', + '.kt': 'kotlin', '.kts': 'kotlin', + '.rs': 'rust', + '.scala': 'scala', + '.sh': 'shell', '.bash': 'shell', '.zsh': 'shell', + '.ps1': 'powershell', + '.bat': 'batch', '.cmd': 'batch', + '.r': 'r', '.R': 'r', + '.pl': 'perl', '.pm': 'perl', + '.lua': 'lua', + '.dart': 'dart', + '.hs': 'haskell', + '.ml': 'ocaml', '.mli': 'ocaml', + '.fs': 'fsharp', '.fsx': 'fsharp', + '.clj': 'clojure', '.cljs': 'clojure', + '.vim': 'vim', + } + + def _initialize_strategies(self): + """Initialize all parsing strategies.""" + # Python + python_strategy = PythonParsingStrategy() + for ext in python_strategy.get_supported_extensions(): + self._strategies[ext] = python_strategy + + # JavaScript + js_strategy = JavaScriptParsingStrategy() + for ext in js_strategy.get_supported_extensions(): + self._strategies[ext] = js_strategy + + # TypeScript + ts_strategy = TypeScriptParsingStrategy() + for ext in ts_strategy.get_supported_extensions(): + self._strategies[ext] = ts_strategy + + # Java + java_strategy = JavaParsingStrategy() + for ext in java_strategy.get_supported_extensions(): + self._strategies[ext] = java_strategy + + # Go + go_strategy = GoParsingStrategy() + for ext in go_strategy.get_supported_extensions(): + self._strategies[ext] = go_strategy + + # Objective-C + objc_strategy = ObjectiveCParsingStrategy() + for ext in objc_strategy.get_supported_extensions(): + self._strategies[ext] = objc_strategy + + # Zig + zig_strategy = ZigParsingStrategy() + for ext in zig_strategy.get_supported_extensions(): + self._strategies[ext] = zig_strategy + + def get_strategy(self, file_extension: str) -> ParsingStrategy: + """ + Get appropriate strategy for file extension. + + Args: + file_extension: File extension (e.g., '.py', '.js') + + Returns: + Appropriate parsing strategy + """ + # Check for specialized strategies first + if file_extension in self._strategies: + return self._strategies[file_extension] + + # Use fallback strategy with appropriate language name + language_name = self._file_type_mappings.get(file_extension, 'unknown') + return FallbackParsingStrategy(language_name) + + def get_all_supported_extensions(self) -> List[str]: + """Get all supported extensions across strategies.""" + specialized = list(self._strategies.keys()) + fallback = list(self._file_type_mappings.keys()) + return specialized + fallback + + def get_specialized_extensions(self) -> List[str]: + """Get extensions that have specialized parsers.""" + return list(self._strategies.keys()) + + def get_fallback_extensions(self) -> List[str]: + """Get extensions that use fallback parsing.""" + return list(self._file_type_mappings.keys()) + + def get_strategy_info(self) -> Dict[str, List[str]]: + """Get information about available strategies.""" + info = {} + + # Group extensions by strategy type + for ext, strategy in self._strategies.items(): + strategy_name = strategy.get_language_name() + if strategy_name not in info: + info[strategy_name] = [] + info[strategy_name].append(ext) + + # Add fallback info + fallback_languages = set(self._file_type_mappings.values()) + for lang in fallback_languages: + extensions = [ext for ext, mapped_lang in self._file_type_mappings.items() if mapped_lang == lang] + info[f"fallback_{lang}"] = extensions + + return info \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/typescript_strategy.py b/src/code_index_mcp/indexing/strategies/typescript_strategy.py new file mode 100644 index 0000000..43be6f6 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/typescript_strategy.py @@ -0,0 +1,376 @@ +""" +TypeScript parsing strategy using tree-sitter. +""" + +import logging +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + +logger = logging.getLogger(__name__) + +try: + import tree_sitter + import tree_sitter_typescript + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + logger.warning("tree-sitter not available, TypeScript parsing will be limited") + + +class TypeScriptParsingStrategy(ParsingStrategy): + """TypeScript-specific parsing strategy using tree-sitter.""" + + def __init__(self): + if TREE_SITTER_AVAILABLE: + self.ts_language = tree_sitter.Language(tree_sitter_typescript.language_typescript()) + else: + self.ts_language = None + + def get_language_name(self) -> str: + return "typescript" + + def get_supported_extensions(self) -> List[str]: + return ['.ts', '.tsx'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse TypeScript file using tree-sitter.""" + symbols = {} + functions = [] + classes = [] + imports = [] + exports = [] + + if not TREE_SITTER_AVAILABLE or not self.ts_language: + logger.info(f"Tree-sitter not available, using fallback for {file_path}") + return self._fallback_parse(file_path, content) + + try: + parser = tree_sitter.Parser(self.ts_language) + tree = parser.parse(content.encode('utf8')) + self._traverse_ts_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) + except Exception as e: + logger.warning(f"Error parsing TypeScript file {file_path}: {e}, falling back to regex parsing") + return self._fallback_parse(file_path, content) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(content.splitlines()), + symbols={"functions": functions, "classes": classes}, + imports=imports, + exports=exports + ) + + return symbols, file_info + + def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + functions: List[str], classes: List[str], imports: List[str], exports: List[str]): + """Traverse TypeScript AST node.""" + if node.type == 'function_declaration': + name = self._get_function_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + signature = self._get_ts_function_signature(node, content) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=node.start_point[0] + 1, + signature=signature + ) + functions.append(name) + + elif node.type == 'class_declaration': + name = self._get_class_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=node.start_point[0] + 1 + ) + classes.append(name) + + elif node.type == 'interface_declaration': + name = self._get_interface_name(node, content) + if name: + symbol_id = self._create_symbol_id(file_path, name) + symbols[symbol_id] = SymbolInfo( + type="interface", + file=file_path, + line=node.start_point[0] + 1 + ) + classes.append(name) # Group interfaces with classes for simplicity + + elif node.type == 'method_definition': + method_name = self._get_method_name(node, content) + class_name = self._find_parent_class(node, content) + if method_name and class_name: + symbol_id = self._create_symbol_id(file_path, f"{class_name}.{method_name}") + signature = self._get_ts_function_signature(node, content) + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=node.start_point[0] + 1, + signature=signature + ) + + # Continue traversing children + for child in node.children: + self._traverse_ts_node(child, content, file_path, symbols, functions, classes, imports, exports) + + def _get_function_name(self, node, content: str) -> Optional[str]: + """Extract function name from tree-sitter node.""" + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_class_name(self, node, content: str) -> Optional[str]: + """Extract class name from tree-sitter node.""" + for child in node.children: + if child.type == 'identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_interface_name(self, node, content: str) -> Optional[str]: + """Extract interface name from tree-sitter node.""" + for child in node.children: + if child.type == 'type_identifier': + return content[child.start_byte:child.end_byte] + return None + + def _get_method_name(self, node, content: str) -> Optional[str]: + """Extract method name from tree-sitter node.""" + for child in node.children: + if child.type == 'property_identifier': + return content[child.start_byte:child.end_byte] + return None + + def _find_parent_class(self, node, content: str) -> Optional[str]: + """Find the parent class of a method.""" + parent = node.parent + while parent: + if parent.type in ['class_declaration', 'interface_declaration']: + return self._get_class_name(parent, content) or self._get_interface_name(parent, content) + parent = parent.parent + return None + + def _get_ts_function_signature(self, node, content: str) -> str: + """Extract TypeScript function signature.""" + return content[node.start_byte:node.end_byte].split('\n')[0].strip() + + def _fallback_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Fallback parsing when tree-sitter is not available.""" + symbols = {} + functions = [] + classes = [] + imports = [] + + # Phase 1: Extract symbols using regex-based parsing for TypeScript + lines = content.splitlines() + current_class = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Import statements + if line.startswith('import ') and ' from ' in line: + import_match = self._extract_ts_import(line) + if import_match: + imports.extend(import_match) + + # Class declarations + elif line.startswith('class '): + class_name = self._extract_ts_class_name(line) + if class_name: + current_class = class_name + symbol_id = self._create_symbol_id(file_path, class_name) + symbols[symbol_id] = SymbolInfo( + type="class", + file=file_path, + line=i + 1 + ) + classes.append(class_name) + + # Interface declarations + elif line.startswith('interface '): + interface_name = self._extract_ts_interface_name(line) + if interface_name: + symbol_id = self._create_symbol_id(file_path, interface_name) + symbols[symbol_id] = SymbolInfo( + type="interface", + file=file_path, + line=i + 1 + ) + classes.append(interface_name) + current_class = interface_name + + # Function declarations (standalone) + elif line.startswith('function ') or ' function ' in line: + func_name = self._extract_ts_function_name(line) + if func_name: + symbol_id = self._create_symbol_id(file_path, func_name) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(func_name) + + # Method declarations (inside classes) + elif current_class and ('(' in line and ')' in line and ':' in line): + method_name = self._extract_ts_method_name(line) + if method_name and not line.startswith('//') and 'function' not in line: + symbol_id = self._create_symbol_id(file_path, f"{current_class}.{method_name}") + symbols[symbol_id] = SymbolInfo( + type="method", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(method_name) # Add to functions list for summary + + # Reset class context on closing brace (simplified) + elif line == '}' and current_class: + current_class = None + + # Phase 2: Add call relationship analysis (similar to Python approach) + self._analyze_ts_calls(content, symbols, file_path) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(lines), + symbols={"functions": functions, "classes": classes}, + imports=imports + ) + + return symbols, file_info + + def _extract_ts_function_name(self, line: str) -> Optional[str]: + """Extract function name from TypeScript function declaration.""" + try: + # function functionName(...): ReturnType or function functionName(...) + if 'function ' in line: + parts = line.split('function ')[1].split('(')[0].strip() + return parts if parts and parts.isidentifier() else None + except: + pass + return None + + def _extract_ts_class_name(self, line: str) -> Optional[str]: + """Extract class name from TypeScript class declaration.""" + try: + # class ClassName { or class ClassName extends ... or class ClassName implements ... + parts = line.split() + if len(parts) >= 2 and parts[0] == 'class': + class_name = parts[1] + # Remove any trailing characters + for separator in ['{', 'extends', 'implements']: + if separator in class_name: + class_name = class_name.split(separator)[0] + return class_name.strip() + except: + pass + return None + + def _extract_ts_interface_name(self, line: str) -> Optional[str]: + """Extract interface name from TypeScript interface declaration.""" + try: + # interface InterfaceName { or interface InterfaceName extends ... + parts = line.split() + if len(parts) >= 2 and parts[0] == 'interface': + interface_name = parts[1] + # Remove any trailing characters + for separator in ['{', 'extends']: + if separator in interface_name: + interface_name = interface_name.split(separator)[0] + return interface_name.strip() + except: + pass + return None + + def _extract_ts_method_name(self, line: str) -> Optional[str]: + """Extract method name from TypeScript method declaration.""" + try: + # async methodName(params): ReturnType or methodName(params): ReturnType + line = line.strip() + if line.startswith('async '): + line = line[6:].strip() + + if '(' in line: + method_name = line.split('(')[0].strip() + # Remove access modifiers + for modifier in ['public', 'private', 'protected', 'static']: + if method_name.startswith(modifier + ' '): + method_name = method_name[len(modifier):].strip() + + return method_name if method_name and method_name.replace('_', '').isalnum() else None + except: + pass + return None + + def _extract_ts_import(self, line: str) -> List[str]: + """Extract imports from TypeScript import statement.""" + imports = [] + try: + # import { something } from 'module' or import something from 'module' + if ' from ' in line: + module_part = line.split(' from ')[-1].strip() + module_name = module_part.strip('\'"').replace("'", "").replace('"', '').replace(';', '') + imports.append(module_name) + except: + pass + return imports + + def _analyze_ts_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze TypeScript function calls for relationships.""" + lines = content.splitlines() + current_function = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Track current function context + if 'function ' in line or (': ' in line and '(' in line and ')' in line): + func_name = self._extract_function_from_line(line) + if func_name: + current_function = self._create_symbol_id(file_path, func_name) + + # Find function calls: functionName() or obj.methodName() + if current_function and ('(' in line and ')' in line): + called_functions = self._extract_called_functions(line) + for called_func in called_functions: + # Find the called function in symbols and add relationship + for symbol_id, symbol_info in symbols.items(): + if called_func in symbol_id.split("::")[-1]: + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + + def _extract_function_from_line(self, line: str) -> Optional[str]: + """Extract function name from a line that defines a function.""" + if 'function ' in line: + return self._extract_ts_function_name(line) + elif ': ' in line and '(' in line: + return self._extract_ts_method_name(line) + return None + + def _extract_called_functions(self, line: str) -> List[str]: + """Extract function names that are being called in this line.""" + import re + called_functions = [] + + # Find patterns like: functionName( or obj.methodName( + patterns = [ + r'(\w+)\s*\(', # functionName( + r'\.(\w+)\s*\(', # .methodName( + ] + + for pattern in patterns: + matches = re.findall(pattern, line) + called_functions.extend(matches) + + return called_functions \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/zig_strategy.py b/src/code_index_mcp/indexing/strategies/zig_strategy.py new file mode 100644 index 0000000..ca3f5f6 --- /dev/null +++ b/src/code_index_mcp/indexing/strategies/zig_strategy.py @@ -0,0 +1,179 @@ +""" +Zig parsing strategy using regex patterns with tree-sitter fallback. +""" + +import re +import logging +from typing import Dict, List, Tuple, Optional +from .base_strategy import ParsingStrategy +from ..models.symbol_info import SymbolInfo +from ..models.file_info import FileInfo + +logger = logging.getLogger(__name__) + +try: + import tree_sitter + import tree_sitter_zig + TREE_SITTER_AVAILABLE = True +except ImportError: + TREE_SITTER_AVAILABLE = False + logger.warning("tree-sitter-zig not available, using regex fallback") + + +class ZigParsingStrategy(ParsingStrategy): + """Zig parsing strategy using regex patterns with tree-sitter fallback.""" + + def __init__(self): + if TREE_SITTER_AVAILABLE: + self.zig_language = tree_sitter.Language(tree_sitter_zig.language()) + else: + self.zig_language = None + + def get_language_name(self) -> str: + return "zig" + + def get_supported_extensions(self) -> List[str]: + return ['.zig', '.zon'] + + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Zig file using regex patterns.""" + # For now, use regex parsing even if tree-sitter is available + # Tree-sitter-zig might not be stable yet + return self._regex_parse(file_path, content) + + def _regex_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Zig file using regex patterns.""" + symbols = {} + functions = [] + classes = [] # Zig uses structs, not classes + imports = [] + + lines = content.splitlines() + + for i, line in enumerate(lines): + line = line.strip() + + # Import statements (const x = @import(...)) + if '@import(' in line: + import_match = re.search(r'@import\("([^"]+)"\)', line) + if import_match: + imports.append(import_match.group(1)) + + # Function declarations (pub fn, fn) + elif re.match(r'(pub\s+)?fn\s+\w+', line): + func_match = re.match(r'(?:pub\s+)?fn\s+(\w+)', line) + if func_match: + func_name = func_match.group(1) + symbol_id = self._create_symbol_id(file_path, func_name) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=i + 1, + signature=line + ) + functions.append(func_name) + + # Struct declarations + elif re.match(r'const\s+\w+\s*=\s*struct\s*\{', line): + struct_match = re.match(r'const\s+(\w+)\s*=\s*struct', line) + if struct_match: + struct_name = struct_match.group(1) + symbol_id = self._create_symbol_id(file_path, struct_name) + symbols[symbol_id] = SymbolInfo( + type="struct", + file=file_path, + line=i + 1 + ) + classes.append(struct_name) + + # Union declarations + elif re.match(r'const\s+\w+\s*=\s*union', line): + union_match = re.match(r'const\s+(\w+)\s*=\s*union', line) + if union_match: + union_name = union_match.group(1) + symbol_id = self._create_symbol_id(file_path, union_name) + symbols[symbol_id] = SymbolInfo( + type="union", + file=file_path, + line=i + 1 + ) + classes.append(union_name) + + # Enum declarations + elif re.match(r'const\s+\w+\s*=\s*enum', line): + enum_match = re.match(r'const\s+(\w+)\s*=\s*enum', line) + if enum_match: + enum_name = enum_match.group(1) + symbol_id = self._create_symbol_id(file_path, enum_name) + symbols[symbol_id] = SymbolInfo( + type="enum", + file=file_path, + line=i + 1 + ) + classes.append(enum_name) + + # Phase 2: Add call relationship analysis + self._analyze_zig_calls(content, symbols, file_path) + + file_info = FileInfo( + language=self.get_language_name(), + line_count=len(lines), + symbols={"functions": functions, "classes": classes}, + imports=imports + ) + + return symbols, file_info + + def _analyze_zig_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): + """Analyze Zig function calls for relationships.""" + lines = content.splitlines() + current_function = None + + for i, line in enumerate(lines): + original_line = line + line = line.strip() + + # Track current function context + if line.startswith('fn '): + func_name = self._extract_zig_function_name(line) + if func_name: + current_function = self._create_symbol_id(file_path, func_name) + + # Find function calls: functionName() or obj.methodName() + if current_function and ('(' in line and ')' in line): + called_functions = self._extract_zig_called_functions(line) + for called_func in called_functions: + # Find the called function in symbols and add relationship + for symbol_id, symbol_info in symbols.items(): + if called_func in symbol_id.split("::")[-1]: + if current_function not in symbol_info.called_by: + symbol_info.called_by.append(current_function) + + def _extract_zig_function_name(self, line: str) -> Optional[str]: + """Extract function name from Zig function declaration.""" + try: + # fn functionName(...) or pub fn functionName(...) + import re + match = re.search(r'fn\s+(\w+)\s*\(', line) + if match: + return match.group(1) + except: + pass + return None + + def _extract_zig_called_functions(self, line: str) -> List[str]: + """Extract function names that are being called in this line.""" + import re + called_functions = [] + + # Find patterns like: functionName( or obj.methodName( + patterns = [ + r'(\w+)\s*\(', # functionName( + r'\.(\w+)\s*\(', # .methodName( + ] + + for pattern in patterns: + matches = re.findall(pattern, line) + called_functions.extend(matches) + + return called_functions \ No newline at end of file diff --git a/src/code_index_mcp/indexing/unified_index_manager.py b/src/code_index_mcp/indexing/unified_index_manager.py deleted file mode 100644 index 052f3ed..0000000 --- a/src/code_index_mcp/indexing/unified_index_manager.py +++ /dev/null @@ -1,433 +0,0 @@ -""" -统一索引管理器 - 提供项目索引的统一访问接口 - -这个模块实现了一个中央化的索引管理器,统一处理所有索引相关操作, -包括SCIP索引、遗留索引格式的兼容,以及内存缓存管理。 -""" - -import os -import logging -import time -from typing import Dict, Any, List, Optional, Union -from pathlib import Path - -from .index_provider import IIndexProvider, IIndexManager, IndexMetadata, SymbolInfo, FileInfo -from ..project_settings import ProjectSettings - -# Try to import SCIP proto, handle if not available -try: - from ..scip.proto.scip_pb2 import Index as SCIPIndex, Document as SCIPDocument - SCIP_PROTO_AVAILABLE = True -except ImportError: - SCIPIndex = None - SCIPDocument = None - SCIP_PROTO_AVAILABLE = False - -logger = logging.getLogger(__name__) - - -class UnifiedIndexManager: - """ - 统一索引管理器 - - 负责协调不同索引格式,提供统一的访问接口, - 并处理索引的生命周期管理。 - """ - - def __init__(self, project_path: str, settings: Optional[ProjectSettings] = None): - self.project_path = project_path - self.settings = settings or ProjectSettings(project_path) - - # 核心组件 - 延迟导入避免循环依赖 - self._scip_tool = None - self._current_provider: Optional[IIndexProvider] = None - self._metadata: Optional[IndexMetadata] = None - - # 状态管理 - self._is_initialized = False - self._last_check_time = 0 - self._check_interval = 30 # 30秒检查间隔 - - def _get_scip_tool(self): - """延迟导入SCIP工具以避免循环依赖""" - if self._scip_tool is None: - from ..tools.scip.scip_index_tool import SCIPIndexTool - self._scip_tool = SCIPIndexTool() - return self._scip_tool - - def initialize(self) -> bool: - """ - 初始化索引管理器 - - Returns: - True if initialization successful - """ - try: - # 1. 尝试加载现有索引 - if self._load_existing_index(): - logger.info("Successfully loaded existing index") - self._is_initialized = True - return True - - # 2. 如果没有现有索引,构建新索引 - if self._build_new_index(): - logger.info("Successfully built new index") - self._is_initialized = True - return True - - logger.warning("Failed to initialize index") - return False - - except Exception as e: - logger.error(f"Index initialization failed: {e}") - return False - - def get_provider(self) -> Optional[IIndexProvider]: - """ - 获取当前索引提供者 - - Returns: - 当前活跃的索引提供者,如果索引不可用则返回None - """ - if not self._is_initialized: - self.initialize() - - # 定期检查索引状态 - current_time = time.time() - if current_time - self._last_check_time > self._check_interval: - self._check_index_health() - self._last_check_time = current_time - - return self._current_provider - - def refresh_index(self, force: bool = False) -> bool: - """ - 刷新索引 - - Args: - force: 是否强制重建索引 - - Returns: - True if refresh successful - """ - try: - if force or self._needs_rebuild(): - return self._build_new_index() - else: - # 尝试增量更新 - return self._incremental_update() - except Exception as e: - logger.error(f"Index refresh failed: {e}") - return False - - def save_index(self) -> bool: - """ - 保存当前索引状态 - - Returns: - True if save successful - """ - try: - if self._current_provider and isinstance(self._current_provider, SCIPIndexProvider): - return self._get_scip_tool().save_index() - return False - except Exception as e: - logger.error(f"Index save failed: {e}") - return False - - def clear_index(self) -> None: - """清理索引状态""" - try: - if self._scip_tool: - self._scip_tool.clear_index() - self._current_provider = None - self._metadata = None - self._is_initialized = False - logger.info("Index cleared successfully") - except Exception as e: - logger.error(f"Index clear failed: {e}") - - def get_index_status(self) -> Dict[str, Any]: - """ - 获取索引状态信息 - - Returns: - 包含索引状态的字典 - """ - status = { - 'is_initialized': self._is_initialized, - 'is_available': self._current_provider is not None, - 'provider_type': type(self._current_provider).__name__ if self._current_provider else None, - 'metadata': self._metadata.__dict__ if self._metadata else None, - 'last_check': self._last_check_time - } - - if self._current_provider: - status['file_count'] = len(self._current_provider.get_file_list()) - - return status - - def _load_existing_index(self) -> bool: - """尝试加载现有索引""" - try: - # 1. 尝试SCIP索引 - scip_tool = self._get_scip_tool() - if scip_tool.load_existing_index(self.project_path): - self._current_provider = SCIPIndexProvider(scip_tool) - self._metadata = self._create_metadata_from_scip() - logger.info("Loaded SCIP index") - return True - - # 2. 尝试遗留索引(如果需要兼容) - legacy_data = self.settings.load_existing_index() - if legacy_data and self._is_valid_legacy_index(legacy_data): - self._current_provider = LegacyIndexProvider(legacy_data) - self._metadata = self._create_metadata_from_legacy(legacy_data) - logger.info("Loaded legacy index") - return True - - return False - - except Exception as e: - logger.error(f"Failed to load existing index: {e}") - return False - - def _build_new_index(self) -> bool: - """构建新索引""" - try: - scip_tool = self._get_scip_tool() - file_count = scip_tool.build_index(self.project_path) - if file_count > 0: - self._current_provider = SCIPIndexProvider(scip_tool) - self._metadata = self._create_metadata_from_scip() - - # 保存索引 - scip_tool.save_index() - - logger.info(f"Built new SCIP index with {file_count} files") - return True - - return False - - except Exception as e: - logger.error(f"Failed to build new index: {e}") - return False - - def _check_index_health(self) -> None: - """检查索引健康状态""" - if self._current_provider and not self._current_provider.is_available(): - logger.warning("Index provider became unavailable, attempting recovery") - self.initialize() - - def _needs_rebuild(self) -> bool: - """检查是否需要重建索引""" - if not self._metadata: - return True - - # 检查项目文件是否有更新 - try: - latest_mtime = 0 - for root, _, files in os.walk(self.project_path): - for file in files: - file_path = os.path.join(root, file) - mtime = os.path.getmtime(file_path) - latest_mtime = max(latest_mtime, mtime) - - return latest_mtime > self._metadata.last_updated - - except Exception: - return True # 如果检查失败,保守地重建 - - def _incremental_update(self) -> bool: - """增量更新索引(如果支持)""" - # 目前简化为完全重建 - # 在未来版本中可以实现真正的增量更新 - return self._build_new_index() - - def _create_metadata_from_scip(self) -> IndexMetadata: - """从SCIP索引创建元数据""" - scip_tool = self._get_scip_tool() - metadata_dict = scip_tool.get_project_metadata() - return IndexMetadata( - version="4.0-scip", - format_type="scip", - created_at=time.time(), - last_updated=time.time(), - file_count=metadata_dict.get('total_files', 0), - project_root=metadata_dict.get('project_root', self.project_path), - tool_version=metadata_dict.get('tool_version', 'unknown') - ) - - def _create_metadata_from_legacy(self, legacy_data: Dict[str, Any]) -> IndexMetadata: - """从遗留索引创建元数据""" - return IndexMetadata( - version="3.0-legacy", - format_type="legacy", - created_at=legacy_data.get('created_at', time.time()), - last_updated=legacy_data.get('last_updated', time.time()), - file_count=legacy_data.get('project_metadata', {}).get('total_files', 0), - project_root=self.project_path, - tool_version="legacy" - ) - - def _is_valid_legacy_index(self, index_data: Dict[str, Any]) -> bool: - """验证遗留索引是否有效""" - return ( - isinstance(index_data, dict) and - 'index_metadata' in index_data and - index_data.get('index_metadata', {}).get('version', '') >= '3.0' - ) - - -class SCIPIndexProvider: - """SCIP索引提供者实现""" - - def __init__(self, scip_tool): - self._scip_tool = scip_tool - - def get_file_list(self) -> List[FileInfo]: - return self._scip_tool.get_file_list() - - def get_file_info(self, file_path: str) -> Optional[FileInfo]: - file_list = self.get_file_list() - for file_info in file_list: - if file_info.relative_path == file_path: - return file_info - return None - - def query_symbols(self, file_path: str) -> List[SymbolInfo]: - # This method is deprecated - use CodeIntelligenceService for symbol analysis - return [] - - def search_files(self, pattern: str) -> List[FileInfo]: - # 延迟导入避免循环依赖 - from ..tools.filesystem.file_matching_tool import FileMatchingTool - matcher = FileMatchingTool() - return matcher.match_glob_pattern(self.get_file_list(), pattern) - - def get_metadata(self) -> IndexMetadata: - metadata_dict = self._scip_tool.get_project_metadata() - return IndexMetadata( - version="4.0-scip", - format_type="scip", - created_at=time.time(), - last_updated=time.time(), - file_count=metadata_dict.get('total_files', 0), - project_root=metadata_dict.get('project_root', ''), - tool_version=metadata_dict.get('tool_version', 'unknown') - ) - - def is_available(self) -> bool: - return self._scip_tool.is_index_available() - - -class LegacyIndexProvider: - """遗留索引提供者实现(兼容性支持)""" - - def __init__(self, legacy_data: Dict[str, Any]): - self._data = legacy_data - - def get_file_list(self) -> List[FileInfo]: - # 从遗留数据转换为标准格式 - files = [] - file_dict = self._data.get('files', {}) - - for file_path, file_data in file_dict.items(): - file_info = FileInfo( - relative_path=file_path, - language=file_data.get('language', 'unknown'), - absolute_path=file_data.get('absolute_path', '') - ) - files.append(file_info) - - return files - - def get_file_info(self, file_path: str) -> Optional[FileInfo]: - file_dict = self._data.get('files', {}) - if file_path in file_dict: - file_data = file_dict[file_path] - return FileInfo( - relative_path=file_path, - language=file_data.get('language', 'unknown'), - absolute_path=file_data.get('absolute_path', '') - ) - return None - - def query_symbols(self, file_path: str) -> List[SymbolInfo]: - # 遗留格式的符号信息有限,转换为标准格式 - file_dict = self._data.get('files', {}) - if file_path in file_dict: - legacy_symbols = file_dict[file_path].get('symbols', []) - symbols = [] - for symbol_data in legacy_symbols: - if isinstance(symbol_data, dict): - symbol = SymbolInfo( - name=symbol_data.get('name', ''), - kind=symbol_data.get('kind', 'unknown'), - location=symbol_data.get('location', {'line': 1, 'column': 1}), - scope=symbol_data.get('scope', 'global'), - documentation=symbol_data.get('documentation', []) - ) - symbols.append(symbol) - return symbols - return [] - - def search_files(self, pattern: str) -> List[FileInfo]: - import fnmatch - matched_files = [] - - for file_info in self.get_file_list(): - if fnmatch.fnmatch(file_info.relative_path, pattern): - matched_files.append(file_info) - - return matched_files - - def get_metadata(self) -> IndexMetadata: - meta = self._data.get('index_metadata', {}) - return IndexMetadata( - version=meta.get('version', '3.0-legacy'), - format_type="legacy", - created_at=meta.get('created_at', time.time()), - last_updated=meta.get('last_updated', time.time()), - file_count=len(self._data.get('files', {})), - project_root=meta.get('project_root', ''), - tool_version="legacy" - ) - - def is_available(self) -> bool: - return bool(self._data.get('files')) - - -# 全局索引管理器实例 -_global_index_manager: Optional[UnifiedIndexManager] = None - - -def get_unified_index_manager(project_path: str = None, settings: ProjectSettings = None) -> UnifiedIndexManager: - """ - 获取全局统一索引管理器实例 - - Args: - project_path: 项目路径(首次初始化时需要) - settings: 项目设置(可选) - - Returns: - UnifiedIndexManager实例 - """ - global _global_index_manager - - if _global_index_manager is None and project_path: - _global_index_manager = UnifiedIndexManager(project_path, settings) - - if _global_index_manager and project_path and _global_index_manager.project_path != project_path: - # 项目路径改变,重新创建管理器 - _global_index_manager = UnifiedIndexManager(project_path, settings) - - return _global_index_manager - - -def clear_global_index_manager() -> None: - """清理全局索引管理器""" - global _global_index_manager - if _global_index_manager: - _global_index_manager.clear_index() - _global_index_manager = None diff --git a/src/code_index_mcp/project_settings.py b/src/code_index_mcp/project_settings.py index ffbf1c1..d3c3965 100644 --- a/src/code_index_mcp/project_settings.py +++ b/src/code_index_mcp/project_settings.py @@ -13,16 +13,9 @@ from datetime import datetime -# SCIP protobuf import -try: - from .scip.proto.scip_pb2 import Index as SCIPIndex - SCIP_AVAILABLE = True -except ImportError: - SCIPIndex = None - SCIP_AVAILABLE = False from .constants import ( - SETTINGS_DIR, CONFIG_FILE, SCIP_INDEX_FILE, INDEX_FILE + SETTINGS_DIR, CONFIG_FILE, INDEX_FILE ) from .search.base import SearchStrategy from .search.ugrep import UgrepStrategy @@ -188,14 +181,6 @@ def get_config_path(self): else: return os.path.join(os.path.expanduser("~"), CONFIG_FILE) - def get_scip_index_path(self): - """Get the path to the SCIP index file""" - path = os.path.join(self.settings_path, SCIP_INDEX_FILE) - # Ensure directory exists - os.makedirs(os.path.dirname(path), exist_ok=True) - return path - - # get_cache_path method removed - no longer needed with new indexing system def _get_timestamp(self): """Get current timestamp""" @@ -346,133 +331,7 @@ def load_index(self): except Exception: return None - def save_scip_index(self, scip_index): - """Save SCIP index in protobuf binary format - - Args: - scip_index: SCIP Index protobuf object - """ - if not SCIP_AVAILABLE: - raise RuntimeError("SCIP protobuf not available. Cannot save SCIP index.") - - if not isinstance(scip_index, SCIPIndex): - raise ValueError("scip_index must be a SCIP Index protobuf object") - - try: - scip_path = self.get_scip_index_path() - - # Ensure directory exists - dir_path = os.path.dirname(scip_path) - if not os.path.exists(dir_path): - os.makedirs(dir_path, exist_ok=True) - - # Serialize to binary format - binary_data = scip_index.SerializeToString() - - # Save binary data - with open(scip_path, 'wb') as f: - f.write(binary_data) - - - - except Exception: - # Try saving to project or home directory - try: - if self.base_path and os.path.exists(self.base_path): - fallback_path = os.path.join(self.base_path, SCIP_INDEX_FILE) - else: - fallback_path = os.path.join(os.path.expanduser("~"), SCIP_INDEX_FILE) - - - binary_data = scip_index.SerializeToString() - with open(fallback_path, 'wb') as f: - f.write(binary_data) - except Exception: - raise - - def load_scip_index(self): - """Load SCIP index from protobuf binary format - - Returns: - SCIP Index object, or None if file doesn't exist or has errors - """ - if not SCIP_AVAILABLE: - return None - - # If skip_load is set, return None directly - if self.skip_load: - return None - - try: - scip_path = self.get_scip_index_path() - - if os.path.exists(scip_path): - - try: - with open(scip_path, 'rb') as f: - binary_data = f.read() - - # Deserialize from binary format - scip_index = SCIPIndex() - scip_index.ParseFromString(binary_data) - - - return scip_index - - except Exception: - return None - else: - # Try fallback paths - fallback_paths = [] - if self.base_path and os.path.exists(self.base_path): - fallback_paths.append(os.path.join(self.base_path, SCIP_INDEX_FILE)) - fallback_paths.append(os.path.join(os.path.expanduser("~"), SCIP_INDEX_FILE)) - - for fallback_path in fallback_paths: - if os.path.exists(fallback_path): - - try: - with open(fallback_path, 'rb') as f: - binary_data = f.read() - - scip_index = SCIPIndex() - scip_index.ParseFromString(binary_data) - - - return scip_index - except Exception: - continue - - return None - - except Exception: - return None - - # save_cache and load_cache methods removed - no longer needed with new indexing system - def is_latest_index(self) -> bool: - """Check if SCIP index exists and is the latest version. - - Returns: - bool: True if latest SCIP index exists, False if needs rebuild - """ - try: - # Only check for SCIP index at settings_path - scip_path = os.path.join(self.settings_path, SCIP_INDEX_FILE) - - if not os.path.exists(scip_path): - return False - - # Basic file integrity check - try: - with open(scip_path, 'rb') as f: - # Check if file is readable and has content - return f.read(1) != b'' - except: - return False - - except Exception: - return False def cleanup_legacy_files(self) -> None: """Clean up any legacy index files found.""" diff --git a/src/code_index_mcp/scip/__init__.py b/src/code_index_mcp/scip/__init__.py deleted file mode 100644 index 47939ef..0000000 --- a/src/code_index_mcp/scip/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -SCIP (Source Code Intelligence Protocol) indexing module. - -This module provides SCIP-based code indexing capabilities using a modern -language manager approach to support various programming languages and tools. -""" - -from .language_manager import SCIPLanguageManager, LanguageNotSupportedException, create_language_manager - -__all__ = ['SCIPLanguageManager', 'LanguageNotSupportedException', 'create_language_manager'] \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/__init__.py b/src/code_index_mcp/scip/core/__init__.py deleted file mode 100644 index cbd4fc0..0000000 --- a/src/code_index_mcp/scip/core/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""SCIP core components for standard-compliant indexing.""" \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/local_reference_resolver.py b/src/code_index_mcp/scip/core/local_reference_resolver.py deleted file mode 100644 index cef4da8..0000000 --- a/src/code_index_mcp/scip/core/local_reference_resolver.py +++ /dev/null @@ -1,470 +0,0 @@ -"""Local Reference Resolver - Cross-file reference resolution within a project.""" - -import logging -from typing import Dict, List, Optional, Set, Tuple, Any -from dataclasses import dataclass -from pathlib import Path - -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -@dataclass -class SymbolDefinition: - """Information about a symbol definition.""" - symbol_id: str - file_path: str - definition_range: scip_pb2.Range - symbol_kind: int - display_name: str - documentation: List[str] - - -@dataclass -class SymbolReference: - """Information about a symbol reference.""" - symbol_id: str - file_path: str - reference_range: scip_pb2.Range - context_scope: List[str] - - -@dataclass -class SymbolRelationship: - """Information about a relationship between symbols.""" - source_symbol_id: str - target_symbol_id: str - relationship_type: str # InternalRelationshipType enum value - relationship_data: Dict[str, Any] # Additional relationship metadata - - -class LocalReferenceResolver: - """ - Resolves references within a local project. - - This class maintains a symbol table for all definitions in the project - and helps resolve references to their definitions. - """ - - def __init__(self, project_path: str): - """ - Initialize reference resolver for a project. - - Args: - project_path: Absolute path to project root - """ - self.project_path = Path(project_path).resolve() - - # Symbol tables - self.symbol_definitions: Dict[str, SymbolDefinition] = {} - self.symbol_references: Dict[str, List[SymbolReference]] = {} - - # Relationship storage - self.symbol_relationships: Dict[str, List[SymbolRelationship]] = {} # source_symbol_id -> relationships - self.reverse_relationships: Dict[str, List[SymbolRelationship]] = {} # target_symbol_id -> relationships - - # File-based indexes for faster lookup - self.file_symbols: Dict[str, Set[str]] = {} # file_path -> symbol_ids - self.symbol_by_name: Dict[str, List[str]] = {} # display_name -> symbol_ids - - logger.debug(f"LocalReferenceResolver initialized for project: {project_path}") - - def register_symbol_definition(self, - symbol_id: str, - file_path: str, - definition_range: scip_pb2.Range, - symbol_kind: int, - display_name: str, - documentation: List[str] = None) -> None: - """ - Register a symbol definition. - - Args: - symbol_id: SCIP symbol ID - file_path: File path relative to project root - definition_range: SCIP Range of definition - symbol_kind: SCIP symbol kind - display_name: Human-readable symbol name - documentation: Optional documentation - """ - definition = SymbolDefinition( - symbol_id=symbol_id, - file_path=file_path, - definition_range=definition_range, - symbol_kind=symbol_kind, - display_name=display_name, - documentation=documentation or [] - ) - - self.symbol_definitions[symbol_id] = definition - - # Update file index - if file_path not in self.file_symbols: - self.file_symbols[file_path] = set() - self.file_symbols[file_path].add(symbol_id) - - # Update name index - if display_name not in self.symbol_by_name: - self.symbol_by_name[display_name] = [] - if symbol_id not in self.symbol_by_name[display_name]: - self.symbol_by_name[display_name].append(symbol_id) - - logger.debug(f"Registered symbol definition: {display_name} -> {symbol_id}") - - def register_symbol_reference(self, - symbol_id: str, - file_path: str, - reference_range: scip_pb2.Range, - context_scope: List[str] = None) -> None: - """ - Register a symbol reference. - - Args: - symbol_id: SCIP symbol ID being referenced - file_path: File path where reference occurs - reference_range: SCIP Range of reference - context_scope: Scope context where reference occurs - """ - reference = SymbolReference( - symbol_id=symbol_id, - file_path=file_path, - reference_range=reference_range, - context_scope=context_scope or [] - ) - - if symbol_id not in self.symbol_references: - self.symbol_references[symbol_id] = [] - self.symbol_references[symbol_id].append(reference) - - logger.debug(f"Registered symbol reference: {symbol_id} in {file_path}") - - def resolve_reference_by_name(self, - symbol_name: str, - context_file: str, - context_scope: List[str] = None) -> Optional[str]: - """ - Resolve a symbol reference by name to its definition symbol ID. - - Args: - symbol_name: Name of symbol to resolve - context_file: File where reference occurs - context_scope: Scope context of reference - - Returns: - Symbol ID of definition or None if not found - """ - context_scope = context_scope or [] - - # Look for exact name matches - if symbol_name not in self.symbol_by_name: - return None - - candidate_symbols = self.symbol_by_name[symbol_name] - - if len(candidate_symbols) == 1: - return candidate_symbols[0] - - # Multiple candidates - use scope-based resolution - return self._resolve_with_scope(candidate_symbols, context_file, context_scope) - - def get_symbol_definition(self, symbol_id: str) -> Optional[SymbolDefinition]: - """ - Get symbol definition by ID. - - Args: - symbol_id: SCIP symbol ID - - Returns: - SymbolDefinition or None if not found - """ - return self.symbol_definitions.get(symbol_id) - - def get_symbol_references(self, symbol_id: str) -> List[SymbolReference]: - """ - Get all references to a symbol. - - Args: - symbol_id: SCIP symbol ID - - Returns: - List of SymbolReference objects - """ - return self.symbol_references.get(symbol_id, []) - - def get_file_symbols(self, file_path: str) -> Set[str]: - """ - Get all symbols defined in a file. - - Args: - file_path: File path relative to project root - - Returns: - Set of symbol IDs defined in the file - """ - return self.file_symbols.get(file_path, set()) - - def find_symbols_by_pattern(self, pattern: str) -> List[SymbolDefinition]: - """ - Find symbols matching a pattern. - - Args: - pattern: Search pattern (simple substring match) - - Returns: - List of matching SymbolDefinition objects - """ - matches = [] - pattern_lower = pattern.lower() - - for symbol_def in self.symbol_definitions.values(): - if (pattern_lower in symbol_def.display_name.lower() or - pattern_lower in symbol_def.symbol_id.lower()): - matches.append(symbol_def) - - return matches - - def get_project_statistics(self) -> Dict[str, int]: - """ - Get statistics about the symbol table including relationships. - - Returns: - Dictionary with statistics - """ - total_references = sum(len(refs) for refs in self.symbol_references.values()) - total_relationships = sum(len(rels) for rels in self.symbol_relationships.values()) - - return { - 'total_definitions': len(self.symbol_definitions), - 'total_references': total_references, - 'total_relationships': total_relationships, - 'files_with_symbols': len(self.file_symbols), - 'unique_symbol_names': len(self.symbol_by_name), - 'symbols_with_relationships': len(self.symbol_relationships) - } - - def _resolve_with_scope(self, - candidate_symbols: List[str], - context_file: str, - context_scope: List[str]) -> Optional[str]: - """ - Resolve symbol using scope-based heuristics. - - Args: - candidate_symbols: List of candidate symbol IDs - context_file: File where reference occurs - context_scope: Scope context - - Returns: - Best matching symbol ID or None - """ - # Scoring system for symbol resolution - scored_candidates = [] - - for symbol_id in candidate_symbols: - definition = self.symbol_definitions.get(symbol_id) - if not definition: - continue - - score = 0 - - # Prefer symbols from the same file - if definition.file_path == context_file: - score += 100 - - # Prefer symbols from similar scope depth - symbol_scope_depth = symbol_id.count('/') - context_scope_depth = len(context_scope) - scope_diff = abs(symbol_scope_depth - context_scope_depth) - score += max(0, 50 - scope_diff * 10) - - # Prefer symbols with matching scope components - for scope_component in context_scope: - if scope_component in symbol_id: - score += 20 - - scored_candidates.append((score, symbol_id)) - - if not scored_candidates: - return None - - # Return highest scoring candidate - scored_candidates.sort(key=lambda x: x[0], reverse=True) - best_symbol = scored_candidates[0][1] - - logger.debug(f"Resolved '{candidate_symbols}' to '{best_symbol}' " - f"(score: {scored_candidates[0][0]})") - - return best_symbol - - def clear(self) -> None: - """Clear all symbol tables.""" - self.symbol_definitions.clear() - self.symbol_references.clear() - self.file_symbols.clear() - self.symbol_by_name.clear() - - logger.debug("Symbol tables cleared") - - def export_symbol_table(self) -> Dict[str, any]: - """ - Export symbol table for debugging or persistence. - - Returns: - Dictionary representation of symbol table - """ - return { - 'definitions': { - symbol_id: { - 'file_path': defn.file_path, - 'display_name': defn.display_name, - 'symbol_kind': defn.symbol_kind, - 'documentation': defn.documentation - } - for symbol_id, defn in self.symbol_definitions.items() - }, - 'references': { - symbol_id: len(refs) - for symbol_id, refs in self.symbol_references.items() - }, - 'relationships': { - symbol_id: len(rels) - for symbol_id, rels in self.symbol_relationships.items() - }, - 'statistics': self.get_project_statistics() - } - - def add_symbol_relationship(self, - source_symbol_id: str, - target_symbol_id: str, - relationship_type: str, - relationship_data: Dict[str, Any] = None) -> None: - """ - Add a relationship between symbols. - - Args: - source_symbol_id: Source symbol ID - target_symbol_id: Target symbol ID - relationship_type: Type of relationship (enum value as string) - relationship_data: Additional relationship metadata - """ - relationship = SymbolRelationship( - source_symbol_id=source_symbol_id, - target_symbol_id=target_symbol_id, - relationship_type=relationship_type, - relationship_data=relationship_data or {} - ) - - # Add to forward relationships - if source_symbol_id not in self.symbol_relationships: - self.symbol_relationships[source_symbol_id] = [] - self.symbol_relationships[source_symbol_id].append(relationship) - - # Add to reverse relationships for quick lookup - if target_symbol_id not in self.reverse_relationships: - self.reverse_relationships[target_symbol_id] = [] - self.reverse_relationships[target_symbol_id].append(relationship) - - logger.debug(f"Added relationship: {source_symbol_id} --{relationship_type}--> {target_symbol_id}") - - def get_symbol_relationships(self, symbol_id: str) -> List[SymbolRelationship]: - """ - Get all relationships where the symbol is the source. - - Args: - symbol_id: Symbol ID - - Returns: - List of relationships - """ - return self.symbol_relationships.get(symbol_id, []) - - def get_reverse_relationships(self, symbol_id: str) -> List[SymbolRelationship]: - """ - Get all relationships where the symbol is the target. - - Args: - symbol_id: Symbol ID - - Returns: - List of relationships where this symbol is the target - """ - return self.reverse_relationships.get(symbol_id, []) - - def get_all_relationships_for_symbol(self, symbol_id: str) -> Dict[str, List[SymbolRelationship]]: - """ - Get both forward and reverse relationships for a symbol. - - Args: - symbol_id: Symbol ID - - Returns: - Dictionary with 'outgoing' and 'incoming' relationship lists - """ - return { - 'outgoing': self.get_symbol_relationships(symbol_id), - 'incoming': self.get_reverse_relationships(symbol_id) - } - - def find_relationships_by_type(self, relationship_type: str) -> List[SymbolRelationship]: - """ - Find all relationships of a specific type. - - Args: - relationship_type: Type of relationship to find - - Returns: - List of matching relationships - """ - matches = [] - for relationships in self.symbol_relationships.values(): - for rel in relationships: - if rel.relationship_type == relationship_type: - matches.append(rel) - return matches - - def remove_symbol_relationships(self, symbol_id: str) -> None: - """ - Remove all relationships for a symbol (both as source and target). - - Args: - symbol_id: Symbol ID to remove relationships for - """ - # Remove as source - if symbol_id in self.symbol_relationships: - del self.symbol_relationships[symbol_id] - - # Remove as target - if symbol_id in self.reverse_relationships: - del self.reverse_relationships[symbol_id] - - # Remove from other symbols' relationships where this symbol is referenced - for source_id, relationships in self.symbol_relationships.items(): - self.symbol_relationships[source_id] = [ - rel for rel in relationships if rel.target_symbol_id != symbol_id - ] - - logger.debug(f"Removed all relationships for symbol: {symbol_id}") - - def get_relationship_statistics(self) -> Dict[str, Any]: - """ - Get statistics about relationships. - - Returns: - Dictionary with relationship statistics - """ - total_relationships = sum(len(rels) for rels in self.symbol_relationships.values()) - relationship_types = {} - - for relationships in self.symbol_relationships.values(): - for rel in relationships: - rel_type = rel.relationship_type - relationship_types[rel_type] = relationship_types.get(rel_type, 0) + 1 - - return { - 'total_relationships': total_relationships, - 'symbols_with_outgoing_relationships': len(self.symbol_relationships), - 'symbols_with_incoming_relationships': len(self.reverse_relationships), - 'relationship_types': relationship_types - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/moniker_manager.py b/src/code_index_mcp/scip/core/moniker_manager.py deleted file mode 100644 index 64fe640..0000000 --- a/src/code_index_mcp/scip/core/moniker_manager.py +++ /dev/null @@ -1,375 +0,0 @@ -""" -Moniker Manager - handles import/export monikers for cross-repository navigation. - -Monikers in SCIP enable cross-repository symbol resolution by providing standardized -identifiers for external packages, modules, and dependencies. -""" - -import logging -import re -from typing import Dict, List, Optional, Set, Tuple, NamedTuple -from pathlib import Path -from dataclasses import dataclass, field - -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -@dataclass -class PackageInfo: - """Information about an external package.""" - manager: str # e.g., "npm", "pip", "maven", "cargo" - name: str # Package name - version: str # Package version (optional) - - def to_scip_package(self) -> str: - """Convert to SCIP package format.""" - if self.version: - return f"{self.manager} {self.name} {self.version}" - return f"{self.manager} {self.name}" - - -@dataclass -class ImportedSymbol: - """Represents an imported symbol from external package.""" - package_info: PackageInfo - module_path: str # Module path within package - symbol_name: str # Symbol name - alias: Optional[str] = None # Local alias if any - import_kind: str = "default" # "default", "named", "namespace", "side_effect" - - @property - def local_name(self) -> str: - """Get the local name used in code.""" - return self.alias or self.symbol_name - - -@dataclass -class ExportedSymbol: - """Represents a symbol exported by this package.""" - symbol_name: str - symbol_kind: str # "function", "class", "variable", "type", etc. - module_path: str # Path within this package - is_default: bool = False - - -class MonikerManager: - """ - Manages import/export monikers for cross-repository symbol resolution. - - Key responsibilities: - 1. Track external package dependencies - 2. Generate SCIP symbols for imported symbols - 3. Create external symbol information - 4. Support package manager integration (npm, pip, maven, etc.) - """ - - def __init__(self, project_path: str, project_name: str): - """ - Initialize moniker manager. - - Args: - project_path: Root path of the current project - project_name: Name of the current project - """ - self.project_path = project_path - self.project_name = project_name - - # Track imported symbols from external packages - self.imported_symbols: Dict[str, ImportedSymbol] = {} - - # Track symbols exported by this project - self.exported_symbols: Dict[str, ExportedSymbol] = {} - - # Package dependency information - self.dependencies: Dict[str, PackageInfo] = {} - - # Cache for generated SCIP symbol IDs - self._symbol_cache: Dict[str, str] = {} - - # Registry of known package managers and their patterns - self.package_managers = { - "npm": PackageManagerConfig( - name="npm", - config_files=["package.json", "package-lock.json", "yarn.lock"], - import_patterns=[ - r"import\s+.*?from\s+['\"]([^'\"]+)['\"]", - r"require\s*\(\s*['\"]([^'\"]+)['\"]\s*\)" - ] - ), - "pip": PackageManagerConfig( - name="pip", - config_files=["requirements.txt", "pyproject.toml", "setup.py", "Pipfile"], - import_patterns=[ - r"from\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)", - r"import\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)" - ] - ), - "maven": PackageManagerConfig( - name="maven", - config_files=["pom.xml", "build.gradle", "build.gradle.kts"], - import_patterns=[ - r"import\s+([a-zA-Z_][a-zA-Z0-9_.]*)" - ] - ), - "cargo": PackageManagerConfig( - name="cargo", - config_files=["Cargo.toml", "Cargo.lock"], - import_patterns=[ - r"use\s+([a-zA-Z_][a-zA-Z0-9_]*(?:::[a-zA-Z_][a-zA-Z0-9_]*)*)" - ] - ) - } - - # Detect project package manager - self.detected_manager = self._detect_package_manager() - - logger.debug(f"Initialized MonikerManager for {project_name} with {self.detected_manager or 'no'} package manager") - - def register_import(self, - package_name: str, - symbol_name: str, - module_path: str = "", - alias: Optional[str] = None, - import_kind: str = "named", - version: Optional[str] = None) -> str: - """ - Register an imported symbol from external package. - - Args: - package_name: Name of the external package - symbol_name: Name of the imported symbol - module_path: Module path within package - alias: Local alias for the symbol - import_kind: Type of import (default, named, namespace, side_effect) - version: Package version if known - - Returns: - SCIP symbol ID for the imported symbol - """ - # Create package info - manager = self.detected_manager or "unknown" - package_info = PackageInfo(manager, package_name, version or "") - - # Create imported symbol - imported_symbol = ImportedSymbol( - package_info=package_info, - module_path=module_path, - symbol_name=symbol_name, - alias=alias, - import_kind=import_kind - ) - - # Generate cache key - cache_key = f"{package_name}.{module_path}.{symbol_name}" - - # Store imported symbol - self.imported_symbols[cache_key] = imported_symbol - self.dependencies[package_name] = package_info - - # Generate SCIP symbol ID - symbol_id = self._generate_external_symbol_id(imported_symbol) - self._symbol_cache[cache_key] = symbol_id - - logger.debug(f"Registered import: {cache_key} -> {symbol_id}") - return symbol_id - - def register_export(self, - symbol_name: str, - symbol_kind: str, - module_path: str, - is_default: bool = False) -> str: - """ - Register a symbol exported by this project. - - Args: - symbol_name: Name of the exported symbol - symbol_kind: Kind of symbol (function, class, etc.) - module_path: Module path within this project - is_default: Whether this is a default export - - Returns: - SCIP symbol ID for the exported symbol - """ - exported_symbol = ExportedSymbol( - symbol_name=symbol_name, - symbol_kind=symbol_kind, - module_path=module_path, - is_default=is_default - ) - - cache_key = f"export.{module_path}.{symbol_name}" - self.exported_symbols[cache_key] = exported_symbol - - # Generate local symbol ID (this will be accessible to other projects) - symbol_id = self._generate_export_symbol_id(exported_symbol) - self._symbol_cache[cache_key] = symbol_id - - logger.debug(f"Registered export: {cache_key} -> {symbol_id}") - return symbol_id - - def get_external_symbol_information(self) -> List[scip_pb2.SymbolInformation]: - """ - Generate external symbol information for all imported symbols. - - Returns: - List of SymbolInformation for external symbols - """ - external_symbols = [] - - for cache_key, imported_symbol in self.imported_symbols.items(): - symbol_id = self._symbol_cache.get(cache_key) - if not symbol_id: - continue - - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = imported_symbol.local_name - symbol_info.kind = self._infer_symbol_kind(imported_symbol.symbol_name) - - # Add package information to documentation - pkg = imported_symbol.package_info - documentation = [ - f"External symbol from {pkg.name}", - f"Package manager: {pkg.manager}" - ] - if pkg.version: - documentation.append(f"Version: {pkg.version}") - if imported_symbol.module_path: - documentation.append(f"Module: {imported_symbol.module_path}") - - symbol_info.documentation.extend(documentation) - - external_symbols.append(symbol_info) - - logger.info(f"Generated {len(external_symbols)} external symbol information entries") - return external_symbols - - def resolve_import_reference(self, symbol_name: str, context_file: str) -> Optional[str]: - """ - Resolve a symbol reference to an imported symbol. - - Args: - symbol_name: Name of the symbol being referenced - context_file: File where the reference occurs - - Returns: - SCIP symbol ID if the symbol is an import, None otherwise - """ - # Look for exact matches first - for cache_key, imported_symbol in self.imported_symbols.items(): - if imported_symbol.local_name == symbol_name: - return self._symbol_cache.get(cache_key) - - # Look for partial matches (e.g., module.symbol) - for cache_key, imported_symbol in self.imported_symbols.items(): - if symbol_name.startswith(imported_symbol.local_name + "."): - # This might be a member access on imported module - base_symbol_id = self._symbol_cache.get(cache_key) - if base_symbol_id: - # Create symbol ID for the member - member_name = symbol_name[len(imported_symbol.local_name) + 1:] - return self._generate_member_symbol_id(imported_symbol, member_name) - - return None - - def get_dependency_info(self) -> Dict[str, PackageInfo]: - """Get information about all detected dependencies.""" - return self.dependencies.copy() - - def _detect_package_manager(self) -> Optional[str]: - """Detect which package manager this project uses.""" - project_root = Path(self.project_path) - - for manager_name, config in self.package_managers.items(): - for config_file in config.config_files: - if (project_root / config_file).exists(): - logger.info(f"Detected {manager_name} package manager") - return manager_name - - return None - - def _generate_external_symbol_id(self, imported_symbol: ImportedSymbol) -> str: - """Generate SCIP symbol ID for external symbol.""" - pkg = imported_symbol.package_info - - # SCIP format: scheme manager package version descriptors - parts = ["scip-python" if pkg.manager == "pip" else f"scip-{pkg.manager}"] - parts.append(pkg.manager) - parts.append(pkg.name) - - if pkg.version: - parts.append(pkg.version) - - # Add module path if present - if imported_symbol.module_path: - parts.append(imported_symbol.module_path.replace("/", ".")) - - # Add symbol descriptor - if imported_symbol.symbol_name: - parts.append(f"{imported_symbol.symbol_name}.") - - return " ".join(parts) - - def _generate_export_symbol_id(self, exported_symbol: ExportedSymbol) -> str: - """Generate SCIP symbol ID for exported symbol.""" - # For exports, use local scheme but make it accessible - manager = self.detected_manager or "local" - - parts = [f"scip-{manager}", manager, self.project_name] - - if exported_symbol.module_path: - parts.append(exported_symbol.module_path.replace("/", ".")) - - # Add appropriate descriptor based on symbol kind - descriptor = self._get_symbol_descriptor(exported_symbol.symbol_kind) - parts.append(f"{exported_symbol.symbol_name}{descriptor}") - - return " ".join(parts) - - def _generate_member_symbol_id(self, imported_symbol: ImportedSymbol, member_name: str) -> str: - """Generate symbol ID for a member of an imported symbol.""" - base_id = self._generate_external_symbol_id(imported_symbol) - - # Remove the trailing descriptor and add member - if base_id.endswith("."): - base_id = base_id[:-1] - - return f"{base_id}#{member_name}." - - def _get_symbol_descriptor(self, symbol_kind: str) -> str: - """Get SCIP descriptor suffix for symbol kind.""" - descriptors = { - "function": "().", - "method": "().", - "class": "#", - "interface": "#", - "type": "#", - "variable": ".", - "constant": ".", - "module": "/", - "namespace": "/" - } - return descriptors.get(symbol_kind.lower(), ".") - - def _infer_symbol_kind(self, symbol_name: str) -> int: - """Infer SCIP symbol kind from symbol name.""" - # Simple heuristics - could be enhanced with actual type information - if symbol_name.istitle(): # CamelCase suggests class/type - return scip_pb2.Class - elif symbol_name.isupper(): # UPPER_CASE suggests constant - return scip_pb2.Constant - elif "." in symbol_name: # Dotted suggests module/namespace - return scip_pb2.Module - else: - return scip_pb2.Function # Default assumption - - -@dataclass -class PackageManagerConfig: - """Configuration for a specific package manager.""" - name: str - config_files: List[str] = field(default_factory=list) - import_patterns: List[str] = field(default_factory=list) \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/position_calculator.py b/src/code_index_mcp/scip/core/position_calculator.py deleted file mode 100644 index 1f46139..0000000 --- a/src/code_index_mcp/scip/core/position_calculator.py +++ /dev/null @@ -1,306 +0,0 @@ -"""SCIP Position Calculator - Accurate position calculation for SCIP ranges.""" - -import ast -import logging -from typing import Tuple, List, Optional -try: - import tree_sitter - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -class PositionCalculator: - """ - Accurate position calculator for SCIP ranges. - - Handles conversion from various source positions (AST nodes, Tree-sitter nodes, - line/column positions) to precise SCIP Range objects. - """ - - def __init__(self, content: str, encoding: str = 'utf-8'): - """ - Initialize position calculator with file content. - - Args: - content: File content as string - encoding: File encoding (default: utf-8) - """ - self.content = content - self.encoding = encoding - self.lines = content.split('\n') - - # Build byte offset mapping for accurate position calculation - self._build_position_maps() - - logger.debug(f"PositionCalculator initialized for {len(self.lines)} lines") - - def _build_position_maps(self): - """Build mapping tables for efficient position conversion.""" - # Build line start byte offsets - self.line_start_bytes: List[int] = [0] - - content_bytes = self.content.encode(self.encoding) - current_byte = 0 - - for line in self.lines[:-1]: # Exclude last line - line_bytes = line.encode(self.encoding) - current_byte += len(line_bytes) + 1 # +1 for newline - self.line_start_bytes.append(current_byte) - - def ast_node_to_range(self, node: ast.AST) -> scip_pb2.Range: - """ - Convert Python AST node to SCIP Range. - - Args: - node: Python AST node - - Returns: - SCIP Range object - """ - range_obj = scip_pb2.Range() - - if hasattr(node, 'lineno') and hasattr(node, 'col_offset'): - # Python AST uses 1-based line numbers, SCIP uses 0-based - start_line = node.lineno - 1 - start_col = node.col_offset - - # Try to get end position - if hasattr(node, 'end_lineno') and hasattr(node, 'end_col_offset'): - end_line = node.end_lineno - 1 - end_col = node.end_col_offset - else: - # Estimate end position - end_line, end_col = self._estimate_ast_end_position(node, start_line, start_col) - - range_obj.start.extend([start_line, start_col]) - range_obj.end.extend([end_line, end_col]) - else: - # Fallback for nodes without position info - range_obj.start.extend([0, 0]) - range_obj.end.extend([0, 1]) - - return range_obj - - def tree_sitter_node_to_range(self, node) -> scip_pb2.Range: - """ - Convert Tree-sitter node to SCIP Range. - - Args: - node: Tree-sitter Node object - - Returns: - SCIP Range object - """ - if not TREE_SITTER_AVAILABLE: - logger.warning("Tree-sitter not available, using fallback range") - range_obj = scip_pb2.Range() - range_obj.start.extend([0, 0]) - range_obj.end.extend([0, 1]) - return range_obj - - range_obj = scip_pb2.Range() - - # Tree-sitter provides byte offsets, convert to line/column - start_line, start_col = self.byte_to_line_col(node.start_byte) - end_line, end_col = self.byte_to_line_col(node.end_byte) - - range_obj.start.extend([start_line, start_col]) - range_obj.end.extend([end_line, end_col]) - - return range_obj - - def line_col_to_range(self, - start_line: int, - start_col: int, - end_line: Optional[int] = None, - end_col: Optional[int] = None, - name_length: int = 1) -> scip_pb2.Range: - """ - Create SCIP Range from line/column positions. - - Args: - start_line: Start line (0-based) - start_col: Start column (0-based) - end_line: End line (optional) - end_col: End column (optional) - name_length: Length of symbol name for end position estimation - - Returns: - SCIP Range object - """ - range_obj = scip_pb2.Range() - - # Use provided end position or estimate - if end_line is not None and end_col is not None: - final_end_line = end_line - final_end_col = end_col - else: - final_end_line = start_line - final_end_col = start_col + name_length - - range_obj.start.extend([start_line, start_col]) - range_obj.end.extend([final_end_line, final_end_col]) - - return range_obj - - def byte_to_line_col(self, byte_offset: int) -> Tuple[int, int]: - """ - Convert byte offset to line/column position. - - Args: - byte_offset: Byte offset in file - - Returns: - Tuple of (line, column) - both 0-based - """ - if byte_offset < 0: - return (0, 0) - - # Find the line containing this byte offset - line_num = 0 - for i, line_start in enumerate(self.line_start_bytes): - if byte_offset < line_start: - line_num = i - 1 - break - else: - line_num = len(self.line_start_bytes) - 1 - - # Ensure line_num is valid - line_num = max(0, min(line_num, len(self.lines) - 1)) - - # Calculate column within the line - line_start_byte = self.line_start_bytes[line_num] - byte_in_line = byte_offset - line_start_byte - - # Convert byte offset to character offset within line - if line_num < len(self.lines): - line_content = self.lines[line_num] - try: - # Convert byte offset to character offset - line_bytes = line_content.encode(self.encoding) - if byte_in_line <= len(line_bytes): - char_offset = len(line_bytes[:byte_in_line].decode(self.encoding, errors='ignore')) - else: - char_offset = len(line_content) - except (UnicodeDecodeError, UnicodeEncodeError): - # Fallback to byte offset as character offset - char_offset = min(byte_in_line, len(line_content)) - else: - char_offset = 0 - - return (line_num, char_offset) - - def find_name_in_line(self, line_num: int, name: str) -> Tuple[int, int]: - """ - Find the position of a name within a line. - - Args: - line_num: Line number (0-based) - name: Name to find - - Returns: - Tuple of (start_col, end_col) or (0, len(name)) if not found - """ - if line_num < 0 or line_num >= len(self.lines): - return (0, len(name)) - - line_content = self.lines[line_num] - start_col = line_content.find(name) - - if start_col == -1: - # Try to find word boundary match - import re - pattern = r'\b' + re.escape(name) + r'\b' - match = re.search(pattern, line_content) - if match: - start_col = match.start() - else: - start_col = 0 - - end_col = start_col + len(name) - return (start_col, end_col) - - def _estimate_ast_end_position(self, - node: ast.AST, - start_line: int, - start_col: int) -> Tuple[int, int]: - """ - Estimate end position for AST nodes without end position info. - - Args: - node: AST node - start_line: Start line - start_col: Start column - - Returns: - Tuple of (end_line, end_col) - """ - # Try to get name length from common node types - name_length = 1 - - if hasattr(node, 'id'): # Name nodes - name_length = len(node.id) - elif hasattr(node, 'name'): # Function/Class definition nodes - name_length = len(node.name) - elif hasattr(node, 'arg'): # Argument nodes - name_length = len(node.arg) - elif hasattr(node, 'attr'): # Attribute nodes - name_length = len(node.attr) - elif isinstance(node, ast.Constant) and isinstance(node.value, str): - name_length = len(str(node.value)) + 2 # Add quotes - - # For most cases, end position is on the same line - end_line = start_line - end_col = start_col + name_length - - # Ensure end position doesn't exceed line length - if start_line < len(self.lines): - line_length = len(self.lines[start_line]) - end_col = min(end_col, line_length) - - return (end_line, end_col) - - def validate_range(self, range_obj: scip_pb2.Range) -> bool: - """ - Validate that a SCIP Range is within file bounds. - - Args: - range_obj: SCIP Range to validate - - Returns: - True if range is valid - """ - if len(range_obj.start) != 2 or len(range_obj.end) != 2: - return False - - start_line, start_col = range_obj.start[0], range_obj.start[1] - end_line, end_col = range_obj.end[0], range_obj.end[1] - - # Check line bounds - if start_line < 0 or start_line >= len(self.lines): - return False - if end_line < 0 or end_line >= len(self.lines): - return False - - # Check column bounds - if start_line < len(self.lines): - if start_col < 0 or start_col > len(self.lines[start_line]): - return False - - if end_line < len(self.lines): - if end_col < 0 or end_col > len(self.lines[end_line]): - return False - - # Check that start <= end - if start_line > end_line: - return False - if start_line == end_line and start_col > end_col: - return False - - return True \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/relationship_manager.py b/src/code_index_mcp/scip/core/relationship_manager.py deleted file mode 100644 index e16c33f..0000000 --- a/src/code_index_mcp/scip/core/relationship_manager.py +++ /dev/null @@ -1,286 +0,0 @@ -"""SCIP 關係管理器 - 負責將內部關係轉換為標準 SCIP Relationship""" - -import logging -from typing import List, Dict, Optional, Any, Set -from enum import Enum - -from ..proto import scip_pb2 - -logger = logging.getLogger(__name__) - - -class RelationshipType(Enum): - """內部關係類型定義""" - CALLS = "calls" # 函數調用關係 - CALLED_BY = "called_by" # 被調用關係 - INHERITS = "inherits" # 繼承關係 - IMPLEMENTS = "implements" # 實現關係 - REFERENCES = "references" # 引用關係 - TYPE_DEFINITION = "type_definition" # 類型定義關係 - DEFINITION = "definition" # 定義關係 - - -class SCIPRelationshipManager: - """ - SCIP 關係轉換和管理核心 - - 負責將內部關係格式轉換為標準 SCIP Relationship 對象, - 並管理符號間的各種關係類型。 - """ - - def __init__(self): - """初始化關係管理器""" - self.relationship_cache: Dict[str, List[scip_pb2.Relationship]] = {} - self.symbol_relationships: Dict[str, Set[str]] = {} - - logger.debug("SCIPRelationshipManager initialized") - - def create_relationship(self, - target_symbol: str, - relationship_type: RelationshipType) -> scip_pb2.Relationship: - """ - 創建標準 SCIP Relationship 對象 - - Args: - target_symbol: 目標符號的 SCIP 符號 ID - relationship_type: 關係類型 - - Returns: - SCIP Relationship 對象 - """ - relationship = scip_pb2.Relationship() - relationship.symbol = target_symbol - - # 根據關係類型設置相應的布爾標誌 - if relationship_type == RelationshipType.REFERENCES: - relationship.is_reference = True - elif relationship_type == RelationshipType.IMPLEMENTS: - relationship.is_implementation = True - elif relationship_type == RelationshipType.TYPE_DEFINITION: - relationship.is_type_definition = True - elif relationship_type == RelationshipType.DEFINITION: - relationship.is_definition = True - else: - # 對於 CALLS, CALLED_BY, INHERITS 等關係,使用 is_reference - # 這些關係在 SCIP 標準中主要通過 is_reference 表示 - relationship.is_reference = True - - logger.debug(f"Created SCIP relationship: {target_symbol} ({relationship_type.value})") - return relationship - - def add_relationships_to_symbol(self, - symbol_info: scip_pb2.SymbolInformation, - relationships: List[scip_pb2.Relationship]) -> None: - """ - 將關係列表添加到 SCIP 符號信息中 - - Args: - symbol_info: SCIP 符號信息對象 - relationships: 要添加的關係列表 - """ - if not relationships: - return - - # 清除現有關係(如果有的話) - del symbol_info.relationships[:] - - # 添加新關係 - symbol_info.relationships.extend(relationships) - - logger.debug(f"Added {len(relationships)} relationships to symbol {symbol_info.symbol}") - - def convert_call_relationships(self, - call_relationships: Any, - symbol_manager: Any) -> List[scip_pb2.Relationship]: - """ - 將內部 CallRelationships 轉換為 SCIP Relationship 列表 - - Args: - call_relationships: 內部 CallRelationships 對象 - symbol_manager: 符號管理器,用於生成符號 ID - - Returns: - SCIP Relationship 對象列表 - """ - relationships = [] - - # 處理本地調用關係 - if hasattr(call_relationships, 'local') and call_relationships.local: - for function_name in call_relationships.local: - # 嘗試生成目標符號 ID - target_symbol_id = self._generate_local_symbol_id( - function_name, symbol_manager - ) - if target_symbol_id: - relationship = self.create_relationship( - target_symbol_id, RelationshipType.CALLS - ) - relationships.append(relationship) - - # 處理外部調用關係 - if hasattr(call_relationships, 'external') and call_relationships.external: - for call_info in call_relationships.external: - if isinstance(call_info, dict) and 'name' in call_info: - # 為外部調用生成符號 ID - target_symbol_id = self._generate_external_symbol_id( - call_info, symbol_manager - ) - if target_symbol_id: - relationship = self.create_relationship( - target_symbol_id, RelationshipType.CALLS - ) - relationships.append(relationship) - - logger.debug(f"Converted call relationships: {len(relationships)} relationships") - return relationships - - def add_inheritance_relationship(self, - child_symbol_id: str, - parent_symbol_id: str) -> scip_pb2.Relationship: - """ - 添加繼承關係 - - Args: - child_symbol_id: 子類符號 ID - parent_symbol_id: 父類符號 ID - - Returns: - SCIP Relationship 對象 - """ - relationship = self.create_relationship(parent_symbol_id, RelationshipType.INHERITS) - - # 記錄關係到緩存 - if child_symbol_id not in self.symbol_relationships: - self.symbol_relationships[child_symbol_id] = set() - self.symbol_relationships[child_symbol_id].add(parent_symbol_id) - - logger.debug(f"Added inheritance: {child_symbol_id} -> {parent_symbol_id}") - return relationship - - def add_implementation_relationship(self, - implementer_symbol_id: str, - interface_symbol_id: str) -> scip_pb2.Relationship: - """ - 添加實現關係(介面實現) - - Args: - implementer_symbol_id: 實現者符號 ID - interface_symbol_id: 介面符號 ID - - Returns: - SCIP Relationship 對象 - """ - relationship = self.create_relationship(interface_symbol_id, RelationshipType.IMPLEMENTS) - - # 記錄關係到緩存 - if implementer_symbol_id not in self.symbol_relationships: - self.symbol_relationships[implementer_symbol_id] = set() - self.symbol_relationships[implementer_symbol_id].add(interface_symbol_id) - - logger.debug(f"Added implementation: {implementer_symbol_id} -> {interface_symbol_id}") - return relationship - - def get_symbol_relationships(self, symbol_id: str) -> List[scip_pb2.Relationship]: - """ - 獲取符號的所有關係 - - Args: - symbol_id: 符號 ID - - Returns: - 關係列表 - """ - if symbol_id in self.relationship_cache: - return self.relationship_cache[symbol_id] - return [] - - def cache_relationships(self, symbol_id: str, relationships: List[scip_pb2.Relationship]) -> None: - """ - 緩存符號的關係 - - Args: - symbol_id: 符號 ID - relationships: 關係列表 - """ - self.relationship_cache[symbol_id] = relationships - logger.debug(f"Cached {len(relationships)} relationships for {symbol_id}") - - def clear_cache(self) -> None: - """清除關係緩存""" - self.relationship_cache.clear() - self.symbol_relationships.clear() - logger.debug("Relationship cache cleared") - - def get_statistics(self) -> Dict[str, int]: - """ - 獲取關係統計信息 - - Returns: - 統計信息字典 - """ - total_relationships = sum(len(rels) for rels in self.relationship_cache.values()) - return { - 'symbols_with_relationships': len(self.relationship_cache), - 'total_relationships': total_relationships, - 'cached_symbol_connections': len(self.symbol_relationships) - } - - def _generate_local_symbol_id(self, function_name: str, symbol_manager: Any) -> Optional[str]: - """ - 為本地函數生成符號 ID - - Args: - function_name: 函數名稱 - symbol_manager: 符號管理器 - - Returns: - 符號 ID 或 None - """ - try: - if hasattr(symbol_manager, 'create_local_symbol'): - # 假設這是一個本地符號,使用基本路徑 - return symbol_manager.create_local_symbol( - language="unknown", # 將在具體策略中設置正確的語言 - file_path="", # 將在具體策略中設置正確的文件路徑 - symbol_path=[function_name], - descriptor="()." # 函數描述符 - ) - except Exception as e: - logger.warning(f"Failed to generate local symbol ID for {function_name}: {e}") - return None - - def _generate_external_symbol_id(self, call_info: Dict[str, Any], symbol_manager: Any) -> Optional[str]: - """ - 為外部調用生成符號 ID - - Args: - call_info: 外部調用信息 - symbol_manager: 符號管理器 - - Returns: - 符號 ID 或 None - """ - try: - function_name = call_info.get('name', '') - file_path = call_info.get('file', '') - - if function_name and hasattr(symbol_manager, 'create_local_symbol'): - return symbol_manager.create_local_symbol( - language="unknown", # 將在具體策略中設置正確的語言 - file_path=file_path, - symbol_path=[function_name], - descriptor="()." # 函數描述符 - ) - except Exception as e: - logger.warning(f"Failed to generate external symbol ID for {call_info}: {e}") - return None - - -class RelationshipError(Exception): - """關係處理相關錯誤""" - pass - - -class RelationshipConversionError(RelationshipError): - """關係轉換錯誤""" - pass \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/relationship_types.py b/src/code_index_mcp/scip/core/relationship_types.py deleted file mode 100644 index 7088448..0000000 --- a/src/code_index_mcp/scip/core/relationship_types.py +++ /dev/null @@ -1,389 +0,0 @@ -"""SCIP 關係類型定義和映射 - -這個模組定義了內部關係類型到 SCIP 標準關係的映射, -並提供關係驗證和規範化功能。 -""" - -import logging -from typing import Dict, List, Optional, Set, Any -from enum import Enum -from dataclasses import dataclass - -from ..proto import scip_pb2 - -logger = logging.getLogger(__name__) - - -class InternalRelationshipType(Enum): - """內部關係類型定義 - 擴展版本支援更多關係類型""" - - # 函數調用關係 - CALLS = "calls" # A 調用 B - CALLED_BY = "called_by" # A 被 B 調用 - - # 類型關係 - INHERITS = "inherits" # A 繼承 B - INHERITED_BY = "inherited_by" # A 被 B 繼承 - IMPLEMENTS = "implements" # A 實現 B (介面) - IMPLEMENTED_BY = "implemented_by" # A 被 B 實現 - - # 定義和引用關係 - DEFINES = "defines" # A 定義 B - DEFINED_BY = "defined_by" # A 被 B 定義 - REFERENCES = "references" # A 引用 B - REFERENCED_BY = "referenced_by" # A 被 B 引用 - - # 類型相關關係 - TYPE_OF = "type_of" # A 是 B 的類型 - HAS_TYPE = "has_type" # A 有類型 B - - # 模組和包關係 - IMPORTS = "imports" # A 導入 B - IMPORTED_BY = "imported_by" # A 被 B 導入 - EXPORTS = "exports" # A 導出 B - EXPORTED_BY = "exported_by" # A 被 B 導出 - - # 組合關係 - CONTAINS = "contains" # A 包含 B (類包含方法) - CONTAINED_BY = "contained_by" # A 被 B 包含 - - # 重寫關係 - OVERRIDES = "overrides" # A 重寫 B - OVERRIDDEN_BY = "overridden_by" # A 被 B 重寫 - - -@dataclass -class RelationshipMapping: - """關係映射配置""" - scip_is_reference: bool = False - scip_is_implementation: bool = False - scip_is_type_definition: bool = False - scip_is_definition: bool = False - description: str = "" - - -class SCIPRelationshipMapper: - """ - SCIP 關係映射器 - - 負責將內部關係類型映射到標準 SCIP Relationship 格式, - 並提供關係驗證和查詢功能。 - """ - - # 內部關係類型到 SCIP 標準的映射表 - RELATIONSHIP_MAPPINGS: Dict[InternalRelationshipType, RelationshipMapping] = { - # 函數調用關係 - 使用 is_reference - InternalRelationshipType.CALLS: RelationshipMapping( - scip_is_reference=True, - description="Function call relationship" - ), - InternalRelationshipType.CALLED_BY: RelationshipMapping( - scip_is_reference=True, - description="Reverse function call relationship" - ), - - # 繼承關係 - 使用 is_reference - InternalRelationshipType.INHERITS: RelationshipMapping( - scip_is_reference=True, - description="Class inheritance relationship" - ), - InternalRelationshipType.INHERITED_BY: RelationshipMapping( - scip_is_reference=True, - description="Reverse inheritance relationship" - ), - - # 實現關係 - 使用 is_implementation - InternalRelationshipType.IMPLEMENTS: RelationshipMapping( - scip_is_implementation=True, - description="Interface implementation relationship" - ), - InternalRelationshipType.IMPLEMENTED_BY: RelationshipMapping( - scip_is_implementation=True, - description="Reverse implementation relationship" - ), - - # 定義關係 - 使用 is_definition - InternalRelationshipType.DEFINES: RelationshipMapping( - scip_is_definition=True, - description="Symbol definition relationship" - ), - InternalRelationshipType.DEFINED_BY: RelationshipMapping( - scip_is_definition=True, - description="Reverse definition relationship" - ), - - # 引用關係 - 使用 is_reference - InternalRelationshipType.REFERENCES: RelationshipMapping( - scip_is_reference=True, - description="Symbol reference relationship" - ), - InternalRelationshipType.REFERENCED_BY: RelationshipMapping( - scip_is_reference=True, - description="Reverse reference relationship" - ), - - # 類型關係 - 使用 is_type_definition - InternalRelationshipType.TYPE_OF: RelationshipMapping( - scip_is_type_definition=True, - description="Type definition relationship" - ), - InternalRelationshipType.HAS_TYPE: RelationshipMapping( - scip_is_type_definition=True, - description="Has type relationship" - ), - - # 導入/導出關係 - 使用 is_reference - InternalRelationshipType.IMPORTS: RelationshipMapping( - scip_is_reference=True, - description="Module import relationship" - ), - InternalRelationshipType.IMPORTED_BY: RelationshipMapping( - scip_is_reference=True, - description="Reverse import relationship" - ), - InternalRelationshipType.EXPORTS: RelationshipMapping( - scip_is_reference=True, - description="Module export relationship" - ), - InternalRelationshipType.EXPORTED_BY: RelationshipMapping( - scip_is_reference=True, - description="Reverse export relationship" - ), - - # 包含關係 - 使用 is_reference - InternalRelationshipType.CONTAINS: RelationshipMapping( - scip_is_reference=True, - description="Containment relationship" - ), - InternalRelationshipType.CONTAINED_BY: RelationshipMapping( - scip_is_reference=True, - description="Reverse containment relationship" - ), - - # 重寫關係 - 使用 is_implementation - InternalRelationshipType.OVERRIDES: RelationshipMapping( - scip_is_implementation=True, - description="Method override relationship" - ), - InternalRelationshipType.OVERRIDDEN_BY: RelationshipMapping( - scip_is_implementation=True, - description="Reverse override relationship" - ), - } - - def __init__(self): - """初始化關係映射器""" - self.custom_mappings: Dict[str, RelationshipMapping] = {} - logger.debug("SCIPRelationshipMapper initialized") - - def map_to_scip_relationship(self, - target_symbol: str, - relationship_type: InternalRelationshipType) -> scip_pb2.Relationship: - """ - 將內部關係類型映射為 SCIP Relationship 對象 - - Args: - target_symbol: 目標符號 ID - relationship_type: 內部關係類型 - - Returns: - SCIP Relationship 對象 - - Raises: - ValueError: 如果關係類型不支援 - """ - if relationship_type not in self.RELATIONSHIP_MAPPINGS: - raise ValueError(f"Unsupported relationship type: {relationship_type}") - - mapping = self.RELATIONSHIP_MAPPINGS[relationship_type] - - relationship = scip_pb2.Relationship() - relationship.symbol = target_symbol - relationship.is_reference = mapping.scip_is_reference - relationship.is_implementation = mapping.scip_is_implementation - relationship.is_type_definition = mapping.scip_is_type_definition - relationship.is_definition = mapping.scip_is_definition - - logger.debug(f"Mapped {relationship_type.value} -> SCIP relationship for {target_symbol}") - return relationship - - def batch_map_relationships(self, - relationships: List[tuple]) -> List[scip_pb2.Relationship]: - """ - 批量映射關係 - - Args: - relationships: (target_symbol, relationship_type) 元組列表 - - Returns: - SCIP Relationship 對象列表 - """ - scip_relationships = [] - - for target_symbol, relationship_type in relationships: - try: - scip_rel = self.map_to_scip_relationship(target_symbol, relationship_type) - scip_relationships.append(scip_rel) - except ValueError as e: - logger.warning(f"Failed to map relationship: {e}") - continue - - logger.debug(f"Batch mapped {len(scip_relationships)} relationships") - return scip_relationships - - def validate_relationship_type(self, relationship_type: str) -> bool: - """ - 驗證關係類型是否支援 - - Args: - relationship_type: 關係類型字符串 - - Returns: - 是否支援 - """ - try: - InternalRelationshipType(relationship_type) - return True - except ValueError: - return relationship_type in self.custom_mappings - - def get_supported_relationship_types(self) -> List[str]: - """ - 獲取所有支援的關係類型 - - Returns: - 關係類型字符串列表 - """ - builtin_types = [rt.value for rt in InternalRelationshipType] - custom_types = list(self.custom_mappings.keys()) - return builtin_types + custom_types - - def get_relationship_description(self, relationship_type: InternalRelationshipType) -> str: - """ - 獲取關係類型的描述 - - Args: - relationship_type: 關係類型 - - Returns: - 描述字符串 - """ - mapping = self.RELATIONSHIP_MAPPINGS.get(relationship_type) - return mapping.description if mapping else "Unknown relationship" - - def add_custom_mapping(self, - relationship_type: str, - mapping: RelationshipMapping) -> None: - """ - 添加自定義關係映射 - - Args: - relationship_type: 自定義關係類型名稱 - mapping: 關係映射配置 - """ - self.custom_mappings[relationship_type] = mapping - logger.debug(f"Added custom relationship mapping: {relationship_type}") - - def get_reverse_relationship(self, relationship_type: InternalRelationshipType) -> Optional[InternalRelationshipType]: - """ - 獲取關係的反向關係 - - Args: - relationship_type: 關係類型 - - Returns: - 反向關係類型或 None - """ - reverse_mappings = { - InternalRelationshipType.CALLS: InternalRelationshipType.CALLED_BY, - InternalRelationshipType.CALLED_BY: InternalRelationshipType.CALLS, - InternalRelationshipType.INHERITS: InternalRelationshipType.INHERITED_BY, - InternalRelationshipType.INHERITED_BY: InternalRelationshipType.INHERITS, - InternalRelationshipType.IMPLEMENTS: InternalRelationshipType.IMPLEMENTED_BY, - InternalRelationshipType.IMPLEMENTED_BY: InternalRelationshipType.IMPLEMENTS, - InternalRelationshipType.DEFINES: InternalRelationshipType.DEFINED_BY, - InternalRelationshipType.DEFINED_BY: InternalRelationshipType.DEFINES, - InternalRelationshipType.REFERENCES: InternalRelationshipType.REFERENCED_BY, - InternalRelationshipType.REFERENCED_BY: InternalRelationshipType.REFERENCES, - InternalRelationshipType.TYPE_OF: InternalRelationshipType.HAS_TYPE, - InternalRelationshipType.HAS_TYPE: InternalRelationshipType.TYPE_OF, - InternalRelationshipType.IMPORTS: InternalRelationshipType.IMPORTED_BY, - InternalRelationshipType.IMPORTED_BY: InternalRelationshipType.IMPORTS, - InternalRelationshipType.EXPORTS: InternalRelationshipType.EXPORTED_BY, - InternalRelationshipType.EXPORTED_BY: InternalRelationshipType.EXPORTS, - InternalRelationshipType.CONTAINS: InternalRelationshipType.CONTAINED_BY, - InternalRelationshipType.CONTAINED_BY: InternalRelationshipType.CONTAINS, - InternalRelationshipType.OVERRIDES: InternalRelationshipType.OVERRIDDEN_BY, - InternalRelationshipType.OVERRIDDEN_BY: InternalRelationshipType.OVERRIDES, - } - - return reverse_mappings.get(relationship_type) - - def is_directional_relationship(self, relationship_type: InternalRelationshipType) -> bool: - """ - 檢查關係是否是有向的 - - Args: - relationship_type: 關係類型 - - Returns: - 是否有向 - """ - # 大多數關係都是有向的 - non_directional = { - # 可以在這裡添加非有向關係類型 - } - return relationship_type not in non_directional - - def group_relationships_by_type(self, - relationships: List[scip_pb2.Relationship]) -> Dict[str, List[scip_pb2.Relationship]]: - """ - 按關係的 SCIP 標誌分組 - - Args: - relationships: SCIP 關係列表 - - Returns: - 按類型分組的關係字典 - """ - groups = { - 'references': [], - 'implementations': [], - 'type_definitions': [], - 'definitions': [] - } - - for rel in relationships: - if rel.is_reference: - groups['references'].append(rel) - if rel.is_implementation: - groups['implementations'].append(rel) - if rel.is_type_definition: - groups['type_definitions'].append(rel) - if rel.is_definition: - groups['definitions'].append(rel) - - return groups - - def get_statistics(self) -> Dict[str, Any]: - """ - 獲取映射器統計信息 - - Returns: - 統計信息字典 - """ - return { - 'builtin_relationship_types': len(InternalRelationshipType), - 'custom_relationship_types': len(self.custom_mappings), - 'total_supported_types': len(InternalRelationshipType) + len(self.custom_mappings) - } - - -class RelationshipTypeError(Exception): - """關係類型相關錯誤""" - pass - - -class UnsupportedRelationshipError(RelationshipTypeError): - """不支援的關係類型錯誤""" - pass \ No newline at end of file diff --git a/src/code_index_mcp/scip/core/symbol_manager.py b/src/code_index_mcp/scip/core/symbol_manager.py deleted file mode 100644 index 73a2e99..0000000 --- a/src/code_index_mcp/scip/core/symbol_manager.py +++ /dev/null @@ -1,323 +0,0 @@ -"""SCIP Symbol Manager - Standard-compliant symbol ID generation with moniker support.""" - -import os -import logging -from typing import List, Optional, Dict, Any -from pathlib import Path -from dataclasses import dataclass - -from .moniker_manager import MonikerManager, PackageInfo - - -logger = logging.getLogger(__name__) - - -@dataclass -class SCIPSymbolInfo: - """Information about a SCIP symbol.""" - scheme: str # scip-python, scip-javascript, etc. - manager: str # local, pypi, npm, maven, etc. - package: str # package/project name - version: str # version (for external packages) - descriptors: str # symbol path with descriptors - - -class SCIPSymbolManager: - """ - Standard SCIP Symbol Manager for local projects with cross-repository support. - - Generates symbol IDs that comply with SCIP specification: - Format: {scheme} {manager} {package} {version} {descriptors} - - For local projects: - - scheme: scip-{language} - - manager: local - - package: project name - - version: empty (local projects don't have versions) - - descriptors: file_path/symbol_path{descriptor} - - For external packages: - - scheme: scip-{language} - - manager: npm, pip, maven, etc. - - package: external package name - - version: package version - - descriptors: module_path/symbol_path{descriptor} - """ - - def __init__(self, project_path: str, project_name: Optional[str] = None): - """ - Initialize symbol manager for a project. - - Args: - project_path: Absolute path to project root - project_name: Project name (defaults to directory name) - """ - self.project_path = Path(project_path).resolve() - self.project_name = project_name or self.project_path.name - - # Normalize project name for SCIP (replace invalid characters) - self.project_name = self._normalize_package_name(self.project_name) - - # Initialize moniker manager for cross-repository support - self.moniker_manager = MonikerManager(str(self.project_path), self.project_name) - - logger.debug(f"SCIPSymbolManager initialized for project: {self.project_name}") - - def create_local_symbol(self, - language: str, - file_path: str, - symbol_path: List[str], - descriptor: str = "") -> str: - """ - Create a local symbol ID following SCIP standard. - - Args: - language: Programming language (python, javascript, java, etc.) - file_path: File path relative to project root - symbol_path: List of symbol components (module, class, function, etc.) - descriptor: SCIP descriptor ((), #, ., etc.) - - Returns: - Standard SCIP symbol ID - - Example: - create_local_symbol("python", "src/main.py", ["MyClass", "method"], "()") - -> "scip-python local myproject src/main.py/MyClass#method()." - """ - # Normalize inputs - scheme = f"scip-{language.lower()}" - manager = "local" - package = self.project_name - version = "" # Local projects don't have versions - - # Build descriptors path - normalized_file_path = self._normalize_file_path(file_path) - symbol_components = symbol_path.copy() - - if symbol_components: - # Last component gets the descriptor - last_symbol = symbol_components[-1] + descriptor - symbol_components[-1] = last_symbol - - descriptors = f"{normalized_file_path}/{'/'.join(symbol_components)}" - else: - descriptors = normalized_file_path - - # Build final symbol ID - parts = [scheme, manager, package] - if version: - parts.append(version) - parts.append(descriptors) - - symbol_id = " ".join(parts) - - logger.debug(f"Created local symbol: {symbol_id}") - return symbol_id - - def create_builtin_symbol(self, language: str, builtin_name: str) -> str: - """ - Create a symbol ID for built-in language constructs. - - Args: - language: Programming language - builtin_name: Name of built-in (str, int, Object, etc.) - - Returns: - SCIP symbol ID for built-in - """ - scheme = f"scip-{language.lower()}" - manager = "builtin" - package = language.lower() - descriptors = builtin_name - - return f"{scheme} {manager} {package} {descriptors}" - - def create_stdlib_symbol(self, - language: str, - module_name: str, - symbol_name: str, - descriptor: str = "") -> str: - """ - Create a symbol ID for standard library symbols. - - Args: - language: Programming language - module_name: Standard library module name - symbol_name: Symbol name within module - descriptor: SCIP descriptor - - Returns: - SCIP symbol ID for standard library symbol - """ - scheme = f"scip-{language.lower()}" - manager = "stdlib" - package = language.lower() - descriptors = f"{module_name}/{symbol_name}{descriptor}" - - return f"{scheme} {manager} {package} {descriptors}" - - def create_external_symbol(self, - language: str, - package_name: str, - module_path: str, - symbol_name: str, - descriptor: str = "", - version: Optional[str] = None, - alias: Optional[str] = None) -> str: - """ - Create a symbol ID for external package symbols using moniker manager. - - Args: - language: Programming language - package_name: External package name - module_path: Module path within package - symbol_name: Symbol name - descriptor: SCIP descriptor - version: Package version - alias: Local alias for the symbol - - Returns: - SCIP symbol ID for external symbol - """ - return self.moniker_manager.register_import( - package_name=package_name, - symbol_name=symbol_name, - module_path=module_path, - alias=alias, - version=version - ) - - def register_export(self, - symbol_name: str, - symbol_kind: str, - file_path: str, - is_default: bool = False) -> str: - """ - Register a symbol as exportable from this project. - - Args: - symbol_name: Name of the exported symbol - symbol_kind: Kind of symbol (function, class, etc.) - file_path: File path where symbol is defined - is_default: Whether this is a default export - - Returns: - SCIP symbol ID for the exported symbol - """ - normalized_file_path = self._normalize_file_path(file_path) - return self.moniker_manager.register_export( - symbol_name=symbol_name, - symbol_kind=symbol_kind, - module_path=normalized_file_path, - is_default=is_default - ) - - def resolve_import_reference(self, symbol_name: str, context_file: str) -> Optional[str]: - """ - Resolve a symbol reference to an imported external symbol. - - Args: - symbol_name: Name of the symbol being referenced - context_file: File where the reference occurs - - Returns: - SCIP symbol ID if resolved to external import, None otherwise - """ - return self.moniker_manager.resolve_import_reference(symbol_name, context_file) - - def get_external_symbols(self): - """Get external symbol information for the index.""" - return self.moniker_manager.get_external_symbol_information() - - def get_dependencies(self) -> Dict[str, PackageInfo]: - """Get information about detected external dependencies.""" - return self.moniker_manager.get_dependency_info() - - def parse_symbol(self, symbol_id: str) -> Optional[SCIPSymbolInfo]: - """ - Parse a SCIP symbol ID into components. - - Args: - symbol_id: SCIP symbol ID to parse - - Returns: - SCIPSymbolInfo object or None if parsing fails - """ - try: - parts = symbol_id.split(" ", 4) - if len(parts) < 4: - return None - - scheme = parts[0] - manager = parts[1] - package = parts[2] - - # Handle version (optional) - if len(parts) == 5: - version = parts[3] - descriptors = parts[4] - else: - version = "" - descriptors = parts[3] - - return SCIPSymbolInfo( - scheme=scheme, - manager=manager, - package=package, - version=version, - descriptors=descriptors - ) - - except Exception as e: - logger.warning(f"Failed to parse symbol ID '{symbol_id}': {e}") - return None - - def get_file_path_from_symbol(self, symbol_id: str) -> Optional[str]: - """ - Extract file path from a local symbol ID. - - Args: - symbol_id: SCIP symbol ID - - Returns: - File path or None if not a local symbol - """ - symbol_info = self.parse_symbol(symbol_id) - if not symbol_info or symbol_info.manager != "local": - return None - - # Extract file path from descriptors (before first '/') - descriptors = symbol_info.descriptors - if "/" in descriptors: - return descriptors.split("/", 1)[0] - - return descriptors - - def _normalize_package_name(self, name: str) -> str: - """Normalize package name for SCIP compatibility.""" - # Replace invalid characters with underscores - import re - normalized = re.sub(r'[^a-zA-Z0-9_-]', '_', name) - - # Ensure it starts with a letter or underscore - if normalized and not normalized[0].isalpha() and normalized[0] != '_': - normalized = f"_{normalized}" - - return normalized.lower() - - def _normalize_file_path(self, file_path: str) -> str: - """Normalize file path for SCIP descriptors.""" - # Convert to forward slashes and remove leading slash - normalized = file_path.replace('\\', '/') - if normalized.startswith('/'): - normalized = normalized[1:] - - return normalized - - def get_project_info(self) -> Dict[str, Any]: - """Get project information.""" - return { - 'project_path': str(self.project_path), - 'project_name': self.project_name, - 'normalized_name': self.project_name - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/__init__.py b/src/code_index_mcp/scip/framework/__init__.py deleted file mode 100644 index bbd2f12..0000000 --- a/src/code_index_mcp/scip/framework/__init__.py +++ /dev/null @@ -1,157 +0,0 @@ -"""SCIP Framework Infrastructure - Complete framework for SCIP standard compliance.""" - -# Core framework components -from .types import SCIPSymbolDescriptor, SCIPPositionInfo, SCIPSymbolContext, SCIPSymbolExtractor -from .standard_framework import SCIPStandardFramework -from .symbol_generator import SCIPSymbolGenerator -from .position_calculator import SCIPPositionCalculator -from .compliance_validator import SCIPComplianceValidator -from .relationship_manager import SCIPRelationshipManager, RelationshipType, SymbolRelationship - -# Language-specific implementations (legacy - being phased out) -# NOTE: Old java_factory.py has been removed and replaced with java/ module - -# Base abstract classes for all language implementations -from .base import ( - SCIPIndexFactory as BaseSCIPIndexFactory, - BaseRelationshipExtractor, - BaseEnumMapper, - BaseLanguageAnalyzer -) - -# New modular Python framework components -from .python import ( - PythonSCIPIndexFactory as ModularPythonSCIPIndexFactory, - create_python_scip_factory, - PythonRelationshipExtractor as ModularPythonRelationshipExtractor, - PythonEnumMapper as ModularPythonEnumMapper, - PythonASTAnalyzer -) - -# New modular JavaScript framework components -from .javascript import ( - JavaScriptSCIPIndexFactory as ModularJavaScriptSCIPIndexFactory, - create_javascript_scip_factory, - JavaScriptRelationshipExtractor as ModularJavaScriptRelationshipExtractor, - JavaScriptEnumMapper as ModularJavaScriptEnumMapper, - JavaScriptSyntaxAnalyzer -) - -# New modular Java framework components -from .java import ( - JavaSCIPIndexFactory as ModularJavaSCIPIndexFactory, - create_java_scip_factory, - JavaRelationshipExtractor as ModularJavaRelationshipExtractor, - JavaEnumMapper as ModularJavaEnumMapper, - JavaTreeSitterAnalyzer -) - -# New modular Objective-C framework components -from .objective_c import ( - ObjectiveCSCIPIndexFactory as ModularObjectiveCSCIPIndexFactory, - create_objective_c_scip_factory, - ObjectiveCRelationshipExtractor as ModularObjectiveCRelationshipExtractor, - ObjectiveCEnumMapper as ModularObjectiveCEnumMapper, - ObjectiveCClangAnalyzer -) - -# New modular Zig framework components -from .zig import ( - ZigSCIPIndexFactory as ModularZigSCIPIndexFactory, - create_zig_scip_factory, - ZigRelationshipExtractor as ModularZigRelationshipExtractor, - ZigEnumMapper as ModularZigEnumMapper, - ZigTreeSitterAnalyzer -) - -# New modular Fallback framework components -from .fallback import ( - FallbackSCIPIndexFactory as ModularFallbackSCIPIndexFactory, - create_fallback_scip_factory, - FallbackRelationshipExtractor as ModularFallbackRelationshipExtractor, - FallbackEnumMapper as ModularFallbackEnumMapper, - FallbackBasicAnalyzer -) - -# Advanced features -from .caching_system import SCIPCacheManager, BatchProcessor, CacheEntry -from .streaming_indexer import StreamingIndexer, IndexingProgress, IndexMerger -from .unified_api import SCIPFrameworkAPI, SCIPConfig, create_scip_framework - -__all__ = [ - # Core framework - 'SCIPSymbolDescriptor', - 'SCIPPositionInfo', - 'SCIPSymbolContext', - 'SCIPSymbolExtractor', - 'SCIPStandardFramework', - 'SCIPSymbolGenerator', - 'SCIPPositionCalculator', - 'SCIPComplianceValidator', - 'SCIPRelationshipManager', - 'RelationshipType', - 'SymbolRelationship', - - # Language implementations (legacy - removed) - # 'JavaSCIPIndexFactory', - moved to java/ module - # 'JavaSCIPEnumMapper', - moved to java/ module - - # Base abstract classes - 'BaseSCIPIndexFactory', - 'BaseRelationshipExtractor', - 'BaseEnumMapper', - 'BaseLanguageAnalyzer', - - # New modular Python components - 'ModularPythonSCIPIndexFactory', - 'create_python_scip_factory', - 'ModularPythonRelationshipExtractor', - 'ModularPythonEnumMapper', - 'PythonASTAnalyzer', - - # New modular JavaScript components - 'ModularJavaScriptSCIPIndexFactory', - 'create_javascript_scip_factory', - 'ModularJavaScriptRelationshipExtractor', - 'ModularJavaScriptEnumMapper', - 'JavaScriptSyntaxAnalyzer', - - # New modular Java components - 'ModularJavaSCIPIndexFactory', - 'create_java_scip_factory', - 'ModularJavaRelationshipExtractor', - 'ModularJavaEnumMapper', - 'JavaTreeSitterAnalyzer', - - # New modular Objective-C components - 'ModularObjectiveCSCIPIndexFactory', - 'create_objective_c_scip_factory', - 'ModularObjectiveCRelationshipExtractor', - 'ModularObjectiveCEnumMapper', - 'ObjectiveCClangAnalyzer', - - # New modular Zig components - 'ModularZigSCIPIndexFactory', - 'create_zig_scip_factory', - 'ModularZigRelationshipExtractor', - 'ModularZigEnumMapper', - 'ZigTreeSitterAnalyzer', - - # New modular Fallback components - 'ModularFallbackSCIPIndexFactory', - 'create_fallback_scip_factory', - 'ModularFallbackRelationshipExtractor', - 'ModularFallbackEnumMapper', - 'FallbackBasicAnalyzer', - - # Advanced features - 'SCIPCacheManager', - 'BatchProcessor', - 'CacheEntry', - 'StreamingIndexer', - 'IndexingProgress', - 'IndexMerger', - 'SCIPFrameworkAPI', - 'SCIPConfig', - 'create_scip_framework' -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/__init__.py b/src/code_index_mcp/scip/framework/base/__init__.py deleted file mode 100644 index 65456c8..0000000 --- a/src/code_index_mcp/scip/framework/base/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Base classes for SCIP framework components.""" - -from .index_factory import SCIPIndexFactory -from .relationship_extractor import BaseRelationshipExtractor -from .enum_mapper import BaseEnumMapper -from .language_analyzer import BaseLanguageAnalyzer - -__all__ = [ - 'SCIPIndexFactory', - 'BaseRelationshipExtractor', - 'BaseEnumMapper', - 'BaseLanguageAnalyzer', -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/enum_mapper.py b/src/code_index_mcp/scip/framework/base/enum_mapper.py deleted file mode 100644 index c929bee..0000000 --- a/src/code_index_mcp/scip/framework/base/enum_mapper.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Base enum mapper class for SCIP compliance.""" - -from abc import ABC, abstractmethod - - -class BaseEnumMapper(ABC): - """Base enum mapper class - mandatory implementation for all languages.""" - - @abstractmethod - def map_symbol_kind(self, language_kind: str) -> int: - """Map language-specific type to SCIP SymbolKind.""" - pass - - @abstractmethod - def map_syntax_kind(self, language_syntax: str) -> int: - """Map language-specific syntax to SCIP SyntaxKind.""" - pass - - @abstractmethod - def map_symbol_role(self, language_role: str) -> int: - """Map language-specific role to SCIP SymbolRole.""" - pass - - def validate_enum_value(self, enum_value: int, enum_type: str) -> bool: - """Validate enum value validity.""" - valid_ranges = { - 'SymbolKind': range(0, 65), # Updated range based on actual protobuf - 'SyntaxKind': range(0, 30), # 0-29 according to SCIP standard - 'SymbolRole': [1, 2, 4, 8, 16, 32] # Bit flags - } - - if enum_type in valid_ranges: - if enum_type == 'SymbolRole': - return enum_value in valid_ranges[enum_type] or any(enum_value & flag for flag in valid_ranges[enum_type]) - else: - return enum_value in valid_ranges[enum_type] - - return False \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/index_factory.py b/src/code_index_mcp/scip/framework/base/index_factory.py deleted file mode 100644 index 068c3d9..0000000 --- a/src/code_index_mcp/scip/framework/base/index_factory.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Abstract factory base class for SCIP index generation with guaranteed completeness.""" - -from abc import ABC, abstractmethod -from typing import Set, List, Iterator -from ..types import SCIPContext -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from .relationship_extractor import BaseRelationshipExtractor -from .enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 -from ...core.relationship_types import InternalRelationshipType - - -class SCIPIndexFactory(ABC): - """Abstract factory for SCIP index generation with guaranteed completeness.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """ - Constructor injection ensures all required components are provided. - - Args: - project_root: Root directory of the project - symbol_generator: SCIP symbol ID generator - relationship_extractor: Language-specific relationship extractor - enum_mapper: Language-specific enum mapper - position_calculator: UTF-8 compliant position calculator - """ - self.project_root = project_root - self.symbol_generator = symbol_generator - self.relationship_extractor = relationship_extractor - self.enum_mapper = enum_mapper - self.position_calculator = position_calculator - - @abstractmethod - def get_language(self) -> str: - """Return the language identifier.""" - pass - - @abstractmethod - def get_supported_extensions(self) -> Set[str]: - """Return supported file extensions.""" - pass - - @abstractmethod - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract symbol definitions from source code.""" - pass - - @abstractmethod - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract symbol occurrences from source code.""" - pass - - def create_document(self, file_path: str, content: str) -> scip_pb2.Document: - """ - Create complete SCIP document with all essential components. - - This method is final and ensures all components are used. - """ - document = scip_pb2.Document() - document.relative_path = self._get_relative_path(file_path) - document.language = self.get_language() - - # Create processing context - context = SCIPContext(file_path, content, [], {}) - - # Extract symbols (guaranteed to be implemented) - symbols = list(self._extract_symbols(context)) - document.symbols.extend(symbols) - - # Extract occurrences (guaranteed to be implemented) - occurrences = list(self._extract_occurrences(context)) - document.occurrences.extend(occurrences) - - # Extract relationships (guaranteed to be available) - relationships = list(self.relationship_extractor.extract_all_relationships(context)) - self._add_relationships_to_document(document, relationships) - - return document - - def build_complete_index(self, files: List[str]) -> scip_pb2.Index: - """Build complete SCIP index with all 6 essential content categories.""" - index = scip_pb2.Index() - - # 1. Create metadata - index.metadata.CopyFrom(self.create_metadata()) - - # 2. Process all documents - documents = [] - for file_path in files: - if self.can_handle_file(file_path): - document = self.create_document(file_path, self._read_file(file_path)) - documents.append(document) - - index.documents.extend(documents) - - # 3. Extract external symbols - external_symbols = self.extract_external_symbols(documents) - index.external_symbols.extend(external_symbols) - - return index - - def create_metadata(self) -> scip_pb2.Metadata: - """Create standard SCIP metadata.""" - metadata = scip_pb2.Metadata() - metadata.version = scip_pb2.UnspecifiedProtocolVersion - metadata.tool_info.name = "code-index-mcp" - metadata.tool_info.version = "2.1.1" - metadata.tool_info.arguments.extend(["scip-indexing", self.get_language()]) - metadata.project_root = self.project_root - metadata.text_document_encoding = scip_pb2.UTF8 - return metadata - - @abstractmethod - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract external symbols from imports and dependencies.""" - pass - - @abstractmethod - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build cross-document relationships for language-specific processing. - - This method should analyze the provided documents and create relationships - between symbols across different files, taking into account the language's - specific module system and import semantics. - - Args: - documents: List of SCIP documents for this language - full_index: Complete SCIP index with all documents and symbols - - Returns: - Number of cross-document relationships added - """ - pass - - def can_handle_file(self, file_path: str) -> bool: - """Check if this factory can handle the file.""" - import os - extension = os.path.splitext(file_path)[1].lower() - return extension in self.get_supported_extensions() - - def _get_relative_path(self, file_path: str) -> str: - """Get relative path from project root.""" - import os - return os.path.relpath(file_path, self.project_root) - - def _read_file(self, file_path: str) -> str: - """Read file content.""" - try: - with open(file_path, 'r', encoding='utf-8') as f: - return f.read() - except Exception: - return "" - - def _add_relationships_to_document(self, document: scip_pb2.Document, relationships): - """Add relationships to document symbols.""" - # Build a map of symbol_id -> SymbolInformation for quick lookup - symbol_map = {} - for symbol_info in document.symbols: - symbol_map[symbol_info.symbol] = symbol_info - - # Process each relationship - for rel in relationships: - # Add forward relationship (source -> target) - if rel.source_symbol in symbol_map: - source_symbol_info = symbol_map[rel.source_symbol] - - # Create SCIP Relationship - scip_rel = scip_pb2.Relationship() - scip_rel.symbol = rel.target_symbol - - # Map relationship type to SCIP flags - if rel.relationship_type == InternalRelationshipType.CALLS: - scip_rel.is_reference = True - elif rel.relationship_type == InternalRelationshipType.INHERITS: - scip_rel.is_reference = True - elif rel.relationship_type == InternalRelationshipType.IMPLEMENTS: - scip_rel.is_implementation = True - elif rel.relationship_type == InternalRelationshipType.IMPORTS: - scip_rel.is_reference = True - elif rel.relationship_type == InternalRelationshipType.CONTAINS: - scip_rel.is_definition = True - else: - scip_rel.is_reference = True # Default - - # Add to source symbol's relationships - source_symbol_info.relationships.append(scip_rel) - - # Add reverse relationship for called_by (target -> source) - if rel.relationship_type == InternalRelationshipType.CALLS: - if rel.target_symbol in symbol_map: - target_symbol_info = symbol_map[rel.target_symbol] - - # Create reverse relationship for called_by - reverse_rel = scip_pb2.Relationship() - reverse_rel.symbol = rel.source_symbol - reverse_rel.is_reference = True # called_by is a reference - - # Add to target symbol's relationships - target_symbol_info.relationships.append(reverse_rel) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/language_analyzer.py b/src/code_index_mcp/scip/framework/base/language_analyzer.py deleted file mode 100644 index 358cbd0..0000000 --- a/src/code_index_mcp/scip/framework/base/language_analyzer.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Base language analyzer class for different parsing approaches.""" - -from abc import ABC, abstractmethod -from typing import Dict, List, Any, Optional - - -class BaseLanguageAnalyzer(ABC): - """Base class for language-specific analyzers (AST, regex, tree-sitter, etc.).""" - - @abstractmethod - def parse(self, content: str, filename: str = ""): - """Parse source code content into an internal representation.""" - pass - - @abstractmethod - def is_symbol_definition(self, node) -> bool: - """Check if a node represents a symbol definition.""" - pass - - @abstractmethod - def is_symbol_reference(self, node) -> bool: - """Check if a node represents a symbol reference.""" - pass - - @abstractmethod - def get_symbol_name(self, node) -> Optional[str]: - """Extract symbol name from a node.""" - pass - - @abstractmethod - def get_node_position(self, node) -> tuple: - """Get position information from a node.""" - pass - - def extract_symbols(self, content: str) -> List[Dict[str, Any]]: - """Extract all symbols from content - default implementation.""" - symbols = [] - try: - parsed = self.parse(content) - nodes = self.walk(parsed) if hasattr(self, 'walk') else [parsed] - - for node in nodes: - if self.is_symbol_definition(node): - symbol_name = self.get_symbol_name(node) - if symbol_name: - position = self.get_node_position(node) - symbols.append({ - 'name': symbol_name, - 'position': position, - 'node': node - }) - except Exception: - pass - - return symbols - - def extract_references(self, content: str) -> List[Dict[str, Any]]: - """Extract all symbol references from content - default implementation.""" - references = [] - try: - parsed = self.parse(content) - nodes = self.walk(parsed) if hasattr(self, 'walk') else [parsed] - - for node in nodes: - if self.is_symbol_reference(node): - symbol_name = self.get_symbol_name(node) - if symbol_name: - position = self.get_node_position(node) - references.append({ - 'name': symbol_name, - 'position': position, - 'node': node - }) - except Exception: - pass - - return references \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/base/relationship_extractor.py b/src/code_index_mcp/scip/framework/base/relationship_extractor.py deleted file mode 100644 index 1a851dd..0000000 --- a/src/code_index_mcp/scip/framework/base/relationship_extractor.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Base class for all language-specific relationship extractors.""" - -from abc import ABC, abstractmethod -from typing import Iterator -from ..types import SCIPContext, Relationship - - -class BaseRelationshipExtractor(ABC): - """Base class for all language-specific relationship extractors.""" - - @abstractmethod - def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract inheritance relationships - required for all OOP languages.""" - pass - - @abstractmethod - def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract call relationships - required for all languages.""" - pass - - @abstractmethod - def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract import/dependency relationships - required for all languages.""" - pass - - def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract composition relationships - optional implementation.""" - return iter([]) - - def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract interface relationships - optional implementation.""" - return iter([]) - - def extract_all_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract all relationships using implemented methods.""" - # Yield from all relationship extraction methods - yield from self.extract_inheritance_relationships(context) - yield from self.extract_call_relationships(context) - yield from self.extract_import_relationships(context) - yield from self.extract_composition_relationships(context) - yield from self.extract_interface_relationships(context) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/caching_system.py b/src/code_index_mcp/scip/framework/caching_system.py deleted file mode 100644 index eaa0392..0000000 --- a/src/code_index_mcp/scip/framework/caching_system.py +++ /dev/null @@ -1,346 +0,0 @@ -"""SCIP Framework Caching System - Performance optimization with intelligent caching.""" - -import logging -import hashlib -import pickle -import os -import time -from typing import Dict, Any, Optional, List, Tuple -from datetime import datetime, timedelta -from dataclasses import dataclass -from pathlib import Path - -from ..proto import scip_pb2 - -logger = logging.getLogger(__name__) - - -@dataclass -class CacheEntry: - """Cache entry with metadata.""" - data: Any - created_at: datetime - file_hash: str - access_count: int = 0 - last_accessed: Optional[datetime] = None - - -class SCIPCacheManager: - """Advanced caching system for SCIP framework with intelligent invalidation.""" - - def __init__(self, cache_dir: Optional[str] = None, max_memory_entries: int = 1000): - """Initialize cache manager.""" - self.cache_dir = Path(cache_dir) if cache_dir else Path.cwd() / ".scip_cache" - self.cache_dir.mkdir(exist_ok=True) - - # In-memory cache for frequently accessed items - self._memory_cache: Dict[str, CacheEntry] = {} - self.max_memory_entries = max_memory_entries - - # File modification tracking - self._file_hashes: Dict[str, str] = {} - - # Performance metrics - self._cache_hits = 0 - self._cache_misses = 0 - self._cache_invalidations = 0 - - logger.debug(f"Initialized SCIP cache manager with directory: {self.cache_dir}") - - def get_document_cache(self, file_path: str) -> Optional[scip_pb2.Document]: - """Get cached document if valid.""" - cache_key = self._get_cache_key("document", file_path) - - # Check if file has been modified - if self._is_file_modified(file_path): - self._invalidate_file_cache(file_path) - return None - - # Try memory cache first - if cache_key in self._memory_cache: - entry = self._memory_cache[cache_key] - entry.access_count += 1 - entry.last_accessed = datetime.now() - self._cache_hits += 1 - logger.debug(f"Memory cache hit for document: {file_path}") - return entry.data - - # Try disk cache - disk_entry = self._load_from_disk(cache_key) - if disk_entry: - # Move to memory cache for faster access - self._memory_cache[cache_key] = disk_entry - self._cache_hits += 1 - logger.debug(f"Disk cache hit for document: {file_path}") - return disk_entry.data - - self._cache_misses += 1 - return None - - def cache_document(self, file_path: str, document: scip_pb2.Document) -> None: - """Cache document with file modification tracking.""" - cache_key = self._get_cache_key("document", file_path) - file_hash = self._calculate_file_hash(file_path) - - entry = CacheEntry( - data=document, - created_at=datetime.now(), - file_hash=file_hash - ) - - # Store in memory cache - self._memory_cache[cache_key] = entry - self._file_hashes[file_path] = file_hash - - # Evict old entries if memory cache is full - self._evict_old_entries() - - # Store on disk for persistence - self._save_to_disk(cache_key, entry) - - logger.debug(f"Cached document: {file_path}") - - def get_symbol_cache(self, symbol_id: str) -> Optional[scip_pb2.SymbolInformation]: - """Get cached symbol information.""" - cache_key = self._get_cache_key("symbol", symbol_id) - - if cache_key in self._memory_cache: - entry = self._memory_cache[cache_key] - entry.access_count += 1 - entry.last_accessed = datetime.now() - self._cache_hits += 1 - return entry.data - - disk_entry = self._load_from_disk(cache_key) - if disk_entry: - self._memory_cache[cache_key] = disk_entry - self._cache_hits += 1 - return disk_entry.data - - self._cache_misses += 1 - return None - - def cache_symbol(self, symbol_id: str, symbol_info: scip_pb2.SymbolInformation) -> None: - """Cache symbol information.""" - cache_key = self._get_cache_key("symbol", symbol_id) - - entry = CacheEntry( - data=symbol_info, - created_at=datetime.now(), - file_hash="" # Symbols don't have associated files directly - ) - - self._memory_cache[cache_key] = entry - self._save_to_disk(cache_key, entry) - - logger.debug(f"Cached symbol: {symbol_id}") - - def get_relationship_cache(self, source_symbol: str, target_symbol: str) -> Optional[List[str]]: - """Get cached relationships between symbols.""" - cache_key = self._get_cache_key("relationship", f"{source_symbol}::{target_symbol}") - - if cache_key in self._memory_cache: - entry = self._memory_cache[cache_key] - entry.access_count += 1 - self._cache_hits += 1 - return entry.data - - self._cache_misses += 1 - return None - - def cache_relationships(self, source_symbol: str, target_symbol: str, relationships: List[str]) -> None: - """Cache relationships between symbols.""" - cache_key = self._get_cache_key("relationship", f"{source_symbol}::{target_symbol}") - - entry = CacheEntry( - data=relationships, - created_at=datetime.now(), - file_hash="" - ) - - self._memory_cache[cache_key] = entry - logger.debug(f"Cached relationships: {source_symbol} -> {target_symbol}") - - def invalidate_file_cache(self, file_path: str) -> None: - """Invalidate all cache entries related to a file.""" - self._invalidate_file_cache(file_path) - - def invalidate_all_cache(self) -> None: - """Clear all caches.""" - self._memory_cache.clear() - self._file_hashes.clear() - - # Clear disk cache - for cache_file in self.cache_dir.glob("*.cache"): - try: - cache_file.unlink() - except OSError as e: - logger.warning(f"Failed to delete cache file {cache_file}: {e}") - - self._cache_invalidations += 1 - logger.info("Invalidated all caches") - - def get_cache_statistics(self) -> Dict[str, Any]: - """Get cache performance statistics.""" - total_requests = self._cache_hits + self._cache_misses - hit_rate = (self._cache_hits / total_requests) if total_requests > 0 else 0 - - return { - "cache_hits": self._cache_hits, - "cache_misses": self._cache_misses, - "hit_rate": f"{hit_rate:.2%}", - "memory_entries": len(self._memory_cache), - "max_memory_entries": self.max_memory_entries, - "cache_invalidations": self._cache_invalidations, - "tracked_files": len(self._file_hashes), - "cache_directory": str(self.cache_dir) - } - - def _get_cache_key(self, cache_type: str, identifier: str) -> str: - """Generate cache key for identifier.""" - return f"{cache_type}_{hashlib.md5(identifier.encode()).hexdigest()}" - - def _calculate_file_hash(self, file_path: str) -> str: - """Calculate hash of file content.""" - try: - with open(file_path, 'rb') as f: - return hashlib.md5(f.read()).hexdigest() - except (OSError, IOError) as e: - logger.warning(f"Failed to calculate hash for {file_path}: {e}") - return "" - - def _is_file_modified(self, file_path: str) -> bool: - """Check if file has been modified since last cache.""" - if file_path not in self._file_hashes: - return True - - current_hash = self._calculate_file_hash(file_path) - return current_hash != self._file_hashes[file_path] - - def _invalidate_file_cache(self, file_path: str) -> None: - """Invalidate cache entries for a specific file.""" - # Remove from file hash tracking - if file_path in self._file_hashes: - del self._file_hashes[file_path] - - # Find and remove related cache entries - document_key = self._get_cache_key("document", file_path) - if document_key in self._memory_cache: - del self._memory_cache[document_key] - - # Remove from disk cache - cache_file = self.cache_dir / f"{document_key}.cache" - if cache_file.exists(): - try: - cache_file.unlink() - except OSError as e: - logger.warning(f"Failed to delete cache file {cache_file}: {e}") - - self._cache_invalidations += 1 - logger.debug(f"Invalidated cache for file: {file_path}") - - def _evict_old_entries(self) -> None: - """Evict least recently used entries when memory cache is full.""" - if len(self._memory_cache) <= self.max_memory_entries: - return - - # Sort by last accessed time (least recent first) - sorted_entries = sorted( - self._memory_cache.items(), - key=lambda x: x[1].last_accessed or x[1].created_at - ) - - # Remove oldest 10% of entries - entries_to_remove = max(1, len(sorted_entries) // 10) - for i in range(entries_to_remove): - key_to_remove = sorted_entries[i][0] - del self._memory_cache[key_to_remove] - - logger.debug(f"Evicted {entries_to_remove} cache entries") - - def _save_to_disk(self, cache_key: str, entry: CacheEntry) -> None: - """Save cache entry to disk.""" - try: - cache_file = self.cache_dir / f"{cache_key}.cache" - with open(cache_file, 'wb') as f: - pickle.dump(entry, f) - except (OSError, IOError, pickle.PickleError) as e: - logger.warning(f"Failed to save cache entry {cache_key}: {e}") - - def _load_from_disk(self, cache_key: str) -> Optional[CacheEntry]: - """Load cache entry from disk.""" - try: - cache_file = self.cache_dir / f"{cache_key}.cache" - if not cache_file.exists(): - return None - - # Check if cache file is too old (older than 24 hours) - if time.time() - cache_file.stat().st_mtime > 86400: # 24 hours - cache_file.unlink() - return None - - with open(cache_file, 'rb') as f: - entry = pickle.load(f) - entry.last_accessed = datetime.now() - return entry - - except (OSError, IOError, pickle.PickleError) as e: - logger.warning(f"Failed to load cache entry {cache_key}: {e}") - return None - - -class BatchProcessor: - """Batch processing system for optimized SCIP index generation.""" - - def __init__(self, cache_manager: SCIPCacheManager, batch_size: int = 50): - """Initialize batch processor.""" - self.cache_manager = cache_manager - self.batch_size = batch_size - self._pending_documents: List[Tuple[str, str]] = [] # (file_path, content) - self._processed_count = 0 - - def add_file(self, file_path: str, content: str) -> None: - """Add file to processing batch.""" - self._pending_documents.append((file_path, content)) - - # Process batch when it reaches the target size - if len(self._pending_documents) >= self.batch_size: - self.process_batch() - - def process_batch(self) -> List[scip_pb2.Document]: - """Process current batch of files.""" - if not self._pending_documents: - return [] - - logger.info(f"Processing batch of {len(self._pending_documents)} files") - documents = [] - - for file_path, content in self._pending_documents: - # Check cache first - cached_doc = self.cache_manager.get_document_cache(file_path) - if cached_doc: - documents.append(cached_doc) - logger.debug(f"Using cached document for {file_path}") - else: - # Process file (this would be implemented by the specific factory) - logger.debug(f"Processing file {file_path}") - # Placeholder for actual processing - documents.append(scip_pb2.Document()) - - self._processed_count += len(self._pending_documents) - self._pending_documents.clear() - - logger.info(f"Completed batch processing. Total processed: {self._processed_count}") - return documents - - def finalize(self) -> List[scip_pb2.Document]: - """Process any remaining files in the batch.""" - return self.process_batch() - - def get_stats(self) -> Dict[str, int]: - """Get batch processing statistics.""" - return { - "processed_files": self._processed_count, - "pending_files": len(self._pending_documents), - "batch_size": self.batch_size - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/compliance_validator.py b/src/code_index_mcp/scip/framework/compliance_validator.py deleted file mode 100644 index dca9eb2..0000000 --- a/src/code_index_mcp/scip/framework/compliance_validator.py +++ /dev/null @@ -1,319 +0,0 @@ -"""SCIP Compliance Validator - Runtime verification for SCIP standard compliance.""" - -import logging -import re -from typing import List, Dict, Optional, Tuple, Any -from .types import SCIPPositionInfo -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -class SCIPComplianceValidator: - """SCIP compliance validator for runtime verification of generated content.""" - - # SCIP symbol ID format patterns - LOCAL_SYMBOL_PATTERN = re.compile(r'^local\s+.+$') - GLOBAL_SYMBOL_PATTERN = re.compile(r'^[^\s]+\s+[^\s]+\s+[^\s]+(\s+[^\s]+)?\s+.+$') - - def __init__(self): - """Initialize compliance validator.""" - self.validation_errors = [] - self.validation_warnings = [] - - def validate_document(self, document: scip_pb2.Document) -> bool: - """ - Validate complete SCIP document for compliance. - - Args: - document: SCIP Document to validate - - Returns: - True if document is compliant, False otherwise - """ - self.clear_validation_results() - - try: - # Validate document structure - self._validate_document_structure(document) - - # Validate all symbol occurrences - for occurrence in document.occurrences: - self._validate_occurrence(occurrence) - - # Validate all symbol information - for symbol_info in document.symbols: - self._validate_symbol_information(symbol_info) - - # Check for consistency between occurrences and symbols - self._validate_occurrence_symbol_consistency(document) - - # Log validation results - if self.validation_errors: - logger.error(f"Document validation failed with {len(self.validation_errors)} errors") - for error in self.validation_errors: - logger.error(f" - {error}") - return False - - if self.validation_warnings: - logger.warning(f"Document validation completed with {len(self.validation_warnings)} warnings") - for warning in self.validation_warnings: - logger.warning(f" - {warning}") - - logger.debug("Document validation passed") - return True - - except Exception as e: - self._add_error(f"Validation exception: {e}") - return False - - def validate_index(self, index: scip_pb2.Index) -> bool: - """ - Validate complete SCIP index for compliance. - - Args: - index: SCIP Index to validate - - Returns: - True if index is compliant, False otherwise - """ - self.clear_validation_results() - - try: - # Validate index metadata - if index.HasField('metadata'): - self._validate_metadata(index.metadata) - else: - self._add_error("Index missing required metadata") - - # Validate all documents - for document in index.documents: - if not self.validate_document(document): - self._add_error(f"Document validation failed: {document.relative_path}") - - # Validate external symbols - for external_symbol in index.external_symbols: - self._validate_symbol_information(external_symbol) - - return len(self.validation_errors) == 0 - - except Exception as e: - self._add_error(f"Index validation exception: {e}") - return False - - def validate_symbol_id(self, symbol_id: str) -> bool: - """ - Validate symbol ID against SCIP grammar. - - Args: - symbol_id: Symbol ID to validate - - Returns: - True if valid, False otherwise - """ - if not symbol_id: - return False - - if symbol_id.startswith('local '): - return self._validate_local_symbol(symbol_id[6:]) - else: - return self._validate_global_symbol(symbol_id) - - def validate_position(self, position: SCIPPositionInfo, content: str) -> bool: - """ - Validate position information against content. - - Args: - position: Position to validate - content: Source content - - Returns: - True if position is valid, False otherwise - """ - try: - # Basic position validation - if not position.validate(): - return False - - # Document bounds validation - if not self._is_within_document_bounds(position, content): - return False - - # UTF-8 compliance validation - if not self._is_utf8_compliant(position, content): - return False - - return True - - except Exception as e: - logger.error(f"Position validation error: {e}") - return False - - def _validate_document_structure(self, document: scip_pb2.Document) -> None: - """Validate basic document structure.""" - if not document.relative_path: - self._add_error("Document missing relative_path") - - if not document.language: - self._add_warning("Document missing language specification") - - # Check path format - if '\\' in document.relative_path: - self._add_warning("Document path should use forward slashes") - - def _validate_occurrence(self, occurrence: scip_pb2.Occurrence) -> None: - """Validate SCIP occurrence.""" - # Validate symbol ID - if not self.validate_symbol_id(occurrence.symbol): - self._add_error(f"Invalid symbol ID in occurrence: {occurrence.symbol}") - - # Validate symbol roles - if not self._validate_symbol_roles(occurrence.symbol_roles): - self._add_error(f"Invalid symbol roles: {occurrence.symbol_roles}") - - # Validate syntax kind - if not self._validate_syntax_kind(occurrence.syntax_kind): - self._add_error(f"Invalid syntax kind: {occurrence.syntax_kind}") - - # Validate range - if occurrence.HasField('range'): - self._validate_range(occurrence.range) - - def _validate_symbol_information(self, symbol_info: scip_pb2.SymbolInformation) -> None: - """Validate SCIP symbol information.""" - # Validate symbol ID - if not self.validate_symbol_id(symbol_info.symbol): - self._add_error(f"Invalid symbol ID in symbol info: {symbol_info.symbol}") - - # Validate symbol kind - if not self._validate_symbol_kind(symbol_info.kind): - self._add_error(f"Invalid symbol kind: {symbol_info.kind}") - - # Validate display name - if not symbol_info.display_name: - self._add_warning(f"Symbol missing display name: {symbol_info.symbol}") - - def _validate_metadata(self, metadata: scip_pb2.Metadata) -> None: - """Validate SCIP metadata.""" - if not metadata.HasField('tool_info'): - self._add_error("Metadata missing tool_info") - else: - if not metadata.tool_info.name: - self._add_error("Metadata tool_info missing name") - if not metadata.tool_info.version: - self._add_warning("Metadata tool_info missing version") - - if not metadata.project_root: - self._add_error("Metadata missing project_root") - - # Validate text encoding - if metadata.text_document_encoding == scip_pb2.UnspecifiedTextDocumentEncoding: - self._add_warning("Metadata has unspecified text encoding") - - def _validate_range(self, range_obj: scip_pb2.Range) -> None: - """Validate SCIP range object.""" - if len(range_obj.start) < 2 or len(range_obj.end) < 2: - self._add_error("Range missing start or end positions (need [line, character])") - return - - start_line, start_char = range_obj.start[0], range_obj.start[1] - end_line, end_char = range_obj.end[0], range_obj.end[1] - - # Validate position ordering - if start_line > end_line or (start_line == end_line and start_char > end_char): - self._add_error(f"Invalid range: start position after end position") - - # Validate non-negative positions - if start_line < 0 or start_char < 0 or end_line < 0 or end_char < 0: - self._add_error("Range positions cannot be negative") - - def _validate_occurrence_symbol_consistency(self, document: scip_pb2.Document) -> None: - """Validate consistency between occurrences and symbol definitions.""" - defined_symbols = {symbol.symbol for symbol in document.symbols} - referenced_symbols = {occ.symbol for occ in document.occurrences} - - # Check for undefined symbols (warnings, not errors) - undefined_refs = referenced_symbols - defined_symbols - for undefined_symbol in undefined_refs: - if undefined_symbol.startswith('local '): - self._add_warning(f"Reference to undefined local symbol: {undefined_symbol}") - - def _validate_local_symbol(self, local_id: str) -> bool: - """Validate local symbol format.""" - return bool(local_id and not local_id.startswith(' ') and not local_id.endswith(' ')) - - def _validate_global_symbol(self, symbol_id: str) -> bool: - """Validate global symbol format.""" - parts = symbol_id.split(' ') - return len(parts) >= 3 and all(part.strip() for part in parts) - - def _validate_symbol_kind(self, kind: int) -> bool: - """Validate SymbolKind enum value.""" - return 0 <= kind <= 64 # SCIP SymbolKind range (updated to match actual protobuf) - - def _validate_syntax_kind(self, kind: int) -> bool: - """Validate SyntaxKind enum value.""" - return 0 <= kind <= 29 # SCIP SyntaxKind range - - def _validate_symbol_roles(self, roles: int) -> bool: - """Validate SymbolRole bit flags.""" - valid_flags = [1, 2, 4, 8, 16, 32] # Definition, Import, WriteAccess, ReadAccess, Generated, Test - - if roles in valid_flags: - return True - - # Check if it's a valid combination of flags - return (roles & ~sum(valid_flags)) == 0 and roles > 0 - - def _is_within_document_bounds(self, position: SCIPPositionInfo, content: str) -> bool: - """Check if position is within document boundaries.""" - lines = content.split('\n') - return ( - 0 <= position.start_line < len(lines) and - 0 <= position.end_line < len(lines) and - 0 <= position.start_column <= len(lines[position.start_line]) and - 0 <= position.end_column <= len(lines[position.end_line]) - ) - - def _is_utf8_compliant(self, position: SCIPPositionInfo, content: str) -> bool: - """Validate UTF-8 character position accuracy.""" - try: - lines = content.split('\n') - - # Test encoding/decoding at position boundaries - if position.start_line < len(lines): - start_line_text = lines[position.start_line][:position.start_column] - start_line_text.encode('utf-8').decode('utf-8') - - if position.end_line < len(lines): - end_line_text = lines[position.end_line][:position.end_column] - end_line_text.encode('utf-8').decode('utf-8') - - return True - - except (UnicodeEncodeError, UnicodeDecodeError, IndexError): - return False - - def _add_error(self, message: str) -> None: - """Add validation error.""" - self.validation_errors.append(message) - - def _add_warning(self, message: str) -> None: - """Add validation warning.""" - self.validation_warnings.append(message) - - def clear_validation_results(self) -> None: - """Clear previous validation results.""" - self.validation_errors.clear() - self.validation_warnings.clear() - - def get_validation_summary(self) -> dict: - """Get summary of validation results.""" - return { - 'errors': len(self.validation_errors), - 'warnings': len(self.validation_warnings), - 'error_messages': self.validation_errors.copy(), - 'warning_messages': self.validation_warnings.copy(), - 'is_valid': len(self.validation_errors) == 0 - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/__init__.py b/src/code_index_mcp/scip/framework/fallback/__init__.py deleted file mode 100644 index e9cce6e..0000000 --- a/src/code_index_mcp/scip/framework/fallback/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Fallback SCIP Framework Module - For unsupported languages and files.""" - -from .factory import FallbackSCIPIndexFactory, create_fallback_scip_factory -from .relationship_extractor import FallbackRelationshipExtractor -from .enum_mapper import FallbackEnumMapper -from .basic_analyzer import FallbackBasicAnalyzer - -__all__ = [ - 'FallbackSCIPIndexFactory', - 'create_fallback_scip_factory', - 'FallbackRelationshipExtractor', - 'FallbackEnumMapper', - 'FallbackBasicAnalyzer' -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/basic_analyzer.py b/src/code_index_mcp/scip/framework/fallback/basic_analyzer.py deleted file mode 100644 index f561e08..0000000 --- a/src/code_index_mcp/scip/framework/fallback/basic_analyzer.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Fallback basic analyzer implementation.""" - -from typing import Iterator, Optional, Set, List, Dict, Any -from ..types import SCIPContext -from ..base.language_analyzer import BaseLanguageAnalyzer -from pathlib import Path - - -class FallbackBasicAnalyzer(BaseLanguageAnalyzer): - """Fallback analyzer for basic file analysis without parsing.""" - - def __init__(self): - """Initialize the fallback basic analyzer.""" - self._processed_files: Set[str] = set() - - def parse(self, content: str, filename: str = ""): - """Parse content (no-op for fallback, returns file info).""" - return { - 'filename': filename, - 'content_length': len(content), - 'line_count': content.count('\n') + 1, - 'type': 'fallback_file' - } - - def walk(self, tree) -> Iterator: - """Walk tree nodes (returns single file node for fallback).""" - yield tree # Return the entire file as a single "node" - - def is_symbol_definition(self, node) -> bool: - """Check if node represents a symbol definition (file-level only).""" - return isinstance(node, dict) and node.get('type') == 'fallback_file' - - def is_symbol_reference(self, node) -> bool: - """Check if node represents a symbol reference (none for fallback).""" - return False # Fallback doesn't analyze references - - def get_symbol_name(self, node) -> Optional[str]: - """Extract symbol name from node (filename for fallback).""" - if isinstance(node, dict) and 'filename' in node: - return Path(node['filename']).stem - return None - - def get_node_position(self, node) -> tuple: - """Get position information from node.""" - if isinstance(node, dict): - line_count = node.get('line_count', 1) - return (0, 0, line_count - 1, 0) # Start to end of file - return (0, 0, 0, 0) - - def extract_file_info(self, content: str, filename: str) -> Dict[str, Any]: - """Extract basic file information.""" - path = Path(filename) - - return { - 'filename': filename, - 'basename': path.name, - 'stem': path.stem, - 'suffix': path.suffix, - 'content_length': len(content), - 'line_count': content.count('\n') + 1, - 'language': self.detect_language_from_extension(path.suffix), - 'is_binary': self._is_likely_binary(content), - 'encoding': 'utf-8' # Assume UTF-8 for text files - } - - def detect_language_from_extension(self, extension: str) -> str: - """Detect specific language from file extension.""" - extension_mapping = { - # Programming languages - '.c': 'c', - '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.c++': 'cpp', - '.h': 'c', '.hpp': 'cpp', '.hh': 'cpp', '.hxx': 'cpp', - '.js': 'javascript', '.mjs': 'javascript', '.jsx': 'javascript', - '.ts': 'typescript', '.tsx': 'typescript', - '.py': 'python', '.pyi': 'python', '.pyx': 'python', - '.java': 'java', - '.go': 'go', - '.rs': 'rust', - '.rb': 'ruby', - '.cs': 'csharp', - '.php': 'php', - '.swift': 'swift', - '.kt': 'kotlin', '.kts': 'kotlin', - '.scala': 'scala', - '.r': 'r', - '.lua': 'lua', - '.perl': 'perl', '.pl': 'perl', - '.zig': 'zig', - '.dart': 'dart', - '.m': 'objective-c', '.mm': 'objective-c', - - # Web and markup - '.html': 'html', '.htm': 'html', - '.css': 'css', - '.scss': 'scss', '.sass': 'sass', - '.less': 'less', - '.vue': 'vue', - '.svelte': 'svelte', - '.astro': 'astro', - - # Data and config - '.json': 'json', - '.xml': 'xml', - '.yaml': 'yaml', '.yml': 'yaml', - '.toml': 'toml', - '.ini': 'ini', - '.cfg': 'ini', - '.conf': 'ini', - - # Documentation - '.md': 'markdown', '.markdown': 'markdown', - '.mdx': 'mdx', - '.tex': 'latex', - '.rst': 'rst', - - # Database and query - '.sql': 'sql', - '.cql': 'cql', - '.cypher': 'cypher', - '.sparql': 'sparql', - '.graphql': 'graphql', '.gql': 'graphql', - - # Shell and scripts - '.sh': 'shell', '.bash': 'bash', - '.zsh': 'zsh', '.fish': 'fish', - '.ps1': 'powershell', - '.bat': 'batch', '.cmd': 'batch', - - # Template languages - '.handlebars': 'handlebars', '.hbs': 'handlebars', - '.ejs': 'ejs', - '.pug': 'pug', - '.mustache': 'mustache', - - # Other - '.dockerfile': 'dockerfile', - '.gitignore': 'gitignore', - '.env': 'dotenv', - } - - return extension_mapping.get(extension.lower(), 'text') - - def get_file_statistics(self, content: str) -> Dict[str, int]: - """Get basic file statistics.""" - return { - 'total_characters': len(content), - 'total_lines': content.count('\n') + 1, - 'non_empty_lines': len([line for line in content.split('\n') if line.strip()]), - 'blank_lines': content.count('\n') + 1 - len([line for line in content.split('\n') if line.strip()]), - 'estimated_words': len(content.split()) if content.strip() else 0 - } - - def _is_likely_binary(self, content: str, sample_size: int = 1024) -> bool: - """Check if content is likely binary based on null bytes.""" - sample = content[:sample_size] - return '\x00' in sample or any(ord(c) > 127 for c in sample[:100]) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/enum_mapper.py b/src/code_index_mcp/scip/framework/fallback/enum_mapper.py deleted file mode 100644 index 08d338f..0000000 --- a/src/code_index_mcp/scip/framework/fallback/enum_mapper.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Fallback enum mapper implementation.""" - -from typing import Dict, Optional -from ..base.enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 - - -class FallbackEnumMapper(BaseEnumMapper): - """Fallback enum mapper for basic SCIP enum mappings.""" - - def __init__(self): - """Initialize fallback enum mapper with minimal mappings.""" - super().__init__() - - # Minimal symbol kind mappings for fallback - self._symbol_kind_map = { - 'file': scip_pb2.File, - 'text': scip_pb2.File, - 'unknown': scip_pb2.UnspecifiedSymbolKind, - } - - # Minimal symbol role mappings - self._symbol_role_map = { - 'definition': scip_pb2.Definition, - 'reference': scip_pb2.Read, - } - - # Minimal syntax kind mappings - self._syntax_kind_map = { - 'file': scip_pb2.UnspecifiedSyntaxKind, - 'text': scip_pb2.UnspecifiedSyntaxKind, - 'identifier': scip_pb2.IdentifierKeyword, - } - - def map_symbol_kind(self, fallback_kind: str) -> int: - """Map fallback symbol kind to SCIP SymbolKind enum.""" - kind = self._symbol_kind_map.get(fallback_kind.lower()) - if kind is not None: - return kind - - # Default to File for fallback - return scip_pb2.File - - def map_symbol_role(self, fallback_role: str) -> int: - """Map fallback symbol role to SCIP SymbolRole enum.""" - role = self._symbol_role_map.get(fallback_role.lower()) - if role is not None: - return role - - # Default to Definition for fallback - return scip_pb2.Definition - - def map_syntax_kind(self, fallback_syntax: str) -> int: - """Map fallback syntax kind to SCIP SyntaxKind enum.""" - syntax = self._syntax_kind_map.get(fallback_syntax.lower()) - if syntax is not None: - return syntax - - # Default to UnspecifiedSyntaxKind for fallback - return scip_pb2.UnspecifiedSyntaxKind - - def get_symbol_kind_name(self, kind: int) -> Optional[str]: - """Get human-readable name for symbol kind.""" - reverse_map = {v: k for k, v in self._symbol_kind_map.items()} - return reverse_map.get(kind) - - def get_symbol_role_name(self, role: int) -> Optional[str]: - """Get human-readable name for symbol role.""" - reverse_map = {v: k for k, v in self._symbol_role_map.items()} - return reverse_map.get(role) - - def get_syntax_kind_name(self, syntax: int) -> Optional[str]: - """Get human-readable name for syntax kind.""" - reverse_map = {v: k for k, v in self._syntax_kind_map.items()} - return reverse_map.get(syntax) - - def validate_symbol_kind(self, kind: int) -> bool: - """Validate if symbol kind is valid.""" - # Accept all valid SCIP symbol kinds - return 0 <= kind <= 64 - - def validate_symbol_role(self, role: int) -> bool: - """Validate if symbol role is valid.""" - # Accept all valid SCIP symbol roles - return 0 <= role <= 32 - - def validate_syntax_kind(self, syntax: int) -> bool: - """Validate if syntax kind is valid.""" - # Accept all valid SCIP syntax kinds - return 0 <= syntax <= 1000 - - def get_supported_symbol_kinds(self) -> Dict[str, int]: - """Get all supported symbol kinds.""" - return self._symbol_kind_map.copy() - - def get_supported_symbol_roles(self) -> Dict[str, int]: - """Get all supported symbol roles.""" - return self._symbol_role_map.copy() - - def get_supported_syntax_kinds(self) -> Dict[str, int]: - """Get all supported syntax kinds.""" - return self._syntax_kind_map.copy() \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/factory.py b/src/code_index_mcp/scip/framework/fallback/factory.py deleted file mode 100644 index 4d57f4e..0000000 --- a/src/code_index_mcp/scip/framework/fallback/factory.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Fallback SCIP Index Factory implementation.""" - -import os -from pathlib import Path -from typing import Set, List, Iterator, Optional -from ..base.index_factory import SCIPIndexFactory -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..base.enum_mapper import BaseEnumMapper -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from ..types import SCIPContext -from .relationship_extractor import FallbackRelationshipExtractor -from .enum_mapper import FallbackEnumMapper -from .basic_analyzer import FallbackBasicAnalyzer -from ...proto import scip_pb2 -from ....constants import SUPPORTED_EXTENSIONS - - -class FallbackSCIPIndexFactory(SCIPIndexFactory): - """Fallback SCIP Index factory for unsupported languages and files.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """Initialize Fallback factory with required components via constructor injection.""" - super().__init__(project_root, symbol_generator, relationship_extractor, - enum_mapper, position_calculator) - self.basic_analyzer = FallbackBasicAnalyzer() - - def get_language(self) -> str: - """Return language identifier.""" - return "text" - - def get_supported_extensions(self) -> Set[str]: - """Return all supported file extensions as fallback handles everything.""" - return SUPPORTED_EXTENSIONS - - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract minimal symbol information (file-level only).""" - try: - # Only create a file-level symbol for fallback - file_name = Path(context.file_path).stem - if file_name: - symbol_info = self._create_file_symbol(context, file_name) - if symbol_info: - yield symbol_info - - except Exception as e: - # Silently handle errors in fallback - pass - - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract minimal occurrences (file-level only).""" - try: - # Create single occurrence for the entire file - file_name = Path(context.file_path).stem - if file_name: - occurrence = self._create_file_occurrence(context, file_name) - if occurrence: - yield occurrence - - except Exception as e: - # Silently handle errors in fallback - pass - - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract external symbols (none for fallback).""" - return [] # Fallback doesn't analyze external dependencies - - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build cross-document relationships for fallback (no relationships). - - Fallback factory doesn't create cross-document relationships as it handles - unsupported languages with minimal symbol information. - """ - return 0 # No cross-document relationships for fallback - - def _create_file_symbol(self, context: SCIPContext, file_name: str) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information for the file itself.""" - symbol_info = scip_pb2.SymbolInformation() - - # Detect language from file extension - language = self.basic_analyzer.detect_language_from_extension( - Path(context.file_path).suffix - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol( - language=language, - file_path=context.file_path, - symbol_path=[file_name], - descriptor="" - ) - symbol_info.display_name = file_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('file') - symbol_info.documentation.append( - f"File: {context.file_path} ({language})" - ) - - return symbol_info - - def _create_file_occurrence(self, context: SCIPContext, file_name: str) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence for the file itself.""" - occurrence = scip_pb2.Occurrence() - - # Set range to cover entire file (0,0) to (lines, 0) - lines = context.content.count('\n') - occurrence.range.start.extend([0, 0]) - occurrence.range.end.extend([lines, 0]) - - # Detect language from file extension - language = self.basic_analyzer.detect_language_from_extension( - Path(context.file_path).suffix - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol( - language=language, - file_path=context.file_path, - symbol_path=[file_name], - descriptor="" - ) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('file') - - return occurrence - - -def create_fallback_scip_factory(project_root: str) -> FallbackSCIPIndexFactory: - """ - Factory creator for Fallback SCIP factory. - Ensures all required components are properly assembled via constructor injection. - """ - symbol_generator = SCIPSymbolGenerator( - scheme="scip-fallback", - package_manager="generic", - package_name=Path(project_root).name, - version="HEAD" - ) - - relationship_extractor = FallbackRelationshipExtractor() - enum_mapper = FallbackEnumMapper() - position_calculator = SCIPPositionCalculator() - - return FallbackSCIPIndexFactory( - project_root=project_root, - symbol_generator=symbol_generator, - relationship_extractor=relationship_extractor, # Guaranteed to be provided - enum_mapper=enum_mapper, - position_calculator=position_calculator - ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/fallback/relationship_extractor.py b/src/code_index_mcp/scip/framework/fallback/relationship_extractor.py deleted file mode 100644 index facc4d4..0000000 --- a/src/code_index_mcp/scip/framework/fallback/relationship_extractor.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Fallback relationship extractor implementation.""" - -from typing import List, Dict, Set, Optional, Any -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..relationship_manager import SymbolRelationship, RelationshipType -from ..types import SCIPContext - - -class FallbackRelationshipExtractor(BaseRelationshipExtractor): - """Fallback relationship extractor - minimal relationship analysis.""" - - def __init__(self): - """Initialize fallback relationship extractor.""" - super().__init__() - - def extract_symbol_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: - """Extract symbol relationships from fallback context (minimal analysis).""" - relationships = [] - - # For fallback, we only create minimal file-level relationships - try: - file_symbol = self._create_file_symbol_id(context.file_path) - - # Create self-relationship for the file - relationships.append(SymbolRelationship( - source_symbol=file_symbol, - target_symbol=file_symbol, - relationship_type=RelationshipType.CONTAINS, - source_location=(0, 0), - target_location=(0, 0), - context_info={ - "type": "file_self_reference", - "description": f"File contains itself: {context.file_path}" - } - )) - - except Exception: - # Silently handle any errors in fallback mode - pass - - return relationships - - def extract_import_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: - """Extract import relationships (none for fallback).""" - return [] # Fallback doesn't analyze imports - - def extract_inheritance_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: - """Extract inheritance relationships (none for fallback).""" - return [] # Fallback doesn't analyze inheritance - - def extract_call_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: - """Extract call relationships (none for fallback).""" - return [] # Fallback doesn't analyze function calls - - def extract_field_access_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: - """Extract field access relationships (none for fallback).""" - return [] # Fallback doesn't analyze field access - - def extract_type_relationships(self, context: SCIPContext) -> List[SymbolRelationship]: - """Extract type relationships (none for fallback).""" - return [] # Fallback doesn't analyze types - - def resolve_cross_file_references(self, - local_relationships: List[SymbolRelationship], - global_symbol_map: Dict[str, Any]) -> List[SymbolRelationship]: - """Resolve cross-file references (none for fallback).""" - return local_relationships # No cross-file analysis in fallback - - def get_relationship_statistics(self) -> Dict[str, int]: - """Get relationship extraction statistics.""" - return { - "total_relationships": 0, - "import_relationships": 0, - "inheritance_relationships": 0, - "call_relationships": 0, - "field_access_relationships": 0, - "type_relationships": 0, - "cross_file_relationships": 0 - } - - def _create_file_symbol_id(self, file_path: str) -> str: - """Create a simple symbol ID for the file.""" - from pathlib import Path - file_name = Path(file_path).stem - return f"local {file_name}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/index_factory.py b/src/code_index_mcp/scip/framework/index_factory.py deleted file mode 100644 index b78f343..0000000 --- a/src/code_index_mcp/scip/framework/index_factory.py +++ /dev/null @@ -1,337 +0,0 @@ -"""SCIP Index Factory - Abstract factory ensuring complete SCIP Index generation.""" - -import logging -import os -from abc import ABC, abstractmethod -from pathlib import Path -from typing import List, Dict, Optional, Tuple, Any - -from .types import SCIPSymbolDescriptor, SCIPPositionInfo -from .compliance_validator import SCIPComplianceValidator -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -class SCIPIndexFactory(ABC): - """ - Abstract factory ensuring complete SCIP Index generation. - - This factory ensures all generated SCIP indexes contain the 6 essential content categories: - 1. Index Metadata - 2. Document Collection - 3. Symbol Definitions - 4. Symbol Occurrences - 5. Symbol Relationships - 6. External Symbols - """ - - def __init__(self, project_root: str): - """ - Initialize SCIP index factory. - - Args: - project_root: Absolute path to project root - """ - self.project_root = Path(project_root).resolve() - self.project_name = self.project_root.name - self._validator = SCIPComplianceValidator() - - logger.debug(f"Initialized SCIP Index Factory for project: {self.project_name}") - - @abstractmethod - def create_metadata(self, project_root: str) -> scip_pb2.Metadata: - """ - Create standard-compliant metadata (Category 1). - - Args: - project_root: Project root directory - - Returns: - SCIP Metadata object with all required fields - """ - pass - - @abstractmethod - def create_document(self, file_path: str, content: str) -> scip_pb2.Document: - """ - Create complete document with all occurrences and symbols (Category 2). - - Args: - file_path: Path to source file - content: File content - - Returns: - SCIP Document with complete symbol information - """ - pass - - @abstractmethod - def create_symbol_definition(self, - name: str, - kind: str, - scope: List[str], - file_path: str, - position: Optional[SCIPPositionInfo] = None, - documentation: Optional[List[str]] = None) -> scip_pb2.SymbolInformation: - """ - Create SCIP-compliant symbol definition (Category 3). - - Args: - name: Symbol name - kind: Symbol kind (function, class, variable, etc.) - scope: Scope path - file_path: File where symbol is defined - position: Optional position information - documentation: Optional documentation - - Returns: - SCIP SymbolInformation object - """ - pass - - @abstractmethod - def create_symbol_occurrence(self, - symbol_id: str, - position: SCIPPositionInfo, - role: str, - syntax: str) -> scip_pb2.Occurrence: - """ - Create SCIP-compliant symbol occurrence (Category 4). - - Args: - symbol_id: SCIP symbol identifier - position: Position information - role: Symbol role (definition, reference, etc.) - syntax: Syntax kind - - Returns: - SCIP Occurrence object - """ - pass - - @abstractmethod - def create_symbol_relationship(self, - source: str, - target: str, - rel_type: str) -> scip_pb2.Relationship: - """ - Create SCIP-compliant symbol relationship (Category 5). - - Args: - source: Source symbol ID - target: Target symbol ID - rel_type: Relationship type (inheritance, call, import, etc.) - - Returns: - SCIP Relationship object - """ - pass - - @abstractmethod - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """ - Extract external symbols from imports and dependencies (Category 6). - - Args: - documents: List of processed documents - - Returns: - List of external symbol information - """ - pass - - def _extract_symbol_relationships(self, files: List[str], symbol_definitions: Dict[str, str], - documents: List[scip_pb2.Document]) -> None: - """ - Extract symbol relationships (Category 5). - - Default implementation does nothing. Subclasses can override to provide - language-specific relationship extraction. - - Args: - files: List of file paths - symbol_definitions: Mapping of symbol names to symbol IDs - documents: List of processed documents to update with relationships - """ - # Default implementation - no relationship extraction - pass - - def build_complete_index(self, files: List[str]) -> scip_pb2.Index: - """ - Build complete SCIP Index with all 6 content categories. - - Args: - files: List of file paths to index - - Returns: - Complete SCIP Index - - Raises: - RuntimeError: If index validation fails - """ - logger.info(f"Building complete SCIP index for {len(files)} files") - - index = scip_pb2.Index() - - # 1. Create metadata (Category 1) - logger.debug("Creating index metadata...") - index.metadata.CopyFrom(self.create_metadata(str(self.project_root))) - - # 2. Process all documents (Category 2) - logger.debug(f"Processing {len(files)} documents...") - documents = [] - symbol_definitions = {} # Track all symbol definitions for relationship extraction - - for file_path in files: - try: - content = self._read_file(file_path) - if content is not None: - doc = self.create_document(file_path, content) - documents.append(doc) - - # Collect symbol definitions for relationship extraction - for symbol_info in doc.symbols: - symbol_definitions[symbol_info.display_name] = symbol_info.symbol - - logger.debug(f"Processed document: {doc.relative_path}") - else: - logger.warning(f"Skipped unreadable file: {file_path}") - except Exception as e: - logger.error(f"Failed to process {file_path}: {e}") - continue - - index.documents.extend(documents) - logger.info(f"Successfully processed {len(documents)} documents") - - # 2.5. Extract relationships (Category 5) - if supported by factory - logger.debug("Extracting symbol relationships...") - try: - self._extract_symbol_relationships(files, symbol_definitions, documents) - logger.info("Completed relationship extraction") - except Exception as e: - logger.warning(f"Relationship extraction failed: {e}") - - # 3. Extract external symbols (Category 6) - logger.debug("Extracting external symbols...") - try: - external_symbols = self.extract_external_symbols(documents) - index.external_symbols.extend(external_symbols) - logger.info(f"Extracted {len(external_symbols)} external symbols") - except Exception as e: - logger.warning(f"Failed to extract external symbols: {e}") - - # 4. Validate complete index - logger.debug("Validating complete index...") - if not self._validator.validate_index(index): - validation_summary = self._validator.get_validation_summary() - error_msg = f"Index validation failed: {validation_summary['error_messages']}" - logger.error(error_msg) - raise RuntimeError(error_msg) - - # Log final statistics - total_occurrences = sum(len(doc.occurrences) for doc in documents) - total_symbols = sum(len(doc.symbols) for doc in documents) - - logger.info(f"Created complete SCIP index:") - logger.info(f" - Documents: {len(documents)}") - logger.info(f" - Occurrences: {total_occurrences}") - logger.info(f" - Symbol Definitions: {total_symbols}") - logger.info(f" - External Symbols: {len(external_symbols)}") - - return index - - def validate_generated_content(self, content: Any) -> bool: - """ - Validate any generated SCIP content for compliance. - - Args: - content: SCIP content to validate (Index, Document, etc.) - - Returns: - True if content is compliant - """ - try: - if isinstance(content, scip_pb2.Index): - return self._validator.validate_index(content) - elif isinstance(content, scip_pb2.Document): - return self._validator.validate_document(content) - else: - logger.warning(f"Unknown content type for validation: {type(content)}") - return False - except Exception as e: - logger.error(f"Validation failed: {e}") - return False - - def get_validation_summary(self) -> dict: - """Get detailed validation summary from last validation operation.""" - return self._validator.get_validation_summary() - - def _read_file(self, file_path: str) -> Optional[str]: - """ - Read file content with encoding detection. - - Args: - file_path: Path to file - - Returns: - File content or None if reading fails - """ - encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] - - for encoding in encodings: - try: - with open(file_path, 'r', encoding=encoding) as f: - return f.read() - except UnicodeDecodeError: - continue - except (OSError, PermissionError, FileNotFoundError) as e: - logger.warning(f"Could not read {file_path}: {e}") - return None - - logger.warning(f"Could not decode {file_path} with any supported encoding") - return None - - def _get_relative_path(self, file_path: str) -> str: - """ - Get relative path from project root. - - Args: - file_path: Absolute or relative file path - - Returns: - Relative path from project root - """ - try: - path = Path(file_path) - if path.is_absolute(): - return str(path.relative_to(self.project_root)).replace('\\', '/') - return file_path.replace('\\', '/') - except ValueError: - # If path is not under project_root, return as-is - return str(Path(file_path)).replace('\\', '/') - - def _validate_symbol_id(self, symbol_id: str) -> bool: - """Validate symbol ID format.""" - return self._validator.validate_symbol_id(symbol_id) - - def _validate_position(self, position: SCIPPositionInfo, content: str) -> bool: - """Validate position information.""" - return self._validator.validate_position(position, content) - - def get_factory_info(self) -> dict: - """Get information about this factory instance.""" - return { - 'project_root': str(self.project_root), - 'project_name': self.project_name, - 'factory_type': self.__class__.__name__, - 'supported_categories': [ - 'Index Metadata', - 'Document Collection', - 'Symbol Definitions', - 'Symbol Occurrences', - 'Symbol Relationships', - 'External Symbols' - ] - } diff --git a/src/code_index_mcp/scip/framework/java/__init__.py b/src/code_index_mcp/scip/framework/java/__init__.py deleted file mode 100644 index f9bd800..0000000 --- a/src/code_index_mcp/scip/framework/java/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Java SCIP framework module.""" - -from .factory import JavaSCIPIndexFactory, create_java_scip_factory -from .enum_mapper import JavaEnumMapper -from .relationship_extractor import JavaRelationshipExtractor -from .tree_sitter_analyzer import JavaTreeSitterAnalyzer - -__all__ = [ - 'JavaSCIPIndexFactory', - 'create_java_scip_factory', - 'JavaEnumMapper', - 'JavaRelationshipExtractor', - 'JavaTreeSitterAnalyzer' -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/enum_mapper.py b/src/code_index_mcp/scip/framework/java/enum_mapper.py deleted file mode 100644 index 13d5f77..0000000 --- a/src/code_index_mcp/scip/framework/java/enum_mapper.py +++ /dev/null @@ -1,200 +0,0 @@ -"""Java enum mapper implementation.""" - -from ..base.enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 - - -class JavaEnumMapper(BaseEnumMapper): - """Java-specific enum mapper for SCIP compliance.""" - - # Java symbol kind mappings - SYMBOL_KIND_MAP = { - 'method': scip_pb2.Method, - 'class': scip_pb2.Class, - 'interface': scip_pb2.Interface, - 'enum': scip_pb2.Enum, - 'field': scip_pb2.Field, - 'variable': scip_pb2.Variable, - 'parameter': scip_pb2.Parameter, - 'constructor': scip_pb2.Constructor, - 'package': scip_pb2.Package, - 'annotation': scip_pb2.Interface, - 'constant': scip_pb2.Constant, - 'local_variable': scip_pb2.Variable, - 'type_parameter': scip_pb2.TypeParameter, - } - - # Java syntax kind mappings - SYNTAX_KIND_MAP = { - 'method_declaration': scip_pb2.IdentifierFunctionDefinition, - 'class_declaration': scip_pb2.IdentifierType, - 'interface_declaration': scip_pb2.IdentifierType, - 'enum_declaration': scip_pb2.IdentifierType, - 'field_declaration': scip_pb2.IdentifierAttribute, - 'variable_declaration': scip_pb2.IdentifierLocal, - 'parameter_declaration': scip_pb2.IdentifierParameter, - 'constructor_declaration': scip_pb2.IdentifierFunctionDefinition, - 'annotation_declaration': scip_pb2.IdentifierType, - 'identifier': scip_pb2.Identifier, - 'keyword': scip_pb2.IdentifierKeyword, - 'string_literal': scip_pb2.StringLiteral, - 'numeric_literal': scip_pb2.NumericLiteral, - 'boolean_literal': scip_pb2.BooleanLiteral, - 'comment': scip_pb2.Comment, - 'punctuation': scip_pb2.PunctuationDelimiter, - } - - # Java symbol role mappings (official SCIP naming) - SYMBOL_ROLE_MAP = { - 'definition': scip_pb2.Definition, - 'import': scip_pb2.Import, - 'write': scip_pb2.Write, # Official SCIP naming - 'read': scip_pb2.Read, # Official SCIP naming - 'generated': scip_pb2.Generated, - 'test': scip_pb2.Test, - 'type': scip_pb2.Type, # Add missing Type role - 'reference': scip_pb2.Read, # Default reference is read access - } - - def map_symbol_kind(self, language_kind: str) -> int: - """Map Java symbol type to SCIP SymbolKind.""" - kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SymbolKind'): - raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") - - return kind - - def map_syntax_kind(self, language_syntax: str) -> int: - """Map Java syntax element to SCIP SyntaxKind.""" - kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SyntaxKind'): - raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") - - return kind - - def map_symbol_role(self, language_role: str) -> int: - """Map Java symbol role to SCIP SymbolRole.""" - role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) - - # Validate enum value - if not self.validate_enum_value(role, 'SymbolRole'): - raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") - - return role - - def get_java_node_symbol_kind(self, node_type: str) -> str: - """ - Map Java tree-sitter node type to internal symbol kind string. - - Args: - node_type: Java tree-sitter node type (e.g., 'method_declaration', 'class_declaration') - - Returns: - Internal symbol kind string for use with map_symbol_kind() - """ - node_kind_map = { - 'method_declaration': 'method', - 'constructor_declaration': 'constructor', - 'class_declaration': 'class', - 'interface_declaration': 'interface', - 'enum_declaration': 'enum', - 'field_declaration': 'field', - 'local_variable_declaration': 'local_variable', - 'formal_parameter': 'parameter', - 'annotation_type_declaration': 'annotation', - 'type_parameter': 'type_parameter', - } - - return node_kind_map.get(node_type, 'variable') - - def get_java_node_syntax_kind(self, node_type: str, context: str = None) -> str: - """ - Map Java tree-sitter node type to internal syntax kind string. - - Args: - node_type: Java tree-sitter node type - context: Additional context for disambiguation - - Returns: - Internal syntax kind string for use with map_syntax_kind() - """ - node_syntax_map = { - 'method_declaration': 'method_declaration', - 'constructor_declaration': 'constructor_declaration', - 'class_declaration': 'class_declaration', - 'interface_declaration': 'interface_declaration', - 'enum_declaration': 'enum_declaration', - 'field_declaration': 'field_declaration', - 'local_variable_declaration': 'variable_declaration', - 'formal_parameter': 'parameter_declaration', - 'annotation_type_declaration': 'annotation_declaration', - 'identifier': 'identifier', - 'string_literal': 'string_literal', - 'decimal_integer_literal': 'numeric_literal', - 'hex_integer_literal': 'numeric_literal', - 'octal_integer_literal': 'numeric_literal', - 'binary_integer_literal': 'numeric_literal', - 'decimal_floating_point_literal': 'numeric_literal', - 'hex_floating_point_literal': 'numeric_literal', - 'true': 'boolean_literal', - 'false': 'boolean_literal', - 'null_literal': 'boolean_literal', - } - - return node_syntax_map.get(node_type, 'identifier') - - def get_java_node_symbol_role(self, node_type: str, context: str = None) -> str: - """ - Map Java tree-sitter node type to internal symbol role string. - - Args: - node_type: Java tree-sitter node type - context: Additional context (e.g., 'in_assignment', 'in_call') - - Returns: - Internal symbol role string for use with map_symbol_role() - """ - if context == 'definition': - return 'definition' - elif context == 'assignment': - return 'write' - elif context == 'import': - return 'import' - elif node_type in ['method_declaration', 'constructor_declaration', 'class_declaration', - 'interface_declaration', 'enum_declaration', 'field_declaration', - 'annotation_type_declaration']: - return 'definition' - else: - return 'reference' - - def is_valid_java_symbol_kind(self, symbol_kind: str) -> bool: - """Check if symbol kind is valid for Java.""" - return symbol_kind in self.SYMBOL_KIND_MAP - - def is_valid_java_syntax_kind(self, syntax_kind: str) -> bool: - """Check if syntax kind is valid for Java.""" - return syntax_kind in self.SYNTAX_KIND_MAP - - def is_valid_java_symbol_role(self, symbol_role: str) -> bool: - """Check if symbol role is valid for Java.""" - return symbol_role in self.SYMBOL_ROLE_MAP - - def get_all_java_symbol_kinds(self) -> list: - """Get all available Java symbol kinds.""" - return list(self.SYMBOL_KIND_MAP.keys()) - - def get_all_java_syntax_kinds(self) -> list: - """Get all available Java syntax kinds.""" - return list(self.SYNTAX_KIND_MAP.keys()) - - def get_all_java_symbol_roles(self) -> list: - """Get all available Java symbol roles.""" - return list(self.SYMBOL_ROLE_MAP.keys()) - - def get_java_type_reference_role(self) -> str: - """Get symbol role for type references (e.g., in generic parameters).""" - return 'type' \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/factory.py b/src/code_index_mcp/scip/framework/java/factory.py deleted file mode 100644 index 8883a2d..0000000 --- a/src/code_index_mcp/scip/framework/java/factory.py +++ /dev/null @@ -1,399 +0,0 @@ -"""Java SCIP Index Factory implementation.""" - -import os -from pathlib import Path -from typing import Set, List, Iterator, Optional -from ..base.index_factory import SCIPIndexFactory -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..base.enum_mapper import BaseEnumMapper -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from ..types import SCIPContext, SCIPSymbolDescriptor -from .relationship_extractor import JavaRelationshipExtractor -from .enum_mapper import JavaEnumMapper -from .tree_sitter_analyzer import JavaTreeSitterAnalyzer -from ...proto import scip_pb2 - -try: - import tree_sitter - from tree_sitter_java import language as java_language - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - - -class JavaSCIPIndexFactory(SCIPIndexFactory): - """Java-specific SCIP Index factory implementation with constructor injection.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """Initialize Java factory with required components via constructor injection.""" - if not TREE_SITTER_AVAILABLE: - raise ImportError("Tree-sitter Java library not available") - - super().__init__(project_root, symbol_generator, relationship_extractor, - enum_mapper, position_calculator) - self.tree_analyzer = JavaTreeSitterAnalyzer() - - def get_language(self) -> str: - """Return language identifier.""" - return "java" - - def get_supported_extensions(self) -> Set[str]: - """Return supported file extensions.""" - return {'.java'} - - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract Java symbol definitions using tree-sitter analysis.""" - try: - tree = self.tree_analyzer.parse(context.content) - - for node in self.tree_analyzer.walk(tree): - if self.tree_analyzer.is_symbol_definition(node): - symbol_info = self._create_symbol_from_tree_node(node, context) - if symbol_info: - yield symbol_info - - except SyntaxError as e: - # Handle syntax errors gracefully - pass - - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract Java symbol occurrences.""" - try: - tree = self.tree_analyzer.parse(context.content) - - for node in self.tree_analyzer.walk(tree): - if (self.tree_analyzer.is_symbol_definition(node) or - self.tree_analyzer.is_symbol_reference(node)): - occurrence = self._create_occurrence_from_tree_node(node, context) - if occurrence: - yield occurrence - - except SyntaxError as e: - # Handle syntax errors gracefully - pass - - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract Java external symbols from imports.""" - external_symbols = [] - - for doc in documents: - try: - content = self._read_file(os.path.join(self.project_root, doc.relative_path)) - tree = self.tree_analyzer.parse(content) - - # Extract import statements - import_statements = self.tree_analyzer.extract_import_statements(tree) - for import_path in import_statements: - external_symbol = self._create_external_symbol_from_import(import_path) - if external_symbol: - external_symbols.append(external_symbol) - - except Exception as e: - # Skip problematic files - continue - - return external_symbols - - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build Java-specific cross-document relationships. - - This implementation provides basic cross-document relationship support - for Java. A more sophisticated implementation would analyze package imports - and class dependencies. - """ - # For now, use a simplified approach - # TODO: Implement proper Java package import analysis - return 0 # Placeholder - no relationships added yet - - def _create_symbol_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information from tree-sitter node.""" - symbol_info = scip_pb2.SymbolInformation() - - symbol_name = self.tree_analyzer.get_symbol_name(node) - if not symbol_name: - return None - - if node.type == 'class_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="class", - scope_path=context.scope_stack, - descriptor_suffix="#" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('class') - - elif node.type == 'interface_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('interface') - - elif node.type == 'enum_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('enum') - - elif node.type == 'method_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('method') - - elif node.type == 'constructor_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('constructor') - - elif node.type == 'field_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('field') - - elif node.type == 'local_variable_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('local_variable') - - elif node.type == 'formal_parameter': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('parameter') - - else: - return None - - return symbol_info - - def _create_occurrence_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence from tree-sitter node.""" - occurrence = scip_pb2.Occurrence() - - # Calculate position using position calculator - try: - position_info = self.position_calculator.calculate_positions_from_tree_node( - context.content, node - ) - - # Set range - occurrence.range.start.extend([position_info.start_line, position_info.start_column]) - occurrence.range.end.extend([position_info.end_line, position_info.end_column]) - - except Exception as e: - # Skip if position calculation fails - return None - - symbol_name = self.tree_analyzer.get_symbol_name(node) - if not symbol_name: - return None - - # Set symbol and roles based on node type - if node.type == 'class_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_declaration') - - elif node.type == 'interface_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('interface_declaration') - - elif node.type == 'method_declaration': - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('method_declaration') - - elif node.type in ['identifier', 'type_identifier']: - # Handle variable references - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') - - else: - return None - - return occurrence - - def _create_external_symbol_from_import(self, import_path: str) -> Optional[scip_pb2.SymbolInformation]: - """Create external symbol from import statement.""" - symbol_info = scip_pb2.SymbolInformation() - - # Determine if it's a standard library or external dependency - if import_path.startswith('java.') or import_path.startswith('javax.'): - symbol_info.symbol = f"java-stdlib {import_path}" - symbol_info.display_name = import_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('package') - symbol_info.documentation.append(f"Java standard library: {import_path}") - else: - symbol_info.symbol = f"java-external {import_path}" - symbol_info.display_name = import_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('package') - symbol_info.documentation.append(f"External Java package: {import_path}") - - return symbol_info - - -def create_java_scip_factory(project_root: str) -> JavaSCIPIndexFactory: - """ - Factory creator for Java SCIP factory. - Ensures all required components are properly assembled via constructor injection. - """ - if not TREE_SITTER_AVAILABLE: - raise ImportError("Tree-sitter Java library not available") - - symbol_generator = SCIPSymbolGenerator( - scheme="scip-java", - package_manager="maven", - package_name=Path(project_root).name, - version="HEAD" - ) - - relationship_extractor = JavaRelationshipExtractor() - enum_mapper = JavaEnumMapper() - position_calculator = SCIPPositionCalculator() - - return JavaSCIPIndexFactory( - project_root=project_root, - symbol_generator=symbol_generator, - relationship_extractor=relationship_extractor, # Guaranteed to be provided - enum_mapper=enum_mapper, - position_calculator=position_calculator - ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/relationship_extractor.py b/src/code_index_mcp/scip/framework/java/relationship_extractor.py deleted file mode 100644 index 092b3ea..0000000 --- a/src/code_index_mcp/scip/framework/java/relationship_extractor.py +++ /dev/null @@ -1,295 +0,0 @@ -"""Java relationship extractor implementation.""" - -from typing import Iterator, Optional, List -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..types import SCIPContext, Relationship -from ...core.relationship_types import InternalRelationshipType - -try: - import tree_sitter - from tree_sitter_java import language as java_language - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - - -class JavaRelationshipExtractor(BaseRelationshipExtractor): - """Java-specific relationship extractor using tree-sitter analysis.""" - - def __init__(self): - """Initialize the Java relationship extractor.""" - if not TREE_SITTER_AVAILABLE: - raise ImportError("Tree-sitter Java library not available") - - java_lang = tree_sitter.Language(java_language()) - self.parser = tree_sitter.Parser(java_lang) - - def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract inheritance relationships from Java classes.""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type == 'class_declaration': - class_name = self._get_class_name(node) - if not class_name: - continue - - class_symbol_id = self._create_class_symbol_id(class_name, context) - - # Look for extends clause - extends_node = self._find_child_by_type(node, 'superclass') - if extends_node: - parent_type = self._find_child_by_type(extends_node, 'type_identifier') - if parent_type: - parent_name = self._get_node_text(parent_type, context.content) - parent_symbol_id = self._create_class_symbol_id(parent_name, context) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract method call relationships.""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type == 'method_declaration': - method_name = self._get_method_name(node) - if not method_name: - continue - - method_symbol_id = self._create_method_symbol_id(method_name, context) - - # Find method invocations within this method - for call_node in self._walk_tree(node): - if call_node.type == 'method_invocation': - target_method = self._get_invocation_target(call_node, context.content) - if target_method and target_method != method_name: - target_symbol_id = self._create_method_symbol_id(target_method, context) - yield Relationship( - source_symbol=method_symbol_id, - target_symbol=target_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract import/dependency relationships.""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - file_symbol_id = self._create_file_symbol_id(context.file_path) - - for node in self._walk_tree(tree.root_node): - if node.type == 'import_declaration': - import_path = self._get_import_path(node, context.content) - if import_path: - # Determine if it's a standard library or external dependency - if import_path.startswith('java.') or import_path.startswith('javax.'): - module_symbol_id = f"java-stdlib {import_path}" - else: - module_symbol_id = f"java-external {import_path}" - - yield Relationship( - source_symbol=file_symbol_id, - target_symbol=module_symbol_id, - relationship_type=InternalRelationshipType.IMPORTS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract composition relationships (class fields).""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type == 'class_declaration': - class_name = self._get_class_name(node) - if not class_name: - continue - - class_symbol_id = self._create_class_symbol_id(class_name, context) - - # Find field declarations in this class - for field_node in self._walk_tree(node): - if field_node.type == 'field_declaration': - field_name = self._get_field_name(field_node, context.content) - if field_name: - field_symbol_id = self._create_field_symbol_id(field_name, class_symbol_id) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=field_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract interface implementation relationships.""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type == 'class_declaration': - class_name = self._get_class_name(node) - if not class_name: - continue - - class_symbol_id = self._create_class_symbol_id(class_name, context) - - # Look for implements clause - implements_node = self._find_child_by_type(node, 'super_interfaces') - if implements_node: - for interface_node in self._find_children_by_type(implements_node, 'type_identifier'): - interface_name = self._get_node_text(interface_node, context.content) - interface_symbol_id = self._create_interface_symbol_id(interface_name, context) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=interface_symbol_id, - relationship_type=InternalRelationshipType.IMPLEMENTS - ) - - elif node.type == 'interface_declaration': - interface_name = self._get_interface_name(node, context.content) - if not interface_name: - continue - - interface_symbol_id = self._create_interface_symbol_id(interface_name, context) - - # Look for extends clause in interface - extends_node = self._find_child_by_type(node, 'extends_interfaces') - if extends_node: - for parent_interface_node in self._find_children_by_type(extends_node, 'type_identifier'): - parent_interface_name = self._get_node_text(parent_interface_node, context.content) - parent_symbol_id = self._create_interface_symbol_id(parent_interface_name, context) - yield Relationship( - source_symbol=interface_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - except Exception: - # Skip files with parsing errors - return - - def _walk_tree(self, node) -> Iterator: - """Walk tree-sitter tree nodes.""" - yield node - for child in node.children: - yield from self._walk_tree(child) - - def _find_child_by_type(self, node, node_type: str): - """Find first child node of specified type.""" - for child in node.children: - if child.type == node_type: - return child - return None - - def _find_children_by_type(self, node, node_type: str) -> List: - """Find all child nodes of specified type.""" - children = [] - for child in node.children: - if child.type == node_type: - children.append(child) - return children - - def _get_node_text(self, node, content: str) -> str: - """Get text content of a tree-sitter node.""" - return content[node.start_byte:node.end_byte] - - def _get_class_name(self, class_node) -> Optional[str]: - """Extract class name from class declaration node.""" - identifier_node = self._find_child_by_type(class_node, 'identifier') - if identifier_node: - return identifier_node.text.decode('utf8') - return None - - def _get_method_name(self, method_node) -> Optional[str]: - """Extract method name from method declaration node.""" - identifier_node = self._find_child_by_type(method_node, 'identifier') - if identifier_node: - return identifier_node.text.decode('utf8') - return None - - def _get_interface_name(self, interface_node, content: str) -> Optional[str]: - """Extract interface name from interface declaration node.""" - identifier_node = self._find_child_by_type(interface_node, 'identifier') - if identifier_node: - return self._get_node_text(identifier_node, content) - return None - - def _get_field_name(self, field_node, content: str) -> Optional[str]: - """Extract field name from field declaration node.""" - # Field declarations can have multiple declarators - declarator = self._find_child_by_type(field_node, 'variable_declarator') - if declarator: - identifier = self._find_child_by_type(declarator, 'identifier') - if identifier: - return self._get_node_text(identifier, content) - return None - - def _get_import_path(self, import_node, content: str) -> Optional[str]: - """Extract import path from import declaration.""" - # Look for scoped_identifier or identifier in import - for child in import_node.children: - if child.type in ['scoped_identifier', 'identifier']: - return self._get_node_text(child, content) - return None - - def _get_invocation_target(self, invocation_node, content: str) -> Optional[str]: - """Extract target method name from method invocation.""" - identifier_node = self._find_child_by_type(invocation_node, 'identifier') - if identifier_node: - return self._get_node_text(identifier_node, content) - - # Handle method calls like object.method() - field_access = self._find_child_by_type(invocation_node, 'field_access') - if field_access: - identifier = self._find_child_by_type(field_access, 'identifier') - if identifier: - return self._get_node_text(identifier, content) - - return None - - def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: - """Create symbol ID for class.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{class_name}" if scope_path else class_name - return f"local {local_id}#" - - def _create_method_symbol_id(self, method_name: str, context: SCIPContext) -> str: - """Create symbol ID for method.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{method_name}" if scope_path else method_name - return f"local {local_id}()." - - def _create_interface_symbol_id(self, interface_name: str, context: SCIPContext) -> str: - """Create symbol ID for interface.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{interface_name}" if scope_path else interface_name - return f"local {local_id}#" - - def _create_field_symbol_id(self, field_name: str, class_symbol_id: str) -> str: - """Create symbol ID for field.""" - # Extract class name from class symbol ID - class_name = class_symbol_id.replace("local ", "").replace("#", "") - return f"local {class_name}.{field_name}" - - def _create_file_symbol_id(self, file_path: str) -> str: - """Create symbol ID for file.""" - return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py b/src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py deleted file mode 100644 index 0f0c841..0000000 --- a/src/code_index_mcp/scip/framework/java/tree_sitter_analyzer.py +++ /dev/null @@ -1,327 +0,0 @@ -"""Java tree-sitter analyzer implementation.""" - -from typing import Iterator, Optional, Set, List, Dict, Any -from ..types import SCIPContext -from ..base.language_analyzer import BaseLanguageAnalyzer - -try: - import tree_sitter - from tree_sitter_java import language as java_language - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - - -class JavaTreeSitterAnalyzer(BaseLanguageAnalyzer): - """Java analyzer using tree-sitter for AST parsing.""" - - def __init__(self): - """Initialize the Java tree-sitter analyzer.""" - if not TREE_SITTER_AVAILABLE: - raise ImportError("Tree-sitter Java library not available") - - java_lang = tree_sitter.Language(java_language()) - self.parser = tree_sitter.Parser(java_lang) - self._processed_nodes: Set[int] = set() - - def parse(self, content: str, filename: str = ""): - """Parse Java source code into tree-sitter AST.""" - try: - return self.parser.parse(bytes(content, 'utf8')) - except Exception as e: - raise SyntaxError(f"Java syntax error in {filename}: {e}") - - def walk(self, tree) -> Iterator: - """Walk tree-sitter tree nodes, avoiding duplicates.""" - for node in self._walk_node(tree.root_node): - node_id = id(node) - if node_id not in self._processed_nodes: - self._processed_nodes.add(node_id) - yield node - - def _walk_node(self, node) -> Iterator: - """Recursively walk tree nodes.""" - yield node - for child in node.children: - yield from self._walk_node(child) - - def is_symbol_definition(self, node) -> bool: - """Check if tree-sitter node represents a symbol definition.""" - return node.type in { - 'class_declaration', - 'interface_declaration', - 'enum_declaration', - 'method_declaration', - 'constructor_declaration', - 'field_declaration', - 'local_variable_declaration', - 'formal_parameter', - 'annotation_type_declaration', - } - - def is_symbol_reference(self, node) -> bool: - """Check if tree-sitter node represents a symbol reference.""" - return node.type in { - 'identifier', - 'type_identifier', - 'method_invocation', - 'field_access', - } - - def get_symbol_name(self, node) -> Optional[str]: - """Extract symbol name from tree-sitter node.""" - if node.type in ['class_declaration', 'interface_declaration', 'enum_declaration', - 'method_declaration', 'constructor_declaration', 'annotation_type_declaration']: - identifier_node = self._find_child_by_type(node, 'identifier') - if identifier_node: - return identifier_node.text.decode('utf8') - - elif node.type == 'field_declaration': - # Field declarations can have multiple declarators - declarator = self._find_child_by_type(node, 'variable_declarator') - if declarator: - identifier = self._find_child_by_type(declarator, 'identifier') - if identifier: - return identifier.text.decode('utf8') - - elif node.type == 'local_variable_declaration': - declarator = self._find_child_by_type(node, 'variable_declarator') - if declarator: - identifier = self._find_child_by_type(declarator, 'identifier') - if identifier: - return identifier.text.decode('utf8') - - elif node.type == 'formal_parameter': - identifier = self._find_child_by_type(node, 'identifier') - if identifier: - return identifier.text.decode('utf8') - - elif node.type in ['identifier', 'type_identifier']: - return node.text.decode('utf8') - - return None - - def get_node_position(self, node) -> tuple: - """Get position information from tree-sitter node.""" - start_line = node.start_point[0] - start_col = node.start_point[1] - end_line = node.end_point[0] - end_col = node.end_point[1] - - return (start_line, start_col, end_line, end_col) - - def extract_class_info(self, tree) -> List[Dict[str, Any]]: - """Extract class information from the AST.""" - classes = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'class_declaration': - class_info = { - 'name': self.get_symbol_name(node), - 'type': 'class', - 'position': self.get_node_position(node), - 'modifiers': self._extract_modifiers(node), - 'superclass': self._extract_superclass(node), - 'interfaces': self._extract_implemented_interfaces(node), - 'methods': self._extract_class_methods(node), - 'fields': self._extract_class_fields(node), - } - classes.append(class_info) - - return classes - - def extract_interface_info(self, tree) -> List[Dict[str, Any]]: - """Extract interface information from the AST.""" - interfaces = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'interface_declaration': - interface_info = { - 'name': self.get_symbol_name(node), - 'type': 'interface', - 'position': self.get_node_position(node), - 'modifiers': self._extract_modifiers(node), - 'extends': self._extract_extended_interfaces(node), - 'methods': self._extract_interface_methods(node), - } - interfaces.append(interface_info) - - return interfaces - - def extract_method_info(self, tree) -> List[Dict[str, Any]]: - """Extract method information from the AST.""" - methods = [] - - for node in self._walk_node(tree.root_node): - if node.type in ['method_declaration', 'constructor_declaration']: - method_info = { - 'name': self.get_symbol_name(node), - 'type': 'constructor' if node.type == 'constructor_declaration' else 'method', - 'position': self.get_node_position(node), - 'modifiers': self._extract_modifiers(node), - 'return_type': self._extract_return_type(node), - 'parameters': self._extract_method_parameters(node), - 'throws': self._extract_throws_clause(node), - } - methods.append(method_info) - - return methods - - def extract_import_statements(self, tree) -> List[str]: - """Extract import statements from the AST.""" - imports = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'import_declaration': - import_path = self._extract_import_path(node) - if import_path: - imports.append(import_path) - - return imports - - def extract_package_declaration(self, tree) -> Optional[str]: - """Extract package declaration from the AST.""" - for node in self._walk_node(tree.root_node): - if node.type == 'package_declaration': - return self._extract_package_name(node) - return None - - def _find_child_by_type(self, node, node_type: str): - """Find first child node of specified type.""" - for child in node.children: - if child.type == node_type: - return child - return None - - def _find_children_by_type(self, node, node_type: str) -> List: - """Find all child nodes of specified type.""" - children = [] - for child in node.children: - if child.type == node_type: - children.append(child) - return children - - def _extract_modifiers(self, node) -> List[str]: - """Extract modifiers from a declaration node.""" - modifiers = [] - modifiers_node = self._find_child_by_type(node, 'modifiers') - if modifiers_node: - for child in modifiers_node.children: - if child.type in ['public', 'private', 'protected', 'static', 'final', - 'abstract', 'synchronized', 'volatile', 'transient', 'native']: - modifiers.append(child.type) - return modifiers - - def _extract_superclass(self, class_node) -> Optional[str]: - """Extract superclass name from class declaration.""" - superclass_node = self._find_child_by_type(class_node, 'superclass') - if superclass_node: - type_node = self._find_child_by_type(superclass_node, 'type_identifier') - if type_node: - return type_node.text.decode('utf8') - return None - - def _extract_implemented_interfaces(self, class_node) -> List[str]: - """Extract implemented interface names from class declaration.""" - interfaces = [] - interfaces_node = self._find_child_by_type(class_node, 'super_interfaces') - if interfaces_node: - for interface_node in self._find_children_by_type(interfaces_node, 'type_identifier'): - interfaces.append(interface_node.text.decode('utf8')) - return interfaces - - def _extract_extended_interfaces(self, interface_node) -> List[str]: - """Extract extended interface names from interface declaration.""" - interfaces = [] - extends_node = self._find_child_by_type(interface_node, 'extends_interfaces') - if extends_node: - for interface_node in self._find_children_by_type(extends_node, 'type_identifier'): - interfaces.append(interface_node.text.decode('utf8')) - return interfaces - - def _extract_class_methods(self, class_node) -> List[str]: - """Extract method names from class declaration.""" - methods = [] - for child in class_node.children: - if child.type in ['method_declaration', 'constructor_declaration']: - method_name = self.get_symbol_name(child) - if method_name: - methods.append(method_name) - return methods - - def _extract_class_fields(self, class_node) -> List[str]: - """Extract field names from class declaration.""" - fields = [] - for child in class_node.children: - if child.type == 'field_declaration': - field_name = self.get_symbol_name(child) - if field_name: - fields.append(field_name) - return fields - - def _extract_interface_methods(self, interface_node) -> List[str]: - """Extract method names from interface declaration.""" - methods = [] - for child in interface_node.children: - if child.type == 'method_declaration': - method_name = self.get_symbol_name(child) - if method_name: - methods.append(method_name) - return methods - - def _extract_return_type(self, method_node) -> Optional[str]: - """Extract return type from method declaration.""" - # Constructor declarations don't have return types - if method_node.type == 'constructor_declaration': - return None - - # Look for various return type patterns - for child in method_node.children: - if child.type in ['type_identifier', 'primitive_type', 'array_type', 'generic_type']: - return child.text.decode('utf8') - return None - - def _extract_method_parameters(self, method_node) -> List[Dict[str, str]]: - """Extract parameter information from method declaration.""" - parameters = [] - formal_params_node = self._find_child_by_type(method_node, 'formal_parameters') - if formal_params_node: - for param_node in self._find_children_by_type(formal_params_node, 'formal_parameter'): - param_name = self.get_symbol_name(param_node) - param_type = self._extract_parameter_type(param_node) - if param_name: - parameters.append({ - 'name': param_name, - 'type': param_type or 'unknown' - }) - return parameters - - def _extract_parameter_type(self, param_node) -> Optional[str]: - """Extract parameter type from formal parameter node.""" - for child in param_node.children: - if child.type in ['type_identifier', 'primitive_type', 'array_type', 'generic_type']: - return child.text.decode('utf8') - return None - - def _extract_throws_clause(self, method_node) -> List[str]: - """Extract throws clause from method declaration.""" - throws = [] - throws_node = self._find_child_by_type(method_node, 'throws') - if throws_node: - for exception_node in self._find_children_by_type(throws_node, 'type_identifier'): - throws.append(exception_node.text.decode('utf8')) - return throws - - def _extract_import_path(self, import_node) -> Optional[str]: - """Extract import path from import declaration.""" - for child in import_node.children: - if child.type in ['scoped_identifier', 'identifier']: - return child.text.decode('utf8') - return None - - def _extract_package_name(self, package_node) -> Optional[str]: - """Extract package name from package declaration.""" - for child in package_node.children: - if child.type in ['scoped_identifier', 'identifier']: - return child.text.decode('utf8') - return None \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/__init__.py b/src/code_index_mcp/scip/framework/javascript/__init__.py deleted file mode 100644 index f15ddd6..0000000 --- a/src/code_index_mcp/scip/framework/javascript/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""JavaScript/TypeScript-specific SCIP framework components.""" - -from .factory import JavaScriptSCIPIndexFactory, create_javascript_scip_factory -from .relationship_extractor import JavaScriptRelationshipExtractor -from .enum_mapper import JavaScriptEnumMapper -from .syntax_analyzer import JavaScriptSyntaxAnalyzer - -__all__ = [ - 'JavaScriptSCIPIndexFactory', - 'create_javascript_scip_factory', - 'JavaScriptRelationshipExtractor', - 'JavaScriptEnumMapper', - 'JavaScriptSyntaxAnalyzer', -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/enum_mapper.py b/src/code_index_mcp/scip/framework/javascript/enum_mapper.py deleted file mode 100644 index e5f03ab..0000000 --- a/src/code_index_mcp/scip/framework/javascript/enum_mapper.py +++ /dev/null @@ -1,237 +0,0 @@ -"""JavaScript enum mapper implementation.""" - -from typing import Dict -from ..base.enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 - - -class JavaScriptEnumMapper(BaseEnumMapper): - """JavaScript/TypeScript-specific enum mapper for SCIP compliance.""" - - # JavaScript symbol kind mappings - SYMBOL_KIND_MAP = { - 'function': scip_pb2.Function, - 'arrow_function': scip_pb2.Function, - 'method': scip_pb2.Method, - 'class': scip_pb2.Class, - 'variable': scip_pb2.Variable, - 'constant': scip_pb2.Constant, - 'module': scip_pb2.Module, - 'parameter': scip_pb2.Parameter, - 'property': scip_pb2.Property, - 'constructor': scip_pb2.Constructor, - 'field': scip_pb2.Field, - 'namespace': scip_pb2.Namespace, - 'interface': scip_pb2.Interface, - 'type': scip_pb2.Type, - 'object': scip_pb2.Object, - 'enum': scip_pb2.Enum, - } - - # JavaScript syntax kind mappings - SYNTAX_KIND_MAP = { - 'function_definition': scip_pb2.IdentifierFunctionDefinition, - 'class_definition': scip_pb2.IdentifierType, - 'variable_definition': scip_pb2.IdentifierLocal, - 'parameter_definition': scip_pb2.IdentifierParameter, - 'property_definition': scip_pb2.IdentifierAttribute, - 'method_definition': scip_pb2.IdentifierFunctionDefinition, - 'interface_definition': scip_pb2.IdentifierType, - 'type_definition': scip_pb2.IdentifierType, - 'identifier': scip_pb2.Identifier, - 'keyword': scip_pb2.IdentifierKeyword, - 'string_literal': scip_pb2.StringLiteral, - 'numeric_literal': scip_pb2.NumericLiteral, - 'boolean_literal': scip_pb2.BooleanLiteral, - 'regex_literal': scip_pb2.RegexEscape, - 'comment': scip_pb2.Comment, - 'punctuation': scip_pb2.PunctuationDelimiter, - 'operator': scip_pb2.PunctuationDelimiter, - } - - # JavaScript symbol role mappings (official SCIP naming) - SYMBOL_ROLE_MAP = { - 'definition': scip_pb2.Definition, - 'import': scip_pb2.Import, - 'write': scip_pb2.Write, # Official SCIP naming - 'read': scip_pb2.Read, # Official SCIP naming - 'generated': scip_pb2.Generated, - 'test': scip_pb2.Test, - 'type': scip_pb2.Type, # Add missing Type role - 'reference': scip_pb2.Read, # Default reference is read access - 'export': scip_pb2.Definition, # Exports are definitions - } - - def map_symbol_kind(self, language_kind: str) -> int: - """Map JavaScript symbol type to SCIP SymbolKind.""" - kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SymbolKind'): - raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") - - return kind - - def map_syntax_kind(self, language_syntax: str) -> int: - """Map JavaScript syntax element to SCIP SyntaxKind.""" - kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SyntaxKind'): - raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") - - return kind - - def map_symbol_role(self, language_role: str) -> int: - """Map JavaScript symbol role to SCIP SymbolRole.""" - role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) - - # Validate enum value - if not self.validate_enum_value(role, 'SymbolRole'): - raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") - - return role - - def get_javascript_pattern_symbol_kind(self, pattern_type: str) -> str: - """ - Map JavaScript pattern type to internal symbol kind string. - - Args: - pattern_type: Pattern type from regex matches (e.g., 'function', 'class') - - Returns: - Internal symbol kind string for use with map_symbol_kind() - """ - pattern_kind_map = { - 'function': 'function', - 'arrow_function': 'arrow_function', - 'class': 'class', - 'const': 'constant', - 'let': 'variable', - 'var': 'variable', - 'method': 'method', - 'object_method': 'function', - 'constructor': 'constructor', - 'interface': 'interface', - 'type': 'type', - 'enum': 'enum', - 'namespace': 'namespace', - } - - return pattern_kind_map.get(pattern_type, 'variable') - - def get_javascript_pattern_syntax_kind(self, pattern_type: str, context: str = None) -> str: - """ - Map JavaScript pattern type to internal syntax kind string. - - Args: - pattern_type: Pattern type from regex matches - context: Additional context for disambiguation - - Returns: - Internal syntax kind string for use with map_syntax_kind() - """ - pattern_syntax_map = { - 'function': 'function_definition', - 'arrow_function': 'function_definition', - 'class': 'class_definition', - 'const': 'variable_definition', - 'let': 'variable_definition', - 'var': 'variable_definition', - 'method': 'method_definition', - 'object_method': 'function_definition', - 'interface': 'interface_definition', - 'type': 'type_definition', - 'identifier': 'identifier', - 'string': 'string_literal', - 'number': 'numeric_literal', - 'boolean': 'boolean_literal', - 'regex': 'regex_literal', - } - - return pattern_syntax_map.get(pattern_type, 'identifier') - - def get_javascript_pattern_symbol_role(self, pattern_type: str, context: str = None) -> str: - """ - Map JavaScript pattern type to internal symbol role string. - - Args: - pattern_type: Pattern type from regex matches - context: Additional context (e.g., 'in_assignment', 'in_call') - - Returns: - Internal symbol role string for use with map_symbol_role() - """ - if context == 'definition': - return 'definition' - elif context == 'assignment': - return 'write' - elif context == 'import': - return 'import' - elif context == 'export': - return 'export' - elif pattern_type in ['function', 'arrow_function', 'class', 'method', 'object_method', - 'const', 'let', 'var', 'interface', 'type']: - return 'definition' - else: - return 'reference' - - def get_typescript_specific_kinds(self) -> Dict[str, str]: - """Get TypeScript-specific symbol kinds.""" - return { - 'interface': 'interface', - 'type_alias': 'type', - 'enum': 'enum', - 'namespace': 'namespace', - 'generic_type': 'type', - 'union_type': 'type', - 'intersection_type': 'type', - } - - def get_javascript_type_reference_role(self) -> str: - """Get symbol role for type references (e.g., in TypeScript annotations).""" - return 'type' - - def is_valid_javascript_symbol_kind(self, symbol_kind: str) -> bool: - """Check if symbol kind is valid for JavaScript.""" - return symbol_kind in self.SYMBOL_KIND_MAP - - def is_valid_javascript_syntax_kind(self, syntax_kind: str) -> bool: - """Check if syntax kind is valid for JavaScript.""" - return syntax_kind in self.SYNTAX_KIND_MAP - - def is_valid_javascript_symbol_role(self, symbol_role: str) -> bool: - """Check if symbol role is valid for JavaScript.""" - return symbol_role in self.SYMBOL_ROLE_MAP - - def get_all_javascript_symbol_kinds(self) -> list: - """Get all available JavaScript symbol kinds.""" - return list(self.SYMBOL_KIND_MAP.keys()) - - def get_all_javascript_syntax_kinds(self) -> list: - """Get all available JavaScript syntax kinds.""" - return list(self.SYNTAX_KIND_MAP.keys()) - - def get_all_javascript_symbol_roles(self) -> list: - """Get all available JavaScript symbol roles.""" - return list(self.SYMBOL_ROLE_MAP.keys()) - - def supports_typescript(self) -> bool: - """Check if TypeScript features are supported.""" - return True - - def get_es6_feature_kinds(self) -> Dict[str, str]: - """Get ES6+ specific feature mappings.""" - return { - 'arrow_function': 'function', - 'class': 'class', - 'const': 'constant', - 'let': 'variable', - 'destructuring': 'variable', - 'spread_operator': 'operator', - 'template_literal': 'string_literal', - 'async_function': 'function', - 'generator_function': 'function', - 'module_export': 'module', - 'module_import': 'module', - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/factory.py b/src/code_index_mcp/scip/framework/javascript/factory.py deleted file mode 100644 index a08d8d9..0000000 --- a/src/code_index_mcp/scip/framework/javascript/factory.py +++ /dev/null @@ -1,376 +0,0 @@ -"""JavaScript/TypeScript SCIP Index Factory implementation.""" - -import re -import os -from pathlib import Path -from typing import Set, List, Iterator, Optional, Dict, Any -from ..base.index_factory import SCIPIndexFactory -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..base.enum_mapper import BaseEnumMapper -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from ..types import SCIPContext, SCIPSymbolDescriptor -from .relationship_extractor import JavaScriptRelationshipExtractor -from .enum_mapper import JavaScriptEnumMapper -from .syntax_analyzer import JavaScriptSyntaxAnalyzer -from ...proto import scip_pb2 - - -class JavaScriptSCIPIndexFactory(SCIPIndexFactory): - """JavaScript/TypeScript-specific SCIP Index factory implementation with constructor injection.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """Initialize JavaScript factory with required components via constructor injection.""" - super().__init__(project_root, symbol_generator, relationship_extractor, - enum_mapper, position_calculator) - self.syntax_analyzer = JavaScriptSyntaxAnalyzer() - - def get_language(self) -> str: - """Return language identifier.""" - return "javascript" - - def get_supported_extensions(self) -> Set[str]: - """Return supported file extensions.""" - return {'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'} - - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract JavaScript symbol definitions using regex-based analysis.""" - try: - patterns = self.syntax_analyzer.get_symbol_patterns() - - for pattern_type, pattern in patterns.items(): - for match in re.finditer(pattern, context.content, re.MULTILINE): - symbol_info = self._create_symbol_from_match(match, pattern_type, context) - if symbol_info: - yield symbol_info - - except Exception as e: - # Handle parsing errors gracefully - pass - - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract JavaScript symbol occurrences.""" - try: - patterns = self.syntax_analyzer.get_occurrence_patterns() - - for pattern_type, pattern in patterns.items(): - for match in re.finditer(pattern, context.content, re.MULTILINE): - occurrence = self._create_occurrence_from_match(match, pattern_type, context) - if occurrence: - yield occurrence - - except Exception as e: - # Handle parsing errors gracefully - pass - - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract JavaScript external symbols from imports.""" - external_symbols = [] - - for doc in documents: - try: - content = self._read_file(os.path.join(self.project_root, doc.relative_path)) - import_patterns = self.syntax_analyzer.get_import_patterns() - - for pattern_type, pattern in import_patterns.items(): - for match in re.finditer(pattern, content, re.MULTILINE): - external_symbol = self._create_external_symbol_from_import_match(match, pattern_type) - if external_symbol: - external_symbols.append(external_symbol) - - except Exception as e: - # Skip problematic files - continue - - return external_symbols - - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build JavaScript-specific cross-document relationships. - - This implementation provides basic cross-document relationship support - for JavaScript/TypeScript. A more sophisticated implementation would - analyze ES6 imports and require statements. - """ - # For now, use a simplified approach - # TODO: Implement proper JavaScript import/export analysis - return 0 # Placeholder - no relationships added yet - - def _create_symbol_from_match(self, match: re.Match, pattern_type: str, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information from regex match.""" - symbol_info = scip_pb2.SymbolInformation() - - if pattern_type == 'function': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = name - symbol_info.kind = self.enum_mapper.map_symbol_kind('function') - - elif pattern_type == 'arrow_function': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = name - symbol_info.kind = self.enum_mapper.map_symbol_kind('function') - - elif pattern_type == 'class': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = name - symbol_info.kind = self.enum_mapper.map_symbol_kind('class') - - elif pattern_type == 'const': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = name - symbol_info.kind = self.enum_mapper.map_symbol_kind('constant') - - elif pattern_type == 'method': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = name - symbol_info.kind = self.enum_mapper.map_symbol_kind('method') - - elif pattern_type == 'object_method': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = name - symbol_info.kind = self.enum_mapper.map_symbol_kind('function') - - else: - return None - - return symbol_info - - def _create_occurrence_from_match(self, match: re.Match, pattern_type: str, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence from regex match.""" - occurrence = scip_pb2.Occurrence() - - # Calculate position using position calculator - try: - start_pos = match.start() - end_pos = match.end() - - position_info = self.position_calculator.calculate_positions_from_offset( - context.content, start_pos, end_pos - ) - - # Set range - occurrence.range.start.extend([position_info.start_line, position_info.start_column]) - occurrence.range.end.extend([position_info.end_line, position_info.end_column]) - - except Exception as e: - # Skip if position calculation fails - return None - - # Set symbol and roles based on pattern type - if pattern_type in ['function', 'arrow_function', 'method', 'object_method']: - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function_definition') - - elif pattern_type == 'class': - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_definition') - - elif pattern_type in ['const', 'let', 'var']: - name = match.group(1) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('variable_definition') - - elif pattern_type == 'identifier': - name = match.group(0) - descriptor = SCIPSymbolDescriptor( - - name=name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') - - else: - return None - - return occurrence - - def _create_external_symbol_from_import_match(self, match: re.Match, pattern_type: str) -> Optional[scip_pb2.SymbolInformation]: - """Create external symbol from import statement match.""" - symbol_info = scip_pb2.SymbolInformation() - - if pattern_type == 'es6_import': - # import { name } from 'module' - module_name = match.group(2) if match.lastindex >= 2 else match.group(1) - symbol_info.symbol = f"npm {module_name}" - symbol_info.display_name = module_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"ES6 imported module: {module_name}") - return symbol_info - - elif pattern_type == 'require': - # const name = require('module') - module_name = match.group(2) if match.lastindex >= 2 else match.group(1) - symbol_info.symbol = f"npm {module_name}" - symbol_info.display_name = module_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"CommonJS required module: {module_name}") - return symbol_info - - elif pattern_type == 'dynamic_import': - # import('module') - module_name = match.group(1) - symbol_info.symbol = f"npm {module_name}" - symbol_info.display_name = module_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"Dynamic imported module: {module_name}") - return symbol_info - - return None - - -def create_javascript_scip_factory(project_root: str) -> JavaScriptSCIPIndexFactory: - """ - Factory creator for JavaScript SCIP factory. - Ensures all required components are properly assembled via constructor injection. - """ - symbol_generator = SCIPSymbolGenerator( - scheme="scip-javascript", - package_manager="npm", - package_name=Path(project_root).name, - version="HEAD" - ) - - relationship_extractor = JavaScriptRelationshipExtractor() - enum_mapper = JavaScriptEnumMapper() - position_calculator = SCIPPositionCalculator() - - return JavaScriptSCIPIndexFactory( - project_root=project_root, - symbol_generator=symbol_generator, - relationship_extractor=relationship_extractor, # Guaranteed to be provided - enum_mapper=enum_mapper, - position_calculator=position_calculator - ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/relationship_extractor.py b/src/code_index_mcp/scip/framework/javascript/relationship_extractor.py deleted file mode 100644 index 7b25afe..0000000 --- a/src/code_index_mcp/scip/framework/javascript/relationship_extractor.py +++ /dev/null @@ -1,281 +0,0 @@ -"""JavaScript relationship extractor implementation.""" - -import re -from typing import Iterator, Dict, List -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..types import SCIPContext, Relationship -from ...core.relationship_types import InternalRelationshipType - - -class JavaScriptRelationshipExtractor(BaseRelationshipExtractor): - """JavaScript-specific relationship extractor using regex-based analysis.""" - - def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract inheritance relationships from JavaScript classes.""" - try: - # ES6 class inheritance: class Child extends Parent - class_extends_pattern = r'class\s+(\w+)\s+extends\s+(\w+)' - - for match in re.finditer(class_extends_pattern, context.content, re.MULTILINE): - child_class = match.group(1) - parent_class = match.group(2) - - child_symbol_id = self._create_class_symbol_id(child_class, context) - parent_symbol_id = self._create_class_symbol_id(parent_class, context) - - yield Relationship( - source_symbol=child_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - # Prototype inheritance: Object.setPrototypeOf or Object.create - prototype_pattern = r'Object\.setPrototypeOf\s*\(\s*(\w+)\.prototype\s*,\s*(\w+)\.prototype\s*\)' - - for match in re.finditer(prototype_pattern, context.content, re.MULTILINE): - child_obj = match.group(1) - parent_obj = match.group(2) - - child_symbol_id = self._create_function_symbol_id(child_obj, context) - parent_symbol_id = self._create_function_symbol_id(parent_obj, context) - - yield Relationship( - source_symbol=child_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract function/method call relationships.""" - try: - # Function calls: functionName() or object.method() - function_call_patterns = [ - r'(\w+)\s*\(', # Direct function calls - r'(\w+)\.(\w+)\s*\(', # Method calls - r'this\.(\w+)\s*\(', # Method calls on this - r'super\.(\w+)\s*\(', # Super method calls - ] - - # Find all function definitions first - function_defs = self._extract_function_definitions(context.content) - - for func_name in function_defs: - func_symbol_id = self._create_function_symbol_id(func_name, context) - - # Look for calls within this function - func_body = self._extract_function_body(context.content, func_name) - if func_body: - for pattern in function_call_patterns: - for match in re.finditer(pattern, func_body, re.MULTILINE): - if pattern == r'(\w+)\.(\w+)\s*\(': - # Method call - target_function = match.group(2) - elif pattern == r'this\.(\w+)\s*\(' or pattern == r'super\.(\w+)\s*\(': - target_function = match.group(1) - else: - # Direct function call - target_function = match.group(1) - - if target_function and target_function != func_name: - target_symbol_id = self._create_function_symbol_id(target_function, context) - yield Relationship( - source_symbol=func_symbol_id, - target_symbol=target_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract import/dependency relationships.""" - try: - import_patterns = { - 'es6_import': r'import\s+(?:\{[^}]+\}\s+from\s+)?[\'"]([^\'"]+)[\'"]', - 'require': r'require\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', - 'dynamic_import': r'import\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', - 'export_from': r'export\s+(?:\{[^}]+\}\s+)?from\s+[\'"]([^\'"]+)[\'"]' - } - - file_symbol_id = self._create_file_symbol_id(context.file_path) - - for pattern_type, pattern in import_patterns.items(): - for match in re.finditer(pattern, context.content, re.MULTILINE): - module_name = match.group(1) - - # Determine if it's a local or external module - if module_name.startswith('.'): - # Local module - module_symbol_id = f"local {module_name}" - else: - # External module (npm package) - module_symbol_id = f"npm {module_name}" - - yield Relationship( - source_symbol=file_symbol_id, - target_symbol=module_symbol_id, - relationship_type=InternalRelationshipType.IMPORTS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract composition relationships (object properties).""" - try: - # Class property definitions - class_property_pattern = r'class\s+(\w+)\s*\{[^}]*?(\w+)\s*=' - - for match in re.finditer(class_property_pattern, context.content, re.MULTILINE | re.DOTALL): - class_name = match.group(1) - property_name = match.group(2) - - class_symbol_id = self._create_class_symbol_id(class_name, context) - property_symbol_id = self._create_property_symbol_id(property_name, class_symbol_id) - - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=property_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - # Object literal properties - object_literal_pattern = r'const\s+(\w+)\s*=\s*\{[^}]*?(\w+)\s*:' - - for match in re.finditer(object_literal_pattern, context.content, re.MULTILINE | re.DOTALL): - object_name = match.group(1) - property_name = match.group(2) - - object_symbol_id = self._create_variable_symbol_id(object_name, context) - property_symbol_id = self._create_property_symbol_id(property_name, object_symbol_id) - - yield Relationship( - source_symbol=object_symbol_id, - target_symbol=property_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract interface relationships (TypeScript interfaces).""" - try: - # TypeScript interface implementation - interface_impl_pattern = r'class\s+(\w+)\s+implements\s+([^{]+)' - - for match in re.finditer(interface_impl_pattern, context.content, re.MULTILINE): - class_name = match.group(1) - interfaces = match.group(2).strip() - - class_symbol_id = self._create_class_symbol_id(class_name, context) - - # Parse multiple interfaces - for interface_name in re.findall(r'\w+', interfaces): - interface_symbol_id = self._create_interface_symbol_id(interface_name, context) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=interface_symbol_id, - relationship_type=InternalRelationshipType.IMPLEMENTS - ) - - # TypeScript interface extension - interface_extends_pattern = r'interface\s+(\w+)\s+extends\s+([^{]+)' - - for match in re.finditer(interface_extends_pattern, context.content, re.MULTILINE): - child_interface = match.group(1) - parent_interfaces = match.group(2).strip() - - child_symbol_id = self._create_interface_symbol_id(child_interface, context) - - for parent_interface in re.findall(r'\w+', parent_interfaces): - parent_symbol_id = self._create_interface_symbol_id(parent_interface, context) - yield Relationship( - source_symbol=child_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - except Exception: - # Skip files with parsing errors - return - - def _extract_function_definitions(self, content: str) -> List[str]: - """Extract function definition names from content.""" - function_patterns = [ - r'function\s+(\w+)\s*\(', - r'(?:const|let|var)\s+(\w+)\s*=\s*function', - r'(?:const|let|var)\s+(\w+)\s*=\s*\([^)]*\)\s*=>', - r'(\w+)\s*\([^)]*\)\s*\{', # Method definitions - ] - - functions = [] - for pattern in function_patterns: - for match in re.finditer(pattern, content, re.MULTILINE): - functions.append(match.group(1)) - - return list(set(functions)) # Remove duplicates - - def _extract_function_body(self, content: str, func_name: str) -> str: - """Extract the body of a specific function.""" - # Simple heuristic - find function and extract until matching brace - func_pattern = rf'(?:function\s+{func_name}\s*\(|{func_name}\s*\([^)]*\)\s*=>|\b{func_name}\s*\([^)]*\)\s*{{)' - - match = re.search(func_pattern, content, re.MULTILINE) - if match: - start_pos = match.end() - brace_count = 1 - i = start_pos - - while i < len(content) and brace_count > 0: - if content[i] == '{': - brace_count += 1 - elif content[i] == '}': - brace_count -= 1 - i += 1 - - if brace_count == 0: - return content[start_pos:i-1] - - return "" - - def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: - """Create symbol ID for class.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{class_name}" if scope_path else class_name - return f"local {local_id}#" - - def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: - """Create symbol ID for function.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{function_name}" if scope_path else function_name - return f"local {local_id}()." - - def _create_variable_symbol_id(self, variable_name: str, context: SCIPContext) -> str: - """Create symbol ID for variable.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{variable_name}" if scope_path else variable_name - return f"local {local_id}" - - def _create_property_symbol_id(self, property_name: str, parent_symbol_id: str) -> str: - """Create symbol ID for property.""" - # Extract parent name from parent symbol ID - parent_name = parent_symbol_id.replace("local ", "").rstrip("#().") - return f"local {parent_name}.{property_name}" - - def _create_interface_symbol_id(self, interface_name: str, context: SCIPContext) -> str: - """Create symbol ID for TypeScript interface.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{interface_name}" if scope_path else interface_name - return f"local {local_id}#" - - def _create_file_symbol_id(self, file_path: str) -> str: - """Create symbol ID for file.""" - return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py b/src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py deleted file mode 100644 index 573cd17..0000000 --- a/src/code_index_mcp/scip/framework/javascript/syntax_analyzer.py +++ /dev/null @@ -1,418 +0,0 @@ -"""JavaScript syntax analyzer implementation.""" - -import re -from typing import Dict, List, Optional, Set, Tuple, Any - - -class JavaScriptSyntaxAnalyzer: - """JavaScript/TypeScript syntax analyzer using regex patterns.""" - - def __init__(self): - """Initialize the syntax analyzer.""" - self._symbol_patterns = self._build_symbol_patterns() - self._occurrence_patterns = self._build_occurrence_patterns() - self._import_patterns = self._build_import_patterns() - self._comment_patterns = self._build_comment_patterns() - - def get_symbol_patterns(self) -> Dict[str, str]: - """Get regex patterns for symbol definitions.""" - return self._symbol_patterns - - def get_occurrence_patterns(self) -> Dict[str, str]: - """Get regex patterns for symbol occurrences.""" - return self._occurrence_patterns - - def get_import_patterns(self) -> Dict[str, str]: - """Get regex patterns for import statements.""" - return self._import_patterns - - def _build_symbol_patterns(self) -> Dict[str, str]: - """Build regex patterns for JavaScript symbol definitions.""" - return { - # Function declarations - 'function': r'function\s+(\w+)\s*\(', - - # Arrow functions - 'arrow_function': r'(?:const|let|var)\s+(\w+)\s*=\s*(?:\([^)]*\)|\w+)\s*=>\s*', - - # Class declarations - 'class': r'class\s+(\w+)(?:\s+extends\s+\w+)?\s*\{', - - # Method definitions (inside classes or objects) - 'method': r'(?:async\s+)?(\w+)\s*\([^)]*\)\s*\{', - - # Object method assignment - 'object_method': r'(\w+)\s*:\s*(?:async\s+)?function\s*\([^)]*\)\s*\{', - - # Variable declarations - 'const': r'const\s+(\w+)(?:\s*:\s*[^=]+)?\s*=', - 'let': r'let\s+(\w+)(?:\s*:\s*[^=]+)?(?:\s*=|;)', - 'var': r'var\s+(\w+)(?:\s*:\s*[^=]+)?(?:\s*=|;)', - - # TypeScript interfaces - 'interface': r'interface\s+(\w+)(?:\s+extends\s+[^{]+)?\s*\{', - - # TypeScript type aliases - 'type': r'type\s+(\w+)(?:<[^>]*>)?\s*=', - - # TypeScript enums - 'enum': r'enum\s+(\w+)\s*\{', - - # TypeScript namespaces - 'namespace': r'namespace\s+(\w+)\s*\{', - - # Constructor functions (legacy pattern) - 'constructor': r'function\s+(\w+)\s*\([^)]*\)\s*\{[^}]*this\.', - - # Module exports - 'export_function': r'export\s+(?:default\s+)?function\s+(\w+)\s*\(', - 'export_class': r'export\s+(?:default\s+)?class\s+(\w+)', - 'export_const': r'export\s+const\s+(\w+)\s*=', - - # Destructuring assignments - 'destructure': r'(?:const|let|var)\s*\{\s*(\w+)(?:\s*,\s*\w+)*\s*\}\s*=', - } - - def _build_occurrence_patterns(self) -> Dict[str, str]: - """Build regex patterns for symbol occurrences/references.""" - return { - # Function calls - 'function_call': r'(\w+)\s*\(', - - # Method calls - 'method_call': r'(\w+)\.(\w+)\s*\(', - - # Property access - 'property_access': r'(\w+)\.(\w+)(?!\s*\()', - - # Variable references - 'identifier': r'\b(\w+)\b', - - # this references - 'this_reference': r'this\.(\w+)', - - # super references - 'super_reference': r'super\.(\w+)', - - # Template literal expressions - 'template_expression': r'\$\{([^}]+)\}', - - # Assignment targets - 'assignment': r'(\w+)\s*[+\-*/%&|^]?=', - - # Function parameters - 'parameter': r'function\s+\w+\s*\(([^)]*)\)', - - # Object literal properties - 'object_property': r'(\w+)\s*:', - } - - def _build_import_patterns(self) -> Dict[str, str]: - """Build regex patterns for import statements.""" - return { - # ES6 imports - 'es6_import': r'import\s+(?:\{([^}]+)\}|(\w+)|\*\s+as\s+(\w+))\s+from\s+[\'"]([^\'"]+)[\'"]', - - # Default imports - 'default_import': r'import\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]', - - # Named imports - 'named_import': r'import\s+\{([^}]+)\}\s+from\s+[\'"]([^\'"]+)[\'"]', - - # Namespace imports - 'namespace_import': r'import\s+\*\s+as\s+(\w+)\s+from\s+[\'"]([^\'"]+)[\'"]', - - # Side effect imports - 'side_effect_import': r'import\s+[\'"]([^\'"]+)[\'"]', - - # CommonJS require - 'require': r'(?:const|let|var)\s+(?:\{([^}]+)\}|(\w+))\s*=\s*require\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', - - # Dynamic imports - 'dynamic_import': r'import\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)', - - # Re-exports - 'export_from': r'export\s+(?:\{([^}]+)\}|\*(?:\s+as\s+(\w+))?)\s+from\s+[\'"]([^\'"]+)[\'"]', - } - - def _build_comment_patterns(self) -> Dict[str, str]: - """Build regex patterns for comments.""" - return { - 'single_line': r'//.*$', - 'multi_line': r'/\*[\s\S]*?\*/', - 'jsdoc': r'/\*\*[\s\S]*?\*/', - } - - def extract_functions(self, content: str) -> List[Dict[str, Any]]: - """Extract function information from JavaScript content.""" - functions = [] - - # Function declarations - for match in re.finditer(self._symbol_patterns['function'], content, re.MULTILINE): - functions.append({ - 'name': match.group(1), - 'type': 'function', - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - # Arrow functions - for match in re.finditer(self._symbol_patterns['arrow_function'], content, re.MULTILINE): - functions.append({ - 'name': match.group(1), - 'type': 'arrow_function', - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - # Methods - for match in re.finditer(self._symbol_patterns['method'], content, re.MULTILINE): - functions.append({ - 'name': match.group(1), - 'type': 'method', - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - return functions - - def extract_classes(self, content: str) -> List[Dict[str, Any]]: - """Extract class information from JavaScript content.""" - classes = [] - - for match in re.finditer(self._symbol_patterns['class'], content, re.MULTILINE): - class_info = { - 'name': match.group(1), - 'type': 'class', - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n'), - 'methods': [], - 'properties': [] - } - - # Extract class body - class_body = self._extract_class_body(content, match.end()) - if class_body: - class_info['methods'] = self._extract_class_methods(class_body) - class_info['properties'] = self._extract_class_properties(class_body) - - classes.append(class_info) - - return classes - - def extract_variables(self, content: str) -> List[Dict[str, Any]]: - """Extract variable declarations from JavaScript content.""" - variables = [] - - for var_type in ['const', 'let', 'var']: - pattern = self._symbol_patterns[var_type] - for match in re.finditer(pattern, content, re.MULTILINE): - variables.append({ - 'name': match.group(1), - 'type': var_type, - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - return variables - - def extract_imports(self, content: str) -> List[Dict[str, Any]]: - """Extract import statements from JavaScript content.""" - imports = [] - - for import_type, pattern in self._import_patterns.items(): - for match in re.finditer(pattern, content, re.MULTILINE): - import_info = { - 'type': import_type, - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n'), - 'raw': match.group(0) - } - - # Extract specific information based on import type - if import_type == 'es6_import': - import_info['module'] = match.group(4) if match.lastindex >= 4 else match.group(3) - import_info['imports'] = match.group(1) if match.group(1) else match.group(2) - elif import_type in ['default_import', 'namespace_import']: - import_info['name'] = match.group(1) - import_info['module'] = match.group(2) - elif import_type == 'require': - import_info['module'] = match.group(3) if match.lastindex >= 3 else match.group(2) - import_info['name'] = match.group(2) if match.lastindex >= 2 else match.group(1) - elif import_type == 'dynamic_import': - import_info['module'] = match.group(1) - - imports.append(import_info) - - return imports - - def extract_exports(self, content: str) -> List[Dict[str, Any]]: - """Extract export statements from JavaScript content.""" - exports = [] - - export_patterns = { - 'export_default': r'export\s+default\s+(?:function\s+(\w+)|class\s+(\w+)|(\w+))', - 'export_named': r'export\s+\{([^}]+)\}', - 'export_function': r'export\s+function\s+(\w+)', - 'export_class': r'export\s+class\s+(\w+)', - 'export_const': r'export\s+const\s+(\w+)', - } - - for export_type, pattern in export_patterns.items(): - for match in re.finditer(pattern, content, re.MULTILINE): - exports.append({ - 'type': export_type, - 'name': match.group(1) if match.group(1) else match.group(0), - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - return exports - - def remove_comments(self, content: str) -> str: - """Remove comments from JavaScript content.""" - # Remove single-line comments - content = re.sub(self._comment_patterns['single_line'], '', content, flags=re.MULTILINE) - - # Remove multi-line comments - content = re.sub(self._comment_patterns['multi_line'], '', content, flags=re.DOTALL) - - return content - - def extract_string_literals(self, content: str) -> List[Dict[str, Any]]: - """Extract string literals from JavaScript content.""" - string_patterns = { - 'single_quote': r"'([^'\\]|\\.)*'", - 'double_quote': r'"([^"\\\\]|\\\\.)*"', - 'template_literal': r'`([^`\\\\]|\\\\.)*`', - } - - strings = [] - for string_type, pattern in string_patterns.items(): - for match in re.finditer(pattern, content, re.MULTILINE): - strings.append({ - 'type': string_type, - 'value': match.group(0), - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - return strings - - def _extract_class_body(self, content: str, start_pos: int) -> str: - """Extract the body of a class from start position.""" - brace_count = 0 - i = start_pos - - # Find the opening brace - while i < len(content) and content[i] != '{': - i += 1 - - if i >= len(content): - return "" - - start_body = i + 1 - brace_count = 1 - i += 1 - - # Find the matching closing brace - while i < len(content) and brace_count > 0: - if content[i] == '{': - brace_count += 1 - elif content[i] == '}': - brace_count -= 1 - i += 1 - - if brace_count == 0: - return content[start_body:i-1] - - return "" - - def _extract_class_methods(self, class_body: str) -> List[str]: - """Extract method names from class body.""" - methods = [] - - method_pattern = r'(?:async\s+)?(\w+)\s*\([^)]*\)\s*\{' - for match in re.finditer(method_pattern, class_body, re.MULTILINE): - methods.append(match.group(1)) - - return methods - - def _extract_class_properties(self, class_body: str) -> List[str]: - """Extract property names from class body.""" - properties = [] - - property_patterns = [ - r'(\w+)\s*=', # Property assignment - r'(\w+)\s*;', # Property declaration (TypeScript) - ] - - for pattern in property_patterns: - for match in re.finditer(pattern, class_body, re.MULTILINE): - prop_name = match.group(1) - if prop_name not in ['constructor'] and not prop_name.startswith('_'): - properties.append(prop_name) - - return properties - - def is_typescript_file(self, file_path: str) -> bool: - """Check if file is TypeScript based on extension.""" - return file_path.endswith(('.ts', '.tsx')) - - def extract_typescript_features(self, content: str) -> Dict[str, List[Dict[str, Any]]]: - """Extract TypeScript-specific features.""" - if not self.is_typescript_file: - return {} - - features = { - 'interfaces': [], - 'types': [], - 'enums': [], - 'namespaces': [] - } - - # Extract interfaces - for match in re.finditer(self._symbol_patterns['interface'], content, re.MULTILINE): - features['interfaces'].append({ - 'name': match.group(1), - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - # Extract type aliases - for match in re.finditer(self._symbol_patterns['type'], content, re.MULTILINE): - features['types'].append({ - 'name': match.group(1), - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - # Extract enums - for match in re.finditer(self._symbol_patterns['enum'], content, re.MULTILINE): - features['enums'].append({ - 'name': match.group(1), - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - # Extract namespaces - for match in re.finditer(self._symbol_patterns['namespace'], content, re.MULTILINE): - features['namespaces'].append({ - 'name': match.group(1), - 'start': match.start(), - 'end': match.end(), - 'line': content[:match.start()].count('\n') - }) - - return features \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/__init__.py b/src/code_index_mcp/scip/framework/objective_c/__init__.py deleted file mode 100644 index ae824b0..0000000 --- a/src/code_index_mcp/scip/framework/objective_c/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Objective-C SCIP framework module.""" - -from .factory import ObjectiveCSCIPIndexFactory, create_objective_c_scip_factory -from .enum_mapper import ObjectiveCEnumMapper -from .relationship_extractor import ObjectiveCRelationshipExtractor -from .clang_analyzer import ObjectiveCClangAnalyzer - -__all__ = [ - 'ObjectiveCSCIPIndexFactory', - 'create_objective_c_scip_factory', - 'ObjectiveCEnumMapper', - 'ObjectiveCRelationshipExtractor', - 'ObjectiveCClangAnalyzer' -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py b/src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py deleted file mode 100644 index 8ddc861..0000000 --- a/src/code_index_mcp/scip/framework/objective_c/clang_analyzer.py +++ /dev/null @@ -1,338 +0,0 @@ -"""Objective-C libclang analyzer implementation.""" - -from typing import Iterator, Optional, Set, List, Dict, Any -from ..types import SCIPContext -from ..base.language_analyzer import BaseLanguageAnalyzer - -try: - import clang.cindex as clang - from clang.cindex import CursorKind, TypeKind - LIBCLANG_AVAILABLE = True -except ImportError: - LIBCLANG_AVAILABLE = False - clang = None - CursorKind = None - TypeKind = None - - -class ObjectiveCClangAnalyzer(BaseLanguageAnalyzer): - """Objective-C analyzer using libclang for AST parsing.""" - - def __init__(self): - """Initialize the Objective-C libclang analyzer.""" - if not LIBCLANG_AVAILABLE: - raise ImportError("libclang library not available") - - self.index = clang.Index.create() - self._processed_cursors: Set[int] = set() - - def parse(self, content: str, filename: str = ""): - """Parse Objective-C source code into libclang AST.""" - try: - # Create a temporary file for parsing - args = ['-x', 'objective-c', '-I/usr/include', '-I/usr/local/include'] - return self.index.parse(filename, args=args, unsaved_files=[(filename, content)]) - except Exception as e: - raise SyntaxError(f"Objective-C syntax error in {filename}: {e}") - - def walk(self, translation_unit) -> Iterator: - """Walk libclang cursor nodes, avoiding duplicates.""" - for cursor in self._walk_cursor(translation_unit.cursor): - cursor_id = hash((cursor.spelling, cursor.location.line, cursor.location.column)) - if cursor_id not in self._processed_cursors: - self._processed_cursors.add(cursor_id) - yield cursor - - def _walk_cursor(self, cursor) -> Iterator: - """Recursively walk cursor nodes.""" - yield cursor - for child in cursor.get_children(): - yield from self._walk_cursor(child) - - def is_symbol_definition(self, cursor) -> bool: - """Check if libclang cursor represents a symbol definition.""" - return cursor.kind in { - CursorKind.OBJC_INTERFACE_DECL, - CursorKind.OBJC_IMPLEMENTATION_DECL, - CursorKind.OBJC_PROTOCOL_DECL, - CursorKind.OBJC_CATEGORY_DECL, - CursorKind.OBJC_CATEGORY_IMPL_DECL, - CursorKind.OBJC_INSTANCE_METHOD_DECL, - CursorKind.OBJC_CLASS_METHOD_DECL, - CursorKind.OBJC_PROPERTY_DECL, - CursorKind.OBJC_IVAR_DECL, - CursorKind.CLASS_DECL, - CursorKind.STRUCT_DECL, - CursorKind.UNION_DECL, - CursorKind.ENUM_DECL, - CursorKind.FUNCTION_DECL, - CursorKind.VAR_DECL, - CursorKind.FIELD_DECL, - CursorKind.TYPEDEF_DECL, - CursorKind.MACRO_DEFINITION, - CursorKind.ENUM_CONSTANT_DECL, - } - - def is_symbol_reference(self, cursor) -> bool: - """Check if libclang cursor represents a symbol reference.""" - return cursor.kind in { - CursorKind.DECL_REF_EXPR, - CursorKind.MEMBER_REF_EXPR, - CursorKind.OBJC_MESSAGE_EXPR, - CursorKind.OBJC_SELECTOR_REF, - CursorKind.OBJC_PROTOCOL_REF, - CursorKind.OBJC_CLASS_REF, - CursorKind.OBJC_SUPER_CLASS_REF, - CursorKind.TYPE_REF, - CursorKind.CALL_EXPR, - } - - def get_symbol_name(self, cursor) -> Optional[str]: - """Extract symbol name from libclang cursor.""" - return cursor.spelling if cursor.spelling else None - - def get_node_position(self, cursor) -> tuple: - """Get position information from libclang cursor.""" - start_line = cursor.location.line - 1 # Convert to 0-based - start_col = cursor.location.column - 1 - - # Estimate end position based on symbol name length - if cursor.spelling: - end_line = start_line - end_col = start_col + len(cursor.spelling) - else: - end_line = start_line - end_col = start_col + 1 - - return (start_line, start_col, end_line, end_col) - - def extract_interface_info(self, translation_unit) -> List[Dict[str, Any]]: - """Extract Objective-C interface information from the AST.""" - interfaces = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: - interface_info = { - 'name': cursor.spelling, - 'type': 'interface', - 'position': self.get_node_position(cursor), - 'superclass': self._extract_superclass(cursor), - 'protocols': self._extract_protocols(cursor), - 'methods': self._extract_interface_methods(cursor), - 'properties': self._extract_interface_properties(cursor), - } - interfaces.append(interface_info) - - return interfaces - - def extract_implementation_info(self, translation_unit) -> List[Dict[str, Any]]: - """Extract Objective-C implementation information from the AST.""" - implementations = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.OBJC_IMPLEMENTATION_DECL: - impl_info = { - 'name': cursor.spelling, - 'type': 'implementation', - 'position': self.get_node_position(cursor), - 'methods': self._extract_implementation_methods(cursor), - 'ivars': self._extract_implementation_ivars(cursor), - } - implementations.append(impl_info) - - return implementations - - def extract_protocol_info(self, translation_unit) -> List[Dict[str, Any]]: - """Extract Objective-C protocol information from the AST.""" - protocols = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.OBJC_PROTOCOL_DECL: - protocol_info = { - 'name': cursor.spelling, - 'type': 'protocol', - 'position': self.get_node_position(cursor), - 'parent_protocols': self._extract_parent_protocols(cursor), - 'methods': self._extract_protocol_methods(cursor), - 'properties': self._extract_protocol_properties(cursor), - } - protocols.append(protocol_info) - - return protocols - - def extract_method_info(self, translation_unit) -> List[Dict[str, Any]]: - """Extract method information from the AST.""" - methods = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - method_info = { - 'name': cursor.spelling, - 'type': 'instance_method' if cursor.objc_method_kind == 1 else 'class_method', - 'position': self.get_node_position(cursor), - 'return_type': self._extract_return_type(cursor), - 'parameters': self._extract_method_parameters(cursor), - 'is_definition': cursor.is_definition(), - } - methods.append(method_info) - - return methods - - def extract_property_info(self, translation_unit) -> List[Dict[str, Any]]: - """Extract property information from the AST.""" - properties = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.OBJC_PROPERTY_DECL: - property_info = { - 'name': cursor.spelling, - 'type': 'property', - 'position': self.get_node_position(cursor), - 'property_type': self._extract_property_type(cursor), - 'attributes': self._extract_property_attributes(cursor), - } - properties.append(property_info) - - return properties - - def extract_include_statements(self, translation_unit) -> List[str]: - """Extract include statements from the AST.""" - includes = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.INCLUSION_DIRECTIVE: - included_file = cursor.get_included_file() - if included_file: - includes.append(included_file.name) - - return includes - - def extract_category_info(self, translation_unit) -> List[Dict[str, Any]]: - """Extract Objective-C category information from the AST.""" - categories = [] - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind in [CursorKind.OBJC_CATEGORY_DECL, CursorKind.OBJC_CATEGORY_IMPL_DECL]: - category_info = { - 'name': cursor.spelling, - 'type': 'category_interface' if cursor.kind == CursorKind.OBJC_CATEGORY_DECL else 'category_implementation', - 'position': self.get_node_position(cursor), - 'extended_class': self._extract_extended_class(cursor), - 'methods': self._extract_category_methods(cursor), - } - categories.append(category_info) - - return categories - - def _extract_superclass(self, interface_cursor) -> Optional[str]: - """Extract superclass name from interface declaration.""" - for child in interface_cursor.get_children(): - if child.kind == CursorKind.OBJC_SUPER_CLASS_REF: - return child.spelling - return None - - def _extract_protocols(self, interface_cursor) -> List[str]: - """Extract protocol names from interface declaration.""" - protocols = [] - for child in interface_cursor.get_children(): - if child.kind == CursorKind.OBJC_PROTOCOL_REF: - protocols.append(child.spelling) - return protocols - - def _extract_parent_protocols(self, protocol_cursor) -> List[str]: - """Extract parent protocol names from protocol declaration.""" - protocols = [] - for child in protocol_cursor.get_children(): - if child.kind == CursorKind.OBJC_PROTOCOL_REF: - protocols.append(child.spelling) - return protocols - - def _extract_interface_methods(self, interface_cursor) -> List[str]: - """Extract method names from interface declaration.""" - methods = [] - for child in interface_cursor.get_children(): - if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - methods.append(child.spelling) - return methods - - def _extract_implementation_methods(self, impl_cursor) -> List[str]: - """Extract method names from implementation.""" - methods = [] - for child in impl_cursor.get_children(): - if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - methods.append(child.spelling) - return methods - - def _extract_protocol_methods(self, protocol_cursor) -> List[str]: - """Extract method names from protocol declaration.""" - methods = [] - for child in protocol_cursor.get_children(): - if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - methods.append(child.spelling) - return methods - - def _extract_category_methods(self, category_cursor) -> List[str]: - """Extract method names from category.""" - methods = [] - for child in category_cursor.get_children(): - if child.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - methods.append(child.spelling) - return methods - - def _extract_interface_properties(self, interface_cursor) -> List[str]: - """Extract property names from interface declaration.""" - properties = [] - for child in interface_cursor.get_children(): - if child.kind == CursorKind.OBJC_PROPERTY_DECL: - properties.append(child.spelling) - return properties - - def _extract_protocol_properties(self, protocol_cursor) -> List[str]: - """Extract property names from protocol declaration.""" - properties = [] - for child in protocol_cursor.get_children(): - if child.kind == CursorKind.OBJC_PROPERTY_DECL: - properties.append(child.spelling) - return properties - - def _extract_implementation_ivars(self, impl_cursor) -> List[str]: - """Extract instance variable names from implementation.""" - ivars = [] - for child in impl_cursor.get_children(): - if child.kind == CursorKind.OBJC_IVAR_DECL: - ivars.append(child.spelling) - return ivars - - def _extract_extended_class(self, category_cursor) -> Optional[str]: - """Extract the class name that a category extends.""" - # The extended class is typically the first child that's a class reference - for child in category_cursor.get_children(): - if child.kind == CursorKind.OBJC_CLASS_REF: - return child.spelling - return None - - def _extract_return_type(self, method_cursor) -> Optional[str]: - """Extract return type from method declaration.""" - return method_cursor.result_type.spelling if method_cursor.result_type else None - - def _extract_method_parameters(self, method_cursor) -> List[Dict[str, str]]: - """Extract parameter information from method declaration.""" - parameters = [] - for child in method_cursor.get_children(): - if child.kind == CursorKind.PARM_DECL: - param_info = { - 'name': child.spelling, - 'type': child.type.spelling if child.type else 'unknown' - } - parameters.append(param_info) - return parameters - - def _extract_property_type(self, property_cursor) -> Optional[str]: - """Extract property type from property declaration.""" - return property_cursor.type.spelling if property_cursor.type else None - - def _extract_property_attributes(self, property_cursor) -> List[str]: - """Extract property attributes (readonly, strong, etc.).""" - # This is a simplified implementation - libclang doesn't easily expose - # property attributes, so we'd need to parse the source text for full accuracy - return [] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/enum_mapper.py b/src/code_index_mcp/scip/framework/objective_c/enum_mapper.py deleted file mode 100644 index 5d2f62b..0000000 --- a/src/code_index_mcp/scip/framework/objective_c/enum_mapper.py +++ /dev/null @@ -1,228 +0,0 @@ -"""Objective-C enum mapper implementation.""" - -from typing import Dict -from ..base.enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 - - -class ObjectiveCEnumMapper(BaseEnumMapper): - """Objective-C-specific enum mapper for SCIP compliance.""" - - # Objective-C symbol kind mappings - SYMBOL_KIND_MAP = { - 'method': scip_pb2.Method, - 'class': scip_pb2.Class, - 'interface': scip_pb2.Interface, - 'protocol': scip_pb2.Interface, # Protocols are similar to interfaces - 'category': scip_pb2.Class, # Categories extend classes - 'enum': scip_pb2.Enum, - 'field': scip_pb2.Field, - 'property': scip_pb2.Property, - 'variable': scip_pb2.Variable, - 'parameter': scip_pb2.Parameter, - 'function': scip_pb2.Function, - 'macro': scip_pb2.Macro, - 'constant': scip_pb2.Constant, - 'typedef': scip_pb2.Type, - 'struct': scip_pb2.Struct, - 'union': scip_pb2.Struct, - 'ivar': scip_pb2.Field, # Instance variables - } - - # Objective-C syntax kind mappings - SYNTAX_KIND_MAP = { - 'method_declaration': scip_pb2.IdentifierFunctionDefinition, - 'class_declaration': scip_pb2.IdentifierType, - 'interface_declaration': scip_pb2.IdentifierType, - 'protocol_declaration': scip_pb2.IdentifierType, - 'category_declaration': scip_pb2.IdentifierType, - 'enum_declaration': scip_pb2.IdentifierType, - 'field_declaration': scip_pb2.IdentifierAttribute, - 'property_declaration': scip_pb2.IdentifierAttribute, - 'variable_declaration': scip_pb2.IdentifierLocal, - 'parameter_declaration': scip_pb2.IdentifierParameter, - 'function_declaration': scip_pb2.IdentifierFunctionDefinition, - 'macro_declaration': scip_pb2.IdentifierKeyword, - 'typedef_declaration': scip_pb2.IdentifierType, - 'struct_declaration': scip_pb2.IdentifierType, - 'union_declaration': scip_pb2.IdentifierType, - 'identifier': scip_pb2.Identifier, - 'keyword': scip_pb2.IdentifierKeyword, - 'string_literal': scip_pb2.StringLiteral, - 'numeric_literal': scip_pb2.NumericLiteral, - 'boolean_literal': scip_pb2.BooleanLiteral, - 'comment': scip_pb2.Comment, - 'punctuation': scip_pb2.PunctuationDelimiter, - } - - # Objective-C symbol role mappings (official SCIP naming) - SYMBOL_ROLE_MAP = { - 'definition': scip_pb2.Definition, - 'import': scip_pb2.Import, - 'write': scip_pb2.Write, # Official SCIP naming - 'read': scip_pb2.Read, # Official SCIP naming - 'generated': scip_pb2.Generated, - 'test': scip_pb2.Test, - 'type': scip_pb2.Type, # Add missing Type role - 'reference': scip_pb2.Read, # Default reference is read access - } - - def map_symbol_kind(self, language_kind: str) -> int: - """Map Objective-C symbol type to SCIP SymbolKind.""" - kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SymbolKind'): - raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") - - return kind - - def map_syntax_kind(self, language_syntax: str) -> int: - """Map Objective-C syntax element to SCIP SyntaxKind.""" - kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SyntaxKind'): - raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") - - return kind - - def map_symbol_role(self, language_role: str) -> int: - """Map Objective-C symbol role to SCIP SymbolRole.""" - role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) - - # Validate enum value - if not self.validate_enum_value(role, 'SymbolRole'): - raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") - - return role - - def get_objc_cursor_symbol_kind(self, cursor_kind: str) -> str: - """ - Map libclang cursor kind to internal symbol kind string. - - Args: - cursor_kind: libclang cursor kind (e.g., 'OBJC_INTERFACE_DECL', 'OBJC_INSTANCE_METHOD_DECL') - - Returns: - Internal symbol kind string for use with map_symbol_kind() - """ - cursor_kind_map = { - 'OBJC_INTERFACE_DECL': 'interface', - 'OBJC_IMPLEMENTATION_DECL': 'class', - 'OBJC_PROTOCOL_DECL': 'protocol', - 'OBJC_CATEGORY_DECL': 'category', - 'OBJC_CATEGORY_IMPL_DECL': 'category', - 'OBJC_INSTANCE_METHOD_DECL': 'method', - 'OBJC_CLASS_METHOD_DECL': 'method', - 'OBJC_PROPERTY_DECL': 'property', - 'OBJC_IVAR_DECL': 'ivar', - 'CLASS_DECL': 'class', - 'STRUCT_DECL': 'struct', - 'UNION_DECL': 'union', - 'ENUM_DECL': 'enum', - 'FUNCTION_DECL': 'function', - 'VAR_DECL': 'variable', - 'PARM_DECL': 'parameter', - 'FIELD_DECL': 'field', - 'TYPEDEF_DECL': 'typedef', - 'MACRO_DEFINITION': 'macro', - 'ENUM_CONSTANT_DECL': 'constant', - } - - return cursor_kind_map.get(cursor_kind, 'variable') - - def get_objc_cursor_syntax_kind(self, cursor_kind: str, context: str = None) -> str: - """ - Map libclang cursor kind to internal syntax kind string. - - Args: - cursor_kind: libclang cursor kind - context: Additional context for disambiguation - - Returns: - Internal syntax kind string for use with map_syntax_kind() - """ - cursor_syntax_map = { - 'OBJC_INTERFACE_DECL': 'interface_declaration', - 'OBJC_IMPLEMENTATION_DECL': 'class_declaration', - 'OBJC_PROTOCOL_DECL': 'protocol_declaration', - 'OBJC_CATEGORY_DECL': 'category_declaration', - 'OBJC_CATEGORY_IMPL_DECL': 'category_declaration', - 'OBJC_INSTANCE_METHOD_DECL': 'method_declaration', - 'OBJC_CLASS_METHOD_DECL': 'method_declaration', - 'OBJC_PROPERTY_DECL': 'property_declaration', - 'OBJC_IVAR_DECL': 'field_declaration', - 'CLASS_DECL': 'class_declaration', - 'STRUCT_DECL': 'struct_declaration', - 'UNION_DECL': 'union_declaration', - 'ENUM_DECL': 'enum_declaration', - 'FUNCTION_DECL': 'function_declaration', - 'VAR_DECL': 'variable_declaration', - 'PARM_DECL': 'parameter_declaration', - 'FIELD_DECL': 'field_declaration', - 'TYPEDEF_DECL': 'typedef_declaration', - 'MACRO_DEFINITION': 'macro_declaration', - } - - return cursor_syntax_map.get(cursor_kind, 'identifier') - - def get_objc_cursor_symbol_role(self, cursor_kind: str, context: str = None) -> str: - """ - Map libclang cursor kind to internal symbol role string. - - Args: - cursor_kind: libclang cursor kind - context: Additional context (e.g., 'in_assignment', 'in_call') - - Returns: - Internal symbol role string for use with map_symbol_role() - """ - if context == 'definition': - return 'definition' - elif context == 'assignment': - return 'write' - elif context == 'import': - return 'import' - elif cursor_kind in ['OBJC_INTERFACE_DECL', 'OBJC_IMPLEMENTATION_DECL', 'OBJC_PROTOCOL_DECL', - 'OBJC_CATEGORY_DECL', 'OBJC_INSTANCE_METHOD_DECL', 'OBJC_CLASS_METHOD_DECL', 'OBJC_PROPERTY_DECL', - 'CLASS_DECL', 'STRUCT_DECL', 'FUNCTION_DECL', 'VAR_DECL', 'TYPEDEF_DECL']: - return 'definition' - else: - return 'reference' - - def is_valid_objc_symbol_kind(self, symbol_kind: str) -> bool: - """Check if symbol kind is valid for Objective-C.""" - return symbol_kind in self.SYMBOL_KIND_MAP - - def is_valid_objc_syntax_kind(self, syntax_kind: str) -> bool: - """Check if syntax kind is valid for Objective-C.""" - return syntax_kind in self.SYNTAX_KIND_MAP - - def is_valid_objc_symbol_role(self, symbol_role: str) -> bool: - """Check if symbol role is valid for Objective-C.""" - return symbol_role in self.SYMBOL_ROLE_MAP - - def get_all_objc_symbol_kinds(self) -> list: - """Get all available Objective-C symbol kinds.""" - return list(self.SYMBOL_KIND_MAP.keys()) - - def get_all_objc_syntax_kinds(self) -> list: - """Get all available Objective-C syntax kinds.""" - return list(self.SYNTAX_KIND_MAP.keys()) - - def get_all_objc_symbol_roles(self) -> list: - """Get all available Objective-C symbol roles.""" - return list(self.SYMBOL_ROLE_MAP.keys()) - - def get_objective_c_specific_kinds(self) -> Dict[str, str]: - """Get Objective-C-specific symbol kinds.""" - return { - 'interface': 'interface', - 'protocol': 'protocol', - 'category': 'category', - 'property': 'property', - 'ivar': 'ivar', - 'class_method': 'method', - 'instance_method': 'method', - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/factory.py b/src/code_index_mcp/scip/framework/objective_c/factory.py deleted file mode 100644 index cfdfd9a..0000000 --- a/src/code_index_mcp/scip/framework/objective_c/factory.py +++ /dev/null @@ -1,500 +0,0 @@ -"""Objective-C SCIP Index Factory implementation.""" - -import os -from pathlib import Path -from typing import Set, List, Iterator, Optional -from ..base.index_factory import SCIPIndexFactory -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..base.enum_mapper import BaseEnumMapper -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from ..types import SCIPContext, SCIPSymbolDescriptor -from .relationship_extractor import ObjectiveCRelationshipExtractor -from .enum_mapper import ObjectiveCEnumMapper -from .clang_analyzer import ObjectiveCClangAnalyzer -from ...proto import scip_pb2 - -try: - import clang.cindex as clang - from clang.cindex import CursorKind - LIBCLANG_AVAILABLE = True -except ImportError: - LIBCLANG_AVAILABLE = False - clang = None - CursorKind = None - - -class ObjectiveCSCIPIndexFactory(SCIPIndexFactory): - """Objective-C-specific SCIP Index factory implementation with constructor injection.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """Initialize Objective-C factory with required components via constructor injection.""" - if not LIBCLANG_AVAILABLE: - raise ImportError("libclang library not available") - - super().__init__(project_root, symbol_generator, relationship_extractor, - enum_mapper, position_calculator) - self.clang_analyzer = ObjectiveCClangAnalyzer() - - def get_language(self) -> str: - """Return language identifier.""" - return "objective-c" - - def get_supported_extensions(self) -> Set[str]: - """Return supported file extensions.""" - return {'.m', '.mm', '.h'} - - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract Objective-C symbol definitions using libclang analysis.""" - try: - translation_unit = self.clang_analyzer.parse(context.content, context.file_path) - - for cursor in self.clang_analyzer.walk(translation_unit): - if self.clang_analyzer.is_symbol_definition(cursor): - symbol_info = self._create_symbol_from_clang_cursor(cursor, context) - if symbol_info: - yield symbol_info - - except SyntaxError as e: - # Handle syntax errors gracefully - pass - - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract Objective-C symbol occurrences.""" - try: - translation_unit = self.clang_analyzer.parse(context.content, context.file_path) - - for cursor in self.clang_analyzer.walk(translation_unit): - if (self.clang_analyzer.is_symbol_definition(cursor) or - self.clang_analyzer.is_symbol_reference(cursor)): - occurrence = self._create_occurrence_from_clang_cursor(cursor, context) - if occurrence: - yield occurrence - - except SyntaxError as e: - # Handle syntax errors gracefully - pass - - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract Objective-C external symbols from imports.""" - external_symbols = [] - - for doc in documents: - try: - content = self._read_file(os.path.join(self.project_root, doc.relative_path)) - translation_unit = self.clang_analyzer.parse(content, doc.relative_path) - - # Extract include statements - include_statements = self.clang_analyzer.extract_include_statements(translation_unit) - for include_path in include_statements: - external_symbol = self._create_external_symbol_from_include(include_path) - if external_symbol: - external_symbols.append(external_symbol) - - except Exception as e: - # Skip problematic files - continue - - return external_symbols - - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build Objective-C-specific cross-document relationships. - - This implementation provides basic cross-document relationship support - for Objective-C. A more sophisticated implementation would analyze - #import/#include statements and framework dependencies. - """ - # For now, use a simplified approach - # TODO: Implement proper Objective-C import analysis - return 0 # Placeholder - no relationships added yet - - def _create_symbol_from_clang_cursor(self, cursor, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information from libclang cursor.""" - symbol_info = scip_pb2.SymbolInformation() - - symbol_name = self.clang_analyzer.get_symbol_name(cursor) - if not symbol_name: - return None - - if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('interface') - - elif cursor.kind == CursorKind.OBJC_IMPLEMENTATION_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('class') - - elif cursor.kind == CursorKind.OBJC_PROTOCOL_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('protocol') - - elif cursor.kind in [CursorKind.OBJC_CATEGORY_DECL, CursorKind.OBJC_CATEGORY_IMPL_DECL]: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('category') - - elif cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('method') - - elif cursor.kind == CursorKind.OBJC_PROPERTY_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('property') - - elif cursor.kind == CursorKind.OBJC_IVAR_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('ivar') - - elif cursor.kind == CursorKind.FUNCTION_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('function') - - elif cursor.kind == CursorKind.VAR_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('variable') - - elif cursor.kind == CursorKind.ENUM_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('enum') - - elif cursor.kind == CursorKind.STRUCT_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('struct') - - elif cursor.kind == CursorKind.TYPEDEF_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('typedef') - - elif cursor.kind == CursorKind.MACRO_DEFINITION: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('macro') - - else: - return None - - return symbol_info - - def _create_occurrence_from_clang_cursor(self, cursor, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence from libclang cursor.""" - occurrence = scip_pb2.Occurrence() - - # Calculate position using position calculator - try: - position_info = self.position_calculator.calculate_positions_from_clang_cursor( - context.content, cursor - ) - - # Set range - occurrence.range.start.extend([position_info.start_line, position_info.start_column]) - occurrence.range.end.extend([position_info.end_line, position_info.end_column]) - - except Exception as e: - # Skip if position calculation fails - return None - - symbol_name = self.clang_analyzer.get_symbol_name(cursor) - if not symbol_name: - return None - - # Set symbol and roles based on cursor type - if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('interface_declaration') - - elif cursor.kind == CursorKind.OBJC_IMPLEMENTATION_DECL: - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_declaration') - - elif cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('method_declaration') - - elif cursor.kind in [CursorKind.DECL_REF_EXPR, CursorKind.MEMBER_REF_EXPR]: - # Handle variable references - descriptor = SCIPSymbolDescriptor( - - name=symbol_name, - - kind="variable", - - scope_path=context.scope_stack, - - descriptor_suffix="" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') - - else: - return None - - return occurrence - - def _create_external_symbol_from_include(self, include_path: str) -> Optional[scip_pb2.SymbolInformation]: - """Create external symbol from include statement.""" - symbol_info = scip_pb2.SymbolInformation() - - # Determine if it's a system header or local header - if include_path.startswith('/System/') or include_path.startswith('/usr/'): - # System framework or library - symbol_info.symbol = f"objc-system {include_path}" - symbol_info.display_name = include_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"System header: {include_path}") - elif 'Frameworks' in include_path: - # Framework - symbol_info.symbol = f"objc-framework {include_path}" - symbol_info.display_name = include_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"Framework header: {include_path}") - else: - # Local or external header - symbol_info.symbol = f"objc-external {include_path}" - symbol_info.display_name = include_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"External header: {include_path}") - - return symbol_info - - -def create_objective_c_scip_factory(project_root: str) -> ObjectiveCSCIPIndexFactory: - """ - Factory creator for Objective-C SCIP factory. - Ensures all required components are properly assembled via constructor injection. - """ - if not LIBCLANG_AVAILABLE: - raise ImportError("libclang library not available") - - symbol_generator = SCIPSymbolGenerator( - scheme="scip-objc", - package_manager="xcode", - package_name=Path(project_root).name, - version="HEAD" - ) - - relationship_extractor = ObjectiveCRelationshipExtractor() - enum_mapper = ObjectiveCEnumMapper() - position_calculator = SCIPPositionCalculator() - - return ObjectiveCSCIPIndexFactory( - project_root=project_root, - symbol_generator=symbol_generator, - relationship_extractor=relationship_extractor, # Guaranteed to be provided - enum_mapper=enum_mapper, - position_calculator=position_calculator - ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py b/src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py deleted file mode 100644 index bf884af..0000000 --- a/src/code_index_mcp/scip/framework/objective_c/relationship_extractor.py +++ /dev/null @@ -1,276 +0,0 @@ -"""Objective-C relationship extractor implementation.""" - -from typing import Iterator, Optional, List -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..types import SCIPContext, Relationship -from ...core.relationship_types import InternalRelationshipType - -try: - import clang.cindex as clang - from clang.cindex import CursorKind - LIBCLANG_AVAILABLE = True -except ImportError: - LIBCLANG_AVAILABLE = False - clang = None - CursorKind = None - - -class ObjectiveCRelationshipExtractor(BaseRelationshipExtractor): - """Objective-C-specific relationship extractor using libclang analysis.""" - - def __init__(self): - """Initialize the Objective-C relationship extractor.""" - if not LIBCLANG_AVAILABLE: - raise ImportError("libclang library not available") - - self.index = clang.Index.create() - - def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract inheritance relationships from Objective-C classes and protocols.""" - try: - translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: - interface_name = cursor.spelling - if not interface_name: - continue - - interface_symbol_id = self._create_interface_symbol_id(interface_name, context) - - # Look for superclass - for child in cursor.get_children(): - if child.kind == CursorKind.OBJC_SUPER_CLASS_REF: - parent_name = child.spelling - parent_symbol_id = self._create_interface_symbol_id(parent_name, context) - yield Relationship( - source_symbol=interface_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - elif cursor.kind == CursorKind.OBJC_PROTOCOL_DECL: - protocol_name = cursor.spelling - if not protocol_name: - continue - - protocol_symbol_id = self._create_protocol_symbol_id(protocol_name, context) - - # Look for protocol inheritance - for child in cursor.get_children(): - if child.kind == CursorKind.OBJC_PROTOCOL_REF: - parent_protocol_name = child.spelling - parent_symbol_id = self._create_protocol_symbol_id(parent_protocol_name, context) - yield Relationship( - source_symbol=protocol_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract method call relationships.""" - try: - translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind in (CursorKind.OBJC_INSTANCE_METHOD_DECL, CursorKind.OBJC_CLASS_METHOD_DECL): - method_name = cursor.spelling - if not method_name: - continue - - method_symbol_id = self._create_method_symbol_id(method_name, context) - - # Find method calls within this method - for child in self._walk_cursor(cursor): - if child.kind == CursorKind.OBJC_MESSAGE_EXPR: - target_method = self._get_message_target(child) - if target_method and target_method != method_name: - target_symbol_id = self._create_method_symbol_id(target_method, context) - yield Relationship( - source_symbol=method_symbol_id, - target_symbol=target_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - elif child.kind == CursorKind.CALL_EXPR: - # C function calls - target_function = child.spelling - if target_function and target_function != method_name: - target_symbol_id = self._create_function_symbol_id(target_function, context) - yield Relationship( - source_symbol=method_symbol_id, - target_symbol=target_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract import/dependency relationships.""" - try: - translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) - - file_symbol_id = self._create_file_symbol_id(context.file_path) - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.INCLUSION_DIRECTIVE: - include_path = self._get_include_path(cursor) - if include_path: - # Determine if it's a system header or local header - if include_path.startswith('<') and include_path.endswith('>'): - # System header - module_symbol_id = f"objc-system {include_path[1:-1]}" - elif include_path.startswith('"') and include_path.endswith('"'): - # Local header - module_symbol_id = f"local {include_path[1:-1]}" - else: - module_symbol_id = f"objc-external {include_path}" - - yield Relationship( - source_symbol=file_symbol_id, - target_symbol=module_symbol_id, - relationship_type=InternalRelationshipType.IMPORTS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract composition relationships (properties, ivars).""" - try: - translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind in [CursorKind.OBJC_INTERFACE_DECL, CursorKind.OBJC_IMPLEMENTATION_DECL]: - class_name = cursor.spelling - if not class_name: - continue - - class_symbol_id = self._create_class_symbol_id(class_name, context) - - # Find properties and ivars in this class - for child in cursor.get_children(): - if child.kind == CursorKind.OBJC_PROPERTY_DECL: - property_name = child.spelling - if property_name: - property_symbol_id = self._create_property_symbol_id(property_name, class_symbol_id) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=property_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - elif child.kind == CursorKind.OBJC_IVAR_DECL: - ivar_name = child.spelling - if ivar_name: - ivar_symbol_id = self._create_ivar_symbol_id(ivar_name, class_symbol_id) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=ivar_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract protocol implementation relationships.""" - try: - translation_unit = self.index.parse(context.file_path, args=['-x', 'objective-c']) - - for cursor in self._walk_cursor(translation_unit.cursor): - if cursor.kind == CursorKind.OBJC_INTERFACE_DECL: - interface_name = cursor.spelling - if not interface_name: - continue - - interface_symbol_id = self._create_interface_symbol_id(interface_name, context) - - # Look for protocol conformance - for child in cursor.get_children(): - if child.kind == CursorKind.OBJC_PROTOCOL_REF: - protocol_name = child.spelling - protocol_symbol_id = self._create_protocol_symbol_id(protocol_name, context) - yield Relationship( - source_symbol=interface_symbol_id, - target_symbol=protocol_symbol_id, - relationship_type=InternalRelationshipType.IMPLEMENTS - ) - - except Exception: - # Skip files with parsing errors - return - - def _walk_cursor(self, cursor) -> Iterator: - """Walk libclang cursor tree.""" - yield cursor - for child in cursor.get_children(): - yield from self._walk_cursor(child) - - def _get_message_target(self, message_expr_cursor) -> Optional[str]: - """Extract target method name from Objective-C message expression.""" - # Get the selector name from the message expression - for child in message_expr_cursor.get_children(): - if child.kind == CursorKind.OBJC_SELECTOR_REF: - return child.spelling - return None - - def _get_include_path(self, inclusion_cursor) -> Optional[str]: - """Extract include path from inclusion directive.""" - # Get the included file path - included_file = inclusion_cursor.get_included_file() - if included_file: - return included_file.name - return None - - def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: - """Create symbol ID for class.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{class_name}" if scope_path else class_name - return f"local {local_id}#" - - def _create_interface_symbol_id(self, interface_name: str, context: SCIPContext) -> str: - """Create symbol ID for interface.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{interface_name}" if scope_path else interface_name - return f"local {local_id}#" - - def _create_protocol_symbol_id(self, protocol_name: str, context: SCIPContext) -> str: - """Create symbol ID for protocol.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{protocol_name}" if scope_path else protocol_name - return f"local {local_id}#" - - def _create_method_symbol_id(self, method_name: str, context: SCIPContext) -> str: - """Create symbol ID for method.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{method_name}" if scope_path else method_name - return f"local {local_id}()." - - def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: - """Create symbol ID for C function.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{function_name}" if scope_path else function_name - return f"local {local_id}()." - - def _create_property_symbol_id(self, property_name: str, class_symbol_id: str) -> str: - """Create symbol ID for property.""" - # Extract class name from class symbol ID - class_name = class_symbol_id.replace("local ", "").replace("#", "") - return f"local {class_name}.{property_name}" - - def _create_ivar_symbol_id(self, ivar_name: str, class_symbol_id: str) -> str: - """Create symbol ID for instance variable.""" - # Extract class name from class symbol ID - class_name = class_symbol_id.replace("local ", "").replace("#", "") - return f"local {class_name}.{ivar_name}" - - def _create_file_symbol_id(self, file_path: str) -> str: - """Create symbol ID for file.""" - return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/position_calculator.py b/src/code_index_mcp/scip/framework/position_calculator.py deleted file mode 100644 index 6d4364d..0000000 --- a/src/code_index_mcp/scip/framework/position_calculator.py +++ /dev/null @@ -1,225 +0,0 @@ -"""SCIP Position Calculator - UTF-8/UTF-16 compliant position calculation.""" - -import logging -from typing import Tuple, Optional, Any -from .types import SCIPPositionInfo - - -logger = logging.getLogger(__name__) - - -class SCIPPositionCalculator: - """SCIP position calculator - UTF-8/UTF-16 compliant with mandatory validation.""" - - def __init__(self, encoding: str = "utf-8"): - """Initialize position calculator with specified encoding.""" - self.encoding = encoding - self._line_cache = {} # Cache for line information - - def calculate_positions(self, content: str, node_info: Any) -> SCIPPositionInfo: - """Calculate precise positions with mandatory validation.""" - - # Language-specific node position extraction logic - start_line, start_col, end_line, end_col = self._extract_node_positions(content, node_info) - - # Create position information - position = SCIPPositionInfo(start_line, start_col, end_line, end_col) - - # Mandatory validation - if not position.validate(): - raise ValueError(f"Invalid position: {position}") - - # Validate within document bounds - if not self._is_within_bounds(position, content): - raise ValueError(f"Position out of document bounds: {position}") - - return position - - def calculate_positions_from_range(self, content: str, start_byte: int, end_byte: int) -> SCIPPositionInfo: - """Calculate positions from byte ranges (useful for tree-sitter nodes).""" - lines = content.split('\n') - - # Convert byte offsets to line/column positions - start_line, start_col = self._byte_offset_to_line_col(content, start_byte, lines) - end_line, end_col = self._byte_offset_to_line_col(content, end_byte, lines) - - position = SCIPPositionInfo(start_line, start_col, end_line, end_col) - - # Mandatory validation - if not position.validate(): - raise ValueError(f"Invalid position calculated from bytes [{start_byte}:{end_byte}]: {position}") - - if not self._is_within_bounds(position, content): - raise ValueError(f"Position out of document bounds: {position}") - - return position - - def calculate_positions_from_line_col(self, content: str, start_line: int, start_col: int, - end_line: int, end_col: int) -> SCIPPositionInfo: - """Calculate positions from explicit line/column coordinates.""" - position = SCIPPositionInfo(start_line, start_col, end_line, end_col) - - # Mandatory validation - if not position.validate(): - raise ValueError(f"Invalid position: {position}") - - # Validate within document bounds - if not self._is_within_bounds(position, content): - raise ValueError(f"Position out of document bounds: {position}") - - return position - - def _extract_node_positions(self, content: str, node_info: Any) -> Tuple[int, int, int, int]: - """Extract node positions - subclass implementation required.""" - # Default implementation for objects with line/column attributes - if hasattr(node_info, 'lineno') and hasattr(node_info, 'col_offset'): - # AST node (Python) - start_line = node_info.lineno - 1 # Convert to 0-indexed - start_col = node_info.col_offset - - # Estimate end position if not available - if hasattr(node_info, 'end_lineno') and hasattr(node_info, 'end_col_offset'): - end_line = node_info.end_lineno - 1 - end_col = node_info.end_col_offset - else: - # Fallback: assume single token - end_line = start_line - end_col = start_col + len(getattr(node_info, 'name', 'unknown')) - - return start_line, start_col, end_line, end_col - - elif hasattr(node_info, 'start_point') and hasattr(node_info, 'end_point'): - # Tree-sitter node - start_line = node_info.start_point[0] - start_col = node_info.start_point[1] - end_line = node_info.end_point[0] - end_col = node_info.end_point[1] - - return start_line, start_col, end_line, end_col - - elif isinstance(node_info, dict): - # Dictionary format - return ( - node_info.get('start_line', 0), - node_info.get('start_col', 0), - node_info.get('end_line', 0), - node_info.get('end_col', 0) - ) - - else: - raise NotImplementedError(f"Position extraction not implemented for node type: {type(node_info)}") - - def _byte_offset_to_line_col(self, content: str, byte_offset: int, lines: list) -> Tuple[int, int]: - """Convert byte offset to line/column position with UTF-8 awareness.""" - if byte_offset == 0: - return 0, 0 - - # Convert content to bytes for accurate offset calculation - content_bytes = content.encode(self.encoding) - - if byte_offset >= len(content_bytes): - # End of file - return len(lines) - 1, len(lines[-1]) if lines else 0 - - # Find the line containing this byte offset - current_byte = 0 - for line_num, line in enumerate(lines): - line_bytes = (line + '\n').encode(self.encoding) if line_num < len(lines) - 1 else line.encode(self.encoding) - - if current_byte + len(line_bytes) > byte_offset: - # Byte offset is within this line - offset_in_line = byte_offset - current_byte - # Convert byte offset within line to character position - line_text = line_bytes[:offset_in_line].decode(self.encoding, errors='ignore') - return line_num, len(line_text) - - current_byte += len(line_bytes) - - # Fallback - return len(lines) - 1, len(lines[-1]) if lines else 0 - - def _is_within_bounds(self, position: SCIPPositionInfo, content: str) -> bool: - """Validate position is within document bounds.""" - lines = content.split('\n') - max_line = len(lines) - 1 - - # Check line bounds - if position.start_line < 0 or position.end_line > max_line: - return False - - # Check column bounds for start position - if position.start_line <= max_line: - max_start_col = len(lines[position.start_line]) - if position.start_column < 0 or position.start_column > max_start_col: - return False - - # Check column bounds for end position - if position.end_line <= max_line: - max_end_col = len(lines[position.end_line]) - if position.end_column < 0 or position.end_column > max_end_col: - return False - - return True - - def _is_utf8_compliant(self, position: SCIPPositionInfo, content: str) -> bool: - """Validate UTF-8 character position accuracy.""" - try: - lines = content.split('\n') - - # Check if positions fall on character boundaries - if position.start_line < len(lines): - start_line_text = lines[position.start_line] - if position.start_column <= len(start_line_text): - # Check UTF-8 character boundary - char_at_pos = start_line_text[:position.start_column].encode('utf-8') - # If we can encode/decode without errors, position is valid - char_at_pos.decode('utf-8') - - if position.end_line < len(lines): - end_line_text = lines[position.end_line] - if position.end_column <= len(end_line_text): - char_at_pos = end_line_text[:position.end_column].encode('utf-8') - char_at_pos.decode('utf-8') - - return True - - except (UnicodeEncodeError, UnicodeDecodeError, IndexError): - logger.warning(f"UTF-8 compliance check failed for position: {position}") - return False - - def validate_position_full(self, position: SCIPPositionInfo, content: str) -> bool: - """Perform full position validation including UTF-8 compliance.""" - return ( - position.validate() and - self._is_within_bounds(position, content) and - self._is_utf8_compliant(position, content) - ) - - def get_position_text(self, content: str, position: SCIPPositionInfo) -> str: - """Extract text at the given position for verification.""" - try: - lines = content.split('\n') - - if position.start_line == position.end_line: - # Single line - line = lines[position.start_line] - return line[position.start_column:position.end_column] - else: - # Multi-line - result_lines = [] - - # First line - result_lines.append(lines[position.start_line][position.start_column:]) - - # Middle lines - for line_num in range(position.start_line + 1, position.end_line): - result_lines.append(lines[line_num]) - - # Last line - result_lines.append(lines[position.end_line][:position.end_column]) - - return '\n'.join(result_lines) - - except IndexError as e: - logger.error(f"Failed to extract text at position {position}: {e}") - return "" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/__init__.py b/src/code_index_mcp/scip/framework/python/__init__.py deleted file mode 100644 index 231b4bd..0000000 --- a/src/code_index_mcp/scip/framework/python/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Python-specific SCIP framework components.""" - -from .factory import PythonSCIPIndexFactory, create_python_scip_factory -from .relationship_extractor import PythonRelationshipExtractor -from .enum_mapper import PythonEnumMapper -from .ast_analyzer import PythonASTAnalyzer - -__all__ = [ - 'PythonSCIPIndexFactory', - 'create_python_scip_factory', - 'PythonRelationshipExtractor', - 'PythonEnumMapper', - 'PythonASTAnalyzer', -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/ast_analyzer.py b/src/code_index_mcp/scip/framework/python/ast_analyzer.py deleted file mode 100644 index a21a427..0000000 --- a/src/code_index_mcp/scip/framework/python/ast_analyzer.py +++ /dev/null @@ -1,312 +0,0 @@ -"""Python AST analyzer implementation.""" - -import ast -from typing import Iterator, Optional, Set, List, Dict, Any -from ..types import SCIPContext -from ..base.language_analyzer import BaseLanguageAnalyzer - - -class PythonASTAnalyzer(BaseLanguageAnalyzer): - """Python AST analyzer for deep code analysis.""" - - def __init__(self): - """Initialize the AST analyzer.""" - self._processed_nodes: Set[int] = set() - self._scope_stack: List[str] = [] - self._imports: Dict[str, str] = {} # alias -> module mapping - - def parse(self, content: str, filename: str = "") -> ast.AST: - """Parse Python source code into AST.""" - try: - return ast.parse(content, filename=filename) - except SyntaxError as e: - raise SyntaxError(f"Python syntax error in {filename}: {e}") - - def walk(self, tree: ast.AST) -> Iterator[ast.AST]: - """Walk AST nodes, avoiding duplicates.""" - for node in ast.walk(tree): - node_id = id(node) - if node_id not in self._processed_nodes: - self._processed_nodes.add(node_id) - yield node - - def is_symbol_definition(self, node: ast.AST) -> bool: - """Check if AST node represents a symbol definition.""" - return isinstance(node, ( - ast.FunctionDef, - ast.AsyncFunctionDef, - ast.ClassDef, - ast.Assign, - ast.AnnAssign, - ast.AugAssign - )) - - def is_symbol_reference(self, node: ast.AST) -> bool: - """Check if AST node represents a symbol reference.""" - return isinstance(node, ( - ast.Name, - ast.Attribute, - ast.Call - )) - - def get_symbol_name(self, node: ast.AST) -> Optional[str]: - """Extract symbol name from AST node.""" - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): - return node.name - elif isinstance(node, ast.Name): - return node.id - elif isinstance(node, ast.Assign): - # Handle simple assignments - if len(node.targets) == 1: - target = node.targets[0] - if isinstance(target, ast.Name): - return target.id - elif isinstance(node, ast.AnnAssign): - if isinstance(node.target, ast.Name): - return node.target.id - elif isinstance(node, ast.AugAssign): - if isinstance(node.target, ast.Name): - return node.target.id - - return None - - def get_node_position(self, node: ast.AST) -> tuple: - """Get position information from AST node.""" - if hasattr(node, 'lineno') and hasattr(node, 'col_offset'): - start_line = node.lineno - 1 # Convert to 0-based - start_col = node.col_offset - - # Try to get end position - if hasattr(node, 'end_lineno') and hasattr(node, 'end_col_offset'): - end_line = node.end_lineno - 1 - end_col = node.end_col_offset - else: - # Estimate end position - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): - name_len = len(node.name) - end_line = start_line - end_col = start_col + name_len - else: - end_line = start_line - end_col = start_col + 1 - - return (start_line, start_col, end_line, end_col) - - return (0, 0, 0, 1) # Default fallback - - def extract_decorators(self, node: ast.AST) -> List[str]: - """Extract decorator names from function or class.""" - decorators = [] - if hasattr(node, 'decorator_list'): - for decorator in node.decorator_list: - if isinstance(decorator, ast.Name): - decorators.append(decorator.id) - elif isinstance(decorator, ast.Attribute): - decorators.append(self._get_attribute_name(decorator)) - elif isinstance(decorator, ast.Call): - if isinstance(decorator.func, ast.Name): - decorators.append(decorator.func.id) - elif isinstance(decorator.func, ast.Attribute): - decorators.append(self._get_attribute_name(decorator.func)) - - return decorators - - def extract_function_arguments(self, node: ast.FunctionDef) -> List[Dict[str, Any]]: - """Extract function argument information.""" - arguments = [] - - # Regular arguments - for arg in node.args.args: - arg_info = { - 'name': arg.arg, - 'type': 'regular', - 'annotation': self._get_annotation_string(arg.annotation) if arg.annotation else None - } - arguments.append(arg_info) - - # *args - if node.args.vararg: - arg_info = { - 'name': node.args.vararg.arg, - 'type': 'vararg', - 'annotation': self._get_annotation_string(node.args.vararg.annotation) if node.args.vararg.annotation else None - } - arguments.append(arg_info) - - # **kwargs - if node.args.kwarg: - arg_info = { - 'name': node.args.kwarg.arg, - 'type': 'kwarg', - 'annotation': self._get_annotation_string(node.args.kwarg.annotation) if node.args.kwarg.annotation else None - } - arguments.append(arg_info) - - # Keyword-only arguments - for arg in node.args.kwonlyargs: - arg_info = { - 'name': arg.arg, - 'type': 'keyword_only', - 'annotation': self._get_annotation_string(arg.annotation) if arg.annotation else None - } - arguments.append(arg_info) - - return arguments - - def extract_class_bases(self, node: ast.ClassDef) -> List[str]: - """Extract base class names.""" - bases = [] - for base in node.bases: - if isinstance(base, ast.Name): - bases.append(base.id) - elif isinstance(base, ast.Attribute): - bases.append(self._get_attribute_name(base)) - - return bases - - def extract_class_methods(self, node: ast.ClassDef) -> List[Dict[str, Any]]: - """Extract class method information.""" - methods = [] - - for child in node.body: - if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)): - method_info = { - 'name': child.name, - 'type': 'async_method' if isinstance(child, ast.AsyncFunctionDef) else 'method', - 'decorators': self.extract_decorators(child), - 'arguments': self.extract_function_arguments(child), - 'is_property': 'property' in self.extract_decorators(child), - 'is_static': 'staticmethod' in self.extract_decorators(child), - 'is_class': 'classmethod' in self.extract_decorators(child), - } - methods.append(method_info) - - return methods - - def extract_imports(self, tree: ast.AST) -> Dict[str, str]: - """Extract import statements and build alias mapping.""" - imports = {} - - for node in ast.walk(tree): - if isinstance(node, ast.Import): - for alias in node.names: - name = alias.asname if alias.asname else alias.name - imports[name] = alias.name - elif isinstance(node, ast.ImportFrom): - if node.module: - for alias in node.names: - name = alias.asname if alias.asname else alias.name - imports[name] = f"{node.module}.{alias.name}" - - return imports - - def analyze_scope_context(self, node: ast.AST, parent_scopes: List[str] = None) -> List[str]: - """Analyze scope context for a node.""" - if parent_scopes is None: - parent_scopes = [] - - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): - return parent_scopes + [node.name] - - return parent_scopes - - def find_variable_assignments(self, tree: ast.AST) -> List[Dict[str, Any]]: - """Find all variable assignments in the AST.""" - assignments = [] - - for node in ast.walk(tree): - if isinstance(node, ast.Assign): - for target in node.targets: - if isinstance(target, ast.Name): - assignment_info = { - 'name': target.id, - 'type': 'assignment', - 'position': self.get_node_position(node), - 'value_type': self._get_value_type(node.value) - } - assignments.append(assignment_info) - elif isinstance(node, ast.AnnAssign): - if isinstance(node.target, ast.Name): - assignment_info = { - 'name': node.target.id, - 'type': 'annotated_assignment', - 'position': self.get_node_position(node), - 'annotation': self._get_annotation_string(node.annotation), - 'value_type': self._get_value_type(node.value) if node.value else None - } - assignments.append(assignment_info) - - return assignments - - def find_function_calls(self, tree: ast.AST) -> List[Dict[str, Any]]: - """Find all function calls in the AST.""" - calls = [] - - for node in ast.walk(tree): - if isinstance(node, ast.Call): - call_info = { - 'function': self._get_call_name(node), - 'position': self.get_node_position(node), - 'args_count': len(node.args), - 'kwargs_count': len(node.keywords) - } - calls.append(call_info) - - return calls - - def _get_attribute_name(self, attr_node: ast.Attribute) -> str: - """Get full attribute name (e.g., module.Class).""" - parts = [] - current = attr_node - - while isinstance(current, ast.Attribute): - parts.append(current.attr) - current = current.value - - if isinstance(current, ast.Name): - parts.append(current.id) - - return ".".join(reversed(parts)) if parts else "" - - def _get_annotation_string(self, annotation: ast.AST) -> str: - """Convert annotation AST to string.""" - if isinstance(annotation, ast.Name): - return annotation.id - elif isinstance(annotation, ast.Attribute): - return self._get_attribute_name(annotation) - elif isinstance(annotation, ast.Constant): - return str(annotation.value) - elif isinstance(annotation, ast.Str): # Python < 3.8 - return annotation.s - else: - return str(type(annotation).__name__) - - def _get_value_type(self, value: ast.AST) -> str: - """Get the type of a value expression.""" - if isinstance(value, ast.Constant): - return type(value.value).__name__ - elif isinstance(value, (ast.Str, ast.Bytes)): # Python < 3.8 - return type(value.s).__name__ - elif isinstance(value, ast.Num): # Python < 3.8 - return type(value.n).__name__ - elif isinstance(value, ast.List): - return "list" - elif isinstance(value, ast.Dict): - return "dict" - elif isinstance(value, ast.Set): - return "set" - elif isinstance(value, ast.Tuple): - return "tuple" - elif isinstance(value, ast.Call): - return self._get_call_name(value) - else: - return "unknown" - - def _get_call_name(self, call_node: ast.Call) -> str: - """Get the name of a function call.""" - if isinstance(call_node.func, ast.Name): - return call_node.func.id - elif isinstance(call_node.func, ast.Attribute): - return self._get_attribute_name(call_node.func) - else: - return "unknown" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/enum_mapper.py b/src/code_index_mcp/scip/framework/python/enum_mapper.py deleted file mode 100644 index 8d679a3..0000000 --- a/src/code_index_mcp/scip/framework/python/enum_mapper.py +++ /dev/null @@ -1,181 +0,0 @@ -"""Python enum mapper implementation.""" - -from ..base.enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 - - -class PythonEnumMapper(BaseEnumMapper): - """Python-specific enum mapper for SCIP compliance.""" - - # Python symbol kind mappings - SYMBOL_KIND_MAP = { - 'function': scip_pb2.Function, - 'async_function': scip_pb2.Function, - 'method': scip_pb2.Method, - 'class': scip_pb2.Class, - 'variable': scip_pb2.Variable, - 'constant': scip_pb2.Constant, - 'module': scip_pb2.Module, - 'parameter': scip_pb2.Parameter, - 'property': scip_pb2.Property, - 'constructor': scip_pb2.Constructor, - 'field': scip_pb2.Field, - 'namespace': scip_pb2.Namespace, - } - - # Python syntax kind mappings (using actual SCIP protobuf attributes) - SYNTAX_KIND_MAP = { - 'function_definition': scip_pb2.IdentifierFunctionDefinition, - 'class_definition': scip_pb2.IdentifierType, - 'variable_definition': scip_pb2.IdentifierLocal, # Use IdentifierLocal instead of IdentifierVariable - 'parameter_definition': scip_pb2.IdentifierParameter, - 'identifier': scip_pb2.Identifier, - 'keyword': scip_pb2.IdentifierKeyword, - 'string_literal': scip_pb2.StringLiteral, - 'numeric_literal': scip_pb2.NumericLiteral, - 'boolean_literal': scip_pb2.BooleanLiteral, - 'comment': scip_pb2.Comment, - 'punctuation': scip_pb2.PunctuationDelimiter, - } - - # Python symbol role mappings (using official SCIP protobuf attributes) - SYMBOL_ROLE_MAP = { - 'definition': scip_pb2.Definition, - 'import': scip_pb2.Import, - 'write': scip_pb2.Write, # Official SCIP naming - 'read': scip_pb2.Read, # Official SCIP naming - 'generated': scip_pb2.Generated, - 'test': scip_pb2.Test, - 'type': scip_pb2.Type, # Add missing Type role - 'reference': scip_pb2.Read, # Default reference is read access - } - - def map_symbol_kind(self, language_kind: str) -> int: - """Map Python symbol type to SCIP SymbolKind.""" - kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SymbolKind'): - raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") - - return kind - - def map_syntax_kind(self, language_syntax: str) -> int: - """Map Python syntax element to SCIP SyntaxKind.""" - kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SyntaxKind'): - raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") - - return kind - - def map_symbol_role(self, language_role: str) -> int: - """Map Python symbol role to SCIP SymbolRole.""" - role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) - - # Validate enum value - if not self.validate_enum_value(role, 'SymbolRole'): - raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") - - return role - - def get_python_node_symbol_kind(self, node_type: str) -> str: - """ - Map Python AST node type to internal symbol kind string. - - Args: - node_type: Python AST node type (e.g., 'FunctionDef', 'ClassDef') - - Returns: - Internal symbol kind string for use with map_symbol_kind() - """ - node_kind_map = { - 'FunctionDef': 'function', - 'AsyncFunctionDef': 'async_function', - 'ClassDef': 'class', - 'Assign': 'variable', - 'AnnAssign': 'variable', - 'AugAssign': 'variable', - 'arg': 'parameter', - 'Import': 'module', - 'ImportFrom': 'module', - } - - return node_kind_map.get(node_type, 'variable') - - def get_python_node_syntax_kind(self, node_type: str, context: str = None) -> str: - """ - Map Python AST node type to internal syntax kind string. - - Args: - node_type: Python AST node type - context: Additional context for disambiguation - - Returns: - Internal syntax kind string for use with map_syntax_kind() - """ - node_syntax_map = { - 'FunctionDef': 'function_definition', - 'AsyncFunctionDef': 'function_definition', - 'ClassDef': 'class_definition', - 'Assign': 'variable_definition', - 'AnnAssign': 'variable_definition', - 'Name': 'identifier', - 'Str': 'string_literal', - 'Num': 'numeric_literal', - 'Constant': 'numeric_literal', # Python 3.8+ - 'NameConstant': 'boolean_literal', # True, False, None - } - - return node_syntax_map.get(node_type, 'identifier') - - def get_python_node_symbol_role(self, node_type: str, context: str = None) -> str: - """ - Map Python AST node type to internal symbol role string. - - Args: - node_type: Python AST node type - context: Additional context (e.g., 'in_assignment', 'in_call') - - Returns: - Internal symbol role string for use with map_symbol_role() - """ - if context == 'definition': - return 'definition' - elif context == 'assignment': - return 'write' - elif context == 'import': - return 'import' - elif node_type in ['FunctionDef', 'AsyncFunctionDef', 'ClassDef']: - return 'definition' - else: - return 'reference' - - def is_valid_python_symbol_kind(self, symbol_kind: str) -> bool: - """Check if symbol kind is valid for Python.""" - return symbol_kind in self.SYMBOL_KIND_MAP - - def is_valid_python_syntax_kind(self, syntax_kind: str) -> bool: - """Check if syntax kind is valid for Python.""" - return syntax_kind in self.SYNTAX_KIND_MAP - - def is_valid_python_symbol_role(self, symbol_role: str) -> bool: - """Check if symbol role is valid for Python.""" - return symbol_role in self.SYMBOL_ROLE_MAP - - def get_python_type_reference_role(self) -> str: - """Get symbol role for type references (e.g., in annotations).""" - return 'type' - - def get_all_python_symbol_kinds(self) -> list: - """Get all available Python symbol kinds.""" - return list(self.SYMBOL_KIND_MAP.keys()) - - def get_all_python_syntax_kinds(self) -> list: - """Get all available Python syntax kinds.""" - return list(self.SYNTAX_KIND_MAP.keys()) - - def get_all_python_symbol_roles(self) -> list: - """Get all available Python symbol roles.""" - return list(self.SYMBOL_ROLE_MAP.keys()) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/factory.py b/src/code_index_mcp/scip/framework/python/factory.py deleted file mode 100644 index 820dbef..0000000 --- a/src/code_index_mcp/scip/framework/python/factory.py +++ /dev/null @@ -1,583 +0,0 @@ -"""Python SCIP Index Factory implementation.""" - -import ast -import os -import logging -from pathlib import Path -from typing import Set, List, Iterator, Optional, Dict -from ..base.index_factory import SCIPIndexFactory -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..base.enum_mapper import BaseEnumMapper -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from ..types import SCIPSymbolContext as SCIPContext, SCIPSymbolDescriptor -from .relationship_extractor import PythonRelationshipExtractor -from .enum_mapper import PythonEnumMapper -from .ast_analyzer import PythonASTAnalyzer -from ...proto import scip_pb2 - -logger = logging.getLogger(__name__) - - -class PythonSCIPIndexFactory(SCIPIndexFactory): - """Python-specific SCIP Index factory implementation with constructor injection.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """Initialize Python factory with required components via constructor injection.""" - super().__init__(project_root, symbol_generator, relationship_extractor, - enum_mapper, position_calculator) - self.ast_analyzer = PythonASTAnalyzer() - self._parsed_trees = {} # Cache parsed AST trees - self._current_file_symbols = set() # Track symbols defined in current file - - def get_language(self) -> str: - """Return language identifier.""" - return "python" - - def get_supported_extensions(self) -> Set[str]: - """Return supported file extensions.""" - return {'.py', '.pyw', '.pyx'} - - def _get_or_parse_tree(self, context: SCIPContext): - """Get cached AST tree or parse if not cached.""" - cache_key = context.file_path - if cache_key not in self._parsed_trees: - try: - self._parsed_trees[cache_key] = self.ast_analyzer.parse(context.content) - except SyntaxError: - self._parsed_trees[cache_key] = None - return self._parsed_trees[cache_key] - - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract Python symbol definitions using AST analysis.""" - tree = self._get_or_parse_tree(context) - if tree is None: - return - - # First pass: collect all defined symbols in this file - self._current_file_symbols.clear() - for node in self.ast_analyzer.walk(tree): - if self.ast_analyzer.is_symbol_definition(node): - symbol_name = self.ast_analyzer.get_symbol_name(node) - if symbol_name: - self._current_file_symbols.add(symbol_name) - - # Clear processed nodes cache for fresh traversal - self.ast_analyzer._processed_nodes.clear() - - for node in self.ast_analyzer.walk(tree): - if self.ast_analyzer.is_symbol_definition(node): - symbol_info = self._create_symbol_from_ast_node(node, context) - if symbol_info: - yield symbol_info - - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract Python symbol occurrences.""" - tree = self._get_or_parse_tree(context) - if tree is None: - return - - # First pass: collect all defined symbols in this file - self._current_file_symbols.clear() - for node in self.ast_analyzer.walk(tree): - if self.ast_analyzer.is_symbol_definition(node): - symbol_name = self.ast_analyzer.get_symbol_name(node) - if symbol_name: - self._current_file_symbols.add(symbol_name) - - # Need to clear processed nodes for occurrence extraction - # Since symbols were already extracted, the cache needs reset - self.ast_analyzer._processed_nodes.clear() - - for node in self.ast_analyzer.walk(tree): - if self.ast_analyzer.is_symbol_definition(node) or self.ast_analyzer.is_symbol_reference(node): - occurrence = self._create_occurrence_from_ast_node(node, context) - if occurrence: - yield occurrence - - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract Python external symbols from imports.""" - external_symbols = [] - - for doc in documents: - # Use cached AST tree if available - need full path for cache key - full_path = os.path.join(self.project_root, doc.relative_path) - cache_key = full_path - tree = self._parsed_trees.get(cache_key) - - if tree is None: - # Only parse if not already cached - try: - content = self._read_file(full_path) - tree = self.ast_analyzer.parse(content) - self._parsed_trees[cache_key] = tree - except (FileNotFoundError, SyntaxError): - continue - - if tree is not None: - for node in self.ast_analyzer.walk(tree): - if isinstance(node, (ast.Import, ast.ImportFrom)): - external_symbol = self._create_external_symbol_from_import(node) - if external_symbol: - external_symbols.append(external_symbol) - continue - - return external_symbols - - def clear_cache(self): - """Clear AST parsing cache.""" - self._parsed_trees.clear() - - def _create_symbol_from_ast_node(self, node: ast.AST, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information from AST node.""" - symbol_info = scip_pb2.SymbolInformation() - - if isinstance(node, ast.FunctionDef): - descriptor = SCIPSymbolDescriptor( - - name=node.name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = node.name - symbol_info.kind = self.enum_mapper.map_symbol_kind('function') - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - symbol_info.documentation.append(docstring) - - elif isinstance(node, ast.AsyncFunctionDef): - descriptor = SCIPSymbolDescriptor( - - name=node.name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = node.name - symbol_info.kind = self.enum_mapper.map_symbol_kind('async_function') - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - symbol_info.documentation.append(docstring) - - elif isinstance(node, ast.ClassDef): - descriptor = SCIPSymbolDescriptor( - - name=node.name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = node.name - symbol_info.kind = self.enum_mapper.map_symbol_kind('class') - - # Add docstring if available - docstring = ast.get_docstring(node) - if docstring: - symbol_info.documentation.append(docstring) - - else: - return None - - return symbol_info - - def _create_occurrence_from_ast_node(self, node: ast.AST, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence from AST node.""" - occurrence = scip_pb2.Occurrence() - - # Calculate position using position calculator - try: - position_info = self.position_calculator.calculate_positions( - context.content, node - ) - - # Set range - occurrence.range.start.extend([position_info.start_line, position_info.start_column]) - occurrence.range.end.extend([position_info.end_line, position_info.end_column]) - - except Exception as e: - # Skip if position calculation fails - return None - - # Set symbol and roles based on node type - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): - descriptor = SCIPSymbolDescriptor( - - name=node.name, - - kind="function", - - scope_path=context.scope_stack, - - descriptor_suffix="()." - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function_definition') - - elif isinstance(node, ast.ClassDef): - descriptor = SCIPSymbolDescriptor( - - name=node.name, - - kind="class", - - scope_path=context.scope_stack, - - descriptor_suffix="#" - - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('class_definition') - - elif isinstance(node, ast.Name): - # Handle variable references - # Check if this is an internal or external symbol - is_internal = node.id in self._current_file_symbols - - if is_internal: - descriptor = SCIPSymbolDescriptor( - name=node.id, - kind="variable", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - else: - # External symbol - use appropriate namespace - # Common Python builtins - if node.id in {'str', 'int', 'float', 'bool', 'list', 'dict', 'set', 'tuple', - 'None', 'True', 'False', 'print', 'len', 'range', 'open'}: - occurrence.symbol = f"python-builtin {node.id}" - else: - # Assume it's from an import or global scope - occurrence.symbol = f"python {node.id}" - - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') - - elif isinstance(node, ast.Call): - # Handle function calls - func_name = self._extract_call_name(node.func) - if func_name: - # Check if this is an internal or external function - is_internal = func_name in self._current_file_symbols - - if is_internal: - # Internal function/method - if isinstance(node.func, ast.Attribute): - # Method call - use method descriptor - descriptor = SCIPSymbolDescriptor( - name=func_name, - kind="method", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - else: - # Function call - descriptor = SCIPSymbolDescriptor( - name=func_name, - kind="function", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - else: - # External function - if func_name in {'print', 'len', 'range', 'open', 'input', 'int', 'str', 'float'}: - occurrence.symbol = f"python-builtin {func_name}()." - else: - occurrence.symbol = f"python {func_name}()." - - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function') - else: - return None - - elif isinstance(node, ast.Attribute): - # Handle attribute access (including method references) - attr_name = node.attr - descriptor = SCIPSymbolDescriptor( - name=attr_name, - kind="variable", # Could be method, property, or field - scope_path=context.scope_stack, - descriptor_suffix="" - ) - - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') - - else: - return None - - return occurrence - - def _create_external_symbol_from_import(self, node: ast.AST) -> Optional[scip_pb2.SymbolInformation]: - """Create external symbol from import statement.""" - symbol_info = scip_pb2.SymbolInformation() - - if isinstance(node, ast.Import): - for alias in node.names: - symbol_info.symbol = f"python-stdlib {alias.name}" - symbol_info.display_name = alias.name - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"Imported module: {alias.name}") - return symbol_info - - elif isinstance(node, ast.ImportFrom): - if node.module: - symbol_info.symbol = f"python-stdlib {node.module}" - symbol_info.display_name = node.module - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"Imported from module: {node.module}") - return symbol_info - - return None - - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build Python-specific cross-document relationships. - - This implementation analyzes Python import statements and creates proper - cross-document relationships using package-qualified symbol names. - """ - logger.info(f"Building Python cross-document relationships for {len(documents)} files") - - # Step 1: Analyze Python imports across all documents - import_mapping = self._analyze_python_imports(documents) - - # Step 2: Build Python-specific symbol registry - symbol_registry = self._build_python_symbol_registry(documents, import_mapping) - - # Step 3: Process cross-document references - relationships_added = self._create_python_cross_document_relationships( - documents, symbol_registry, import_mapping - ) - - logger.info(f"Added {relationships_added} Python cross-document relationships") - return relationships_added - - def _analyze_python_imports(self, documents: List[scip_pb2.Document]) -> Dict[str, Dict[str, str]]: - """ - Analyze Python import statements across all documents. - - Returns: - Dict mapping file_path -> {symbol_name -> full_module_path} - """ - import_mapping = {} - - for doc in documents: - file_imports = {} - - # Get full file path for AST parsing - full_path = os.path.join(self.project_root, doc.relative_path) - cache_key = full_path - - # Use cached AST tree if available - tree = self._parsed_trees.get(cache_key) - if tree is None: - try: - content = self._read_file(full_path) - if content: - tree = self.ast_analyzer.parse(content) - self._parsed_trees[cache_key] = tree - except (FileNotFoundError, SyntaxError): - continue - - if tree is not None: - # Extract import information - for node in self.ast_analyzer.walk(tree): - if isinstance(node, ast.Import): - for alias in node.names: - imported_name = alias.asname if alias.asname else alias.name.split('.')[-1] - file_imports[imported_name] = alias.name - - elif isinstance(node, ast.ImportFrom): - if node.module: - for alias in node.names: - imported_name = alias.asname if alias.asname else alias.name - full_name = f"{node.module}.{alias.name}" - file_imports[imported_name] = full_name - - import_mapping[doc.relative_path] = file_imports - - logger.debug(f"Analyzed imports for {len(import_mapping)} Python files") - return import_mapping - - def _build_python_symbol_registry(self, documents: List[scip_pb2.Document], - import_mapping: Dict[str, Dict[str, str]]) -> Dict[str, tuple]: - """ - Build symbol registry with proper Python package-qualified names. - - Returns: - Dict mapping full_symbol_id -> (document, symbol_info) - """ - symbol_registry = {} - - for doc in documents: - module_path = self._file_path_to_module_path(doc.relative_path) - - for symbol_info in doc.symbols: - local_symbol = symbol_info.symbol - - # Convert local symbol to package-qualified symbol - if local_symbol.startswith('local '): - symbol_name = local_symbol[6:] # Remove 'local ' prefix - - # Create package-qualified symbol - package_symbol = f"python pypi {Path(self.project_root).name} HEAD {module_path}.{symbol_name}" - symbol_registry[package_symbol] = (doc, symbol_info) - - # Also register the local version for backward compatibility - symbol_registry[local_symbol] = (doc, symbol_info) - - logger.debug(f"Built Python symbol registry with {len(symbol_registry)} entries") - return symbol_registry - - def _create_python_cross_document_relationships(self, documents: List[scip_pb2.Document], - symbol_registry: Dict[str, tuple], - import_mapping: Dict[str, Dict[str, str]]) -> int: - """ - Create cross-document relationships based on Python import analysis. - """ - relationships_added = 0 - - for source_doc in documents: - file_imports = import_mapping.get(source_doc.relative_path, {}) - - for occurrence in source_doc.occurrences: - # Skip if not a reference - if not (occurrence.symbol_roles & 8): # ReadAccess - continue - - # Skip if it's also a definition - if occurrence.symbol_roles & 1: # Definition - continue - - # Check if this is a cross-file reference based on imports - symbol_name = self._extract_symbol_name_from_occurrence(occurrence) - if symbol_name in file_imports: - # This is a reference to an imported symbol - target_module = file_imports[symbol_name] - target_symbol_id = f"python pypi {Path(self.project_root).name} HEAD {target_module}" - - target_entry = symbol_registry.get(target_symbol_id) - if target_entry: - target_doc, target_symbol_info = target_entry - - # Find the containing symbol in source document - source_symbol_id = self._find_containing_symbol_in_python(occurrence, source_doc) - - if source_symbol_id and source_symbol_id != target_symbol_id: - # Create relationship - relationship = scip_pb2.Relationship() - relationship.symbol = source_symbol_id - relationship.is_reference = True - - # Check for duplicates - if not any(rel.symbol == source_symbol_id for rel in target_symbol_info.relationships): - target_symbol_info.relationships.append(relationship) - relationships_added += 1 - - return relationships_added - - def _extract_call_name(self, func_node: ast.AST) -> Optional[str]: - """Extract the function name from a Call node's func attribute.""" - if isinstance(func_node, ast.Name): - return func_node.id - elif isinstance(func_node, ast.Attribute): - return func_node.attr - return None - - def _file_path_to_module_path(self, file_path: str) -> str: - """Convert file path to Python module path.""" - # Remove .py extension and convert path separators to dots - module_path = file_path.replace('\\', '/').replace('.py', '').replace('/', '.') - - # Remove common prefixes - if module_path.startswith('src.'): - module_path = module_path[4:] - - return module_path - - def _extract_symbol_name_from_occurrence(self, occurrence: scip_pb2.Occurrence) -> str: - """Extract simple symbol name from SCIP occurrence.""" - symbol = occurrence.symbol - if symbol.startswith('local '): - return symbol[6:].split('.')[0] # Get first part after 'local ' - return symbol.split('.')[-1] # Get last part of qualified name - - def _find_containing_symbol_in_python(self, occurrence: scip_pb2.Occurrence, - document: scip_pb2.Document) -> Optional[str]: - """Find which Python symbol contains this occurrence.""" - if not occurrence.range or not occurrence.range.start: - return None - - occurrence_line = occurrence.range.start[0] if len(occurrence.range.start) > 0 else 0 - - # Find the most specific containing symbol - containing_symbol = None - for symbol_info in document.symbols: - # Simple heuristic: assume we're in the first function/class found - if symbol_info.kind in [11, 3]: # Function or Class - containing_symbol = symbol_info.symbol - break - - return containing_symbol - - -def create_python_scip_factory(project_root: str) -> PythonSCIPIndexFactory: - """ - Factory creator for Python SCIP factory. - Ensures all required components are properly assembled via constructor injection. - """ - symbol_generator = SCIPSymbolGenerator( - scheme="scip-python", - package_manager="local", - package_name=Path(project_root).name, - version="HEAD" - ) - - relationship_extractor = PythonRelationshipExtractor() - enum_mapper = PythonEnumMapper() - position_calculator = SCIPPositionCalculator() - - return PythonSCIPIndexFactory( - project_root=project_root, - symbol_generator=symbol_generator, - relationship_extractor=relationship_extractor, # Guaranteed to be provided - enum_mapper=enum_mapper, - position_calculator=position_calculator - ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/python/relationship_extractor.py b/src/code_index_mcp/scip/framework/python/relationship_extractor.py deleted file mode 100644 index bc6778e..0000000 --- a/src/code_index_mcp/scip/framework/python/relationship_extractor.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Python relationship extractor implementation.""" - -import ast -from typing import Iterator -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..types import SCIPContext, Relationship -from ...core.relationship_types import InternalRelationshipType - - -class PythonRelationshipExtractor(BaseRelationshipExtractor): - """Python-specific relationship extractor using AST analysis.""" - - def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract inheritance relationships from Python classes.""" - try: - tree = ast.parse(context.content) - - for node in ast.walk(tree): - if isinstance(node, ast.ClassDef): - class_symbol_id = self._create_class_symbol_id(node.name, context) - - # Extract base classes - for base in node.bases: - if isinstance(base, ast.Name): - parent_symbol_id = self._create_class_symbol_id(base.id, context) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - elif isinstance(base, ast.Attribute): - # Handle module.ClassName inheritance - parent_name = self._get_attribute_name(base) - if parent_name: - parent_symbol_id = self._create_class_symbol_id(parent_name, context) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=parent_symbol_id, - relationship_type=InternalRelationshipType.INHERITS - ) - - except SyntaxError: - # Skip files with syntax errors - return - - def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract function/method call relationships.""" - try: - tree = ast.parse(context.content) - - for node in ast.walk(tree): - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): - function_symbol_id = self._create_function_symbol_id(node.name, context) - - # Find function calls within this function - for child in ast.walk(node): - if isinstance(child, ast.Call): - target_function = self._extract_call_target(child) - if target_function: - target_symbol_id = self._create_function_symbol_id(target_function, context) - yield Relationship( - source_symbol=function_symbol_id, - target_symbol=target_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - - except SyntaxError: - # Skip files with syntax errors - return - - def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract import/dependency relationships.""" - try: - tree = ast.parse(context.content) - - for node in ast.walk(tree): - if isinstance(node, ast.Import): - for alias in node.names: - module_symbol_id = f"python-stdlib {alias.name}" - file_symbol_id = self._create_file_symbol_id(context.file_path) - - yield Relationship( - source_symbol=file_symbol_id, - target_symbol=module_symbol_id, - relationship_type=InternalRelationshipType.IMPORTS - ) - - elif isinstance(node, ast.ImportFrom): - if node.module: - module_symbol_id = f"python-stdlib {node.module}" - file_symbol_id = self._create_file_symbol_id(context.file_path) - - yield Relationship( - source_symbol=file_symbol_id, - target_symbol=module_symbol_id, - relationship_type=InternalRelationshipType.IMPORTS - ) - - except SyntaxError: - # Skip files with syntax errors - return - - def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract composition relationships (class attributes).""" - try: - tree = ast.parse(context.content) - - for node in ast.walk(tree): - if isinstance(node, ast.ClassDef): - class_symbol_id = self._create_class_symbol_id(node.name, context) - - # Find attribute assignments in __init__ method - for child in ast.walk(node): - if isinstance(child, ast.FunctionDef) and child.name == "__init__": - for assign_node in ast.walk(child): - if isinstance(assign_node, ast.Assign): - for target in assign_node.targets: - if isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name) and target.value.id == "self": - # This is a self.attribute assignment - attribute_symbol_id = self._create_attribute_symbol_id(target.attr, class_symbol_id) - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=attribute_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except SyntaxError: - # Skip files with syntax errors - return - - def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract interface relationships (protocols, abstract base classes).""" - try: - tree = ast.parse(context.content) - - for node in ast.walk(tree): - if isinstance(node, ast.ClassDef): - class_symbol_id = self._create_class_symbol_id(node.name, context) - - # Check for abstract methods (indicating interface-like behavior) - has_abstract_methods = False - for child in ast.walk(node): - if isinstance(child, ast.FunctionDef): - # Check for @abstractmethod decorator - for decorator in child.decorator_list: - if isinstance(decorator, ast.Name) and decorator.id == "abstractmethod": - has_abstract_methods = True - break - - if has_abstract_methods: - # This class implements an interface pattern - interface_symbol_id = f"{class_symbol_id}_interface" - yield Relationship( - source_symbol=class_symbol_id, - target_symbol=interface_symbol_id, - relationship_type=InternalRelationshipType.IMPLEMENTS - ) - - except SyntaxError: - # Skip files with syntax errors - return - - def _create_class_symbol_id(self, class_name: str, context: SCIPContext) -> str: - """Create symbol ID for class.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{class_name}" if scope_path else class_name - return f"local {local_id}#" - - def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: - """Create symbol ID for function.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{function_name}" if scope_path else function_name - return f"local {local_id}()." - - def _create_attribute_symbol_id(self, attribute_name: str, class_symbol_id: str) -> str: - """Create symbol ID for class attribute.""" - # Extract class name from class symbol ID - class_name = class_symbol_id.replace("local ", "").replace("#", "") - return f"local {class_name}.{attribute_name}" - - def _create_file_symbol_id(self, file_path: str) -> str: - """Create symbol ID for file.""" - return f"local {file_path}" - - def _extract_call_target(self, call_node: ast.Call) -> str: - """Extract the target function name from a call node.""" - if isinstance(call_node.func, ast.Name): - return call_node.func.id - elif isinstance(call_node.func, ast.Attribute): - return call_node.func.attr - return None - - def _get_attribute_name(self, attr_node: ast.Attribute) -> str: - """Get the full attribute name (e.g., module.Class).""" - parts = [] - current = attr_node - - while isinstance(current, ast.Attribute): - parts.append(current.attr) - current = current.value - - if isinstance(current, ast.Name): - parts.append(current.id) - - return ".".join(reversed(parts)) if parts else None \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/relationship_manager.py b/src/code_index_mcp/scip/framework/relationship_manager.py deleted file mode 100644 index 5aef27f..0000000 --- a/src/code_index_mcp/scip/framework/relationship_manager.py +++ /dev/null @@ -1,406 +0,0 @@ -"""SCIP Relationship Manager - Comprehensive symbol relationship extraction and management.""" - -import logging -from typing import Dict, List, Set, Tuple, Optional, Any -from enum import Enum -from dataclasses import dataclass - -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -class RelationshipType(Enum): - """Standard relationship types for symbol analysis.""" - INHERITANCE = "inheritance" - IMPLEMENTATION = "implementation" - COMPOSITION = "composition" - DEPENDENCY = "dependency" - CALL = "call" - IMPORT = "import" - REFERENCE = "reference" - TYPE_DEFINITION = "type_definition" - OVERRIDE = "override" - INSTANTIATION = "instantiation" - - -@dataclass(frozen=True) -class SymbolRelationship: - """Immutable symbol relationship representation.""" - source_symbol: str - target_symbol: str - relationship_type: RelationshipType - confidence: float = 1.0 - source_location: Optional[str] = None - additional_info: Optional[Dict[str, Any]] = None - - -class SCIPRelationshipManager: - """ - Comprehensive relationship manager for SCIP symbol relationships. - - This manager handles the extraction, validation, and conversion of symbol - relationships to SCIP format, ensuring complete relationship networks. - """ - - def __init__(self): - """Initialize the relationship manager.""" - self._relationships: Dict[str, List[SymbolRelationship]] = {} - self._reverse_relationships: Dict[str, List[SymbolRelationship]] = {} - self._relationship_count_by_type: Dict[RelationshipType, int] = {} - - # Initialize counters - for rel_type in RelationshipType: - self._relationship_count_by_type[rel_type] = 0 - - logger.debug("Initialized SCIP Relationship Manager") - - def add_relationship(self, - source_symbol: str, - target_symbol: str, - relationship_type: RelationshipType, - confidence: float = 1.0, - source_location: Optional[str] = None, - additional_info: Optional[Dict[str, Any]] = None) -> None: - """ - Add a symbol relationship to the manager. - - Args: - source_symbol: Source symbol ID - target_symbol: Target symbol ID - relationship_type: Type of relationship - confidence: Confidence level (0.0-1.0) - source_location: Location where relationship was detected - additional_info: Additional metadata about the relationship - """ - if not self._validate_symbol_id(source_symbol): - logger.warning(f"Invalid source symbol ID: {source_symbol}") - return - - if not self._validate_symbol_id(target_symbol): - logger.warning(f"Invalid target symbol ID: {target_symbol}") - return - - if not 0.0 <= confidence <= 1.0: - logger.warning(f"Invalid confidence value: {confidence}, setting to 1.0") - confidence = 1.0 - - relationship = SymbolRelationship( - source_symbol=source_symbol, - target_symbol=target_symbol, - relationship_type=relationship_type, - confidence=confidence, - source_location=source_location, - additional_info=additional_info or {} - ) - - # Add to forward relationships - if source_symbol not in self._relationships: - self._relationships[source_symbol] = [] - - # Check for duplicates - existing = [r for r in self._relationships[source_symbol] - if r.target_symbol == target_symbol and r.relationship_type == relationship_type] - if existing: - logger.debug(f"Duplicate relationship ignored: {source_symbol} -> {target_symbol} ({relationship_type})") - return - - self._relationships[source_symbol].append(relationship) - - # Add to reverse relationships - if target_symbol not in self._reverse_relationships: - self._reverse_relationships[target_symbol] = [] - self._reverse_relationships[target_symbol].append(relationship) - - # Update counters - self._relationship_count_by_type[relationship_type] += 1 - - logger.debug(f"Added relationship: {source_symbol} --{relationship_type.value}--> {target_symbol}") - - def get_relationships(self, symbol_id: str) -> List[SymbolRelationship]: - """ - Get all outgoing relationships for a symbol. - - Args: - symbol_id: Symbol ID to get relationships for - - Returns: - List of relationships where symbol is the source - """ - return self._relationships.get(symbol_id, []) - - def get_reverse_relationships(self, symbol_id: str) -> List[SymbolRelationship]: - """ - Get all incoming relationships for a symbol. - - Args: - symbol_id: Symbol ID to get incoming relationships for - - Returns: - List of relationships where symbol is the target - """ - return self._reverse_relationships.get(symbol_id, []) - - def get_relationships_by_type(self, - symbol_id: str, - relationship_type: RelationshipType) -> List[SymbolRelationship]: - """ - Get relationships of a specific type for a symbol. - - Args: - symbol_id: Symbol ID - relationship_type: Type of relationship to filter by - - Returns: - List of relationships of the specified type - """ - all_relationships = self.get_relationships(symbol_id) - return [r for r in all_relationships if r.relationship_type == relationship_type] - - def has_relationship(self, - source_symbol: str, - target_symbol: str, - relationship_type: Optional[RelationshipType] = None) -> bool: - """ - Check if a relationship exists between two symbols. - - Args: - source_symbol: Source symbol ID - target_symbol: Target symbol ID - relationship_type: Optional specific relationship type to check - - Returns: - True if relationship exists - """ - relationships = self.get_relationships(source_symbol) - - for rel in relationships: - if rel.target_symbol == target_symbol: - if relationship_type is None or rel.relationship_type == relationship_type: - return True - - return False - - def get_inheritance_chain(self, symbol_id: str) -> List[str]: - """ - Get the complete inheritance chain for a symbol. - - Args: - symbol_id: Symbol ID to get inheritance chain for - - Returns: - List of symbol IDs in inheritance order (immediate parent first) - """ - chain = [] - visited = set() - current = symbol_id - - while current and current not in visited: - visited.add(current) - inheritance_rels = self.get_relationships_by_type(current, RelationshipType.INHERITANCE) - - if inheritance_rels: - # Take the first inheritance relationship - parent = inheritance_rels[0].target_symbol - chain.append(parent) - current = parent - else: - break - - return chain - - def get_call_graph(self, symbol_id: str, max_depth: int = 5) -> Dict[str, List[str]]: - """ - Get the call graph for a symbol (what it calls). - - Args: - symbol_id: Symbol ID to get call graph for - max_depth: Maximum depth to traverse - - Returns: - Dictionary mapping symbol IDs to their called functions - """ - call_graph = {} - visited = set() - - def traverse(current_symbol: str, depth: int): - if depth >= max_depth or current_symbol in visited: - return - - visited.add(current_symbol) - call_relationships = self.get_relationships_by_type(current_symbol, RelationshipType.CALL) - - if call_relationships: - called_symbols = [r.target_symbol for r in call_relationships] - call_graph[current_symbol] = called_symbols - - # Recursively traverse called functions - for called_symbol in called_symbols: - traverse(called_symbol, depth + 1) - - traverse(symbol_id, 0) - return call_graph - - def get_dependency_graph(self, symbol_id: str) -> Dict[str, List[str]]: - """ - Get the dependency graph for a symbol. - - Args: - symbol_id: Symbol ID to get dependencies for - - Returns: - Dictionary mapping symbol to its dependencies - """ - dependency_rels = self.get_relationships_by_type(symbol_id, RelationshipType.DEPENDENCY) - import_rels = self.get_relationships_by_type(symbol_id, RelationshipType.IMPORT) - - dependencies = [] - dependencies.extend([r.target_symbol for r in dependency_rels]) - dependencies.extend([r.target_symbol for r in import_rels]) - - return {symbol_id: dependencies} if dependencies else {} - - def convert_to_scip_relationships(self, symbol_id: str) -> List[scip_pb2.Relationship]: - """ - Convert symbol relationships to SCIP Relationship objects. - - Args: - symbol_id: Symbol ID to convert relationships for - - Returns: - List of SCIP Relationship objects - """ - relationships = self.get_relationships(symbol_id) - scip_relationships = [] - - for rel in relationships: - scip_rel = scip_pb2.Relationship() - scip_rel.symbol = rel.target_symbol - - # Map relationship types to SCIP boolean flags - if rel.relationship_type == RelationshipType.REFERENCE: - scip_rel.is_reference = True - elif rel.relationship_type == RelationshipType.IMPLEMENTATION: - scip_rel.is_implementation = True - elif rel.relationship_type == RelationshipType.TYPE_DEFINITION: - scip_rel.is_type_definition = True - elif rel.relationship_type == RelationshipType.INHERITANCE: - scip_rel.is_definition = True # Inheritance implies definition relationship - else: - # For other relationship types, mark as reference - scip_rel.is_reference = True - - scip_relationships.append(scip_rel) - - return scip_relationships - - def add_inheritance_relationship(self, child_symbol: str, parent_symbol: str, - confidence: float = 1.0, source_location: Optional[str] = None): - """Add an inheritance relationship (child inherits from parent).""" - self.add_relationship( - child_symbol, parent_symbol, RelationshipType.INHERITANCE, - confidence=confidence, source_location=source_location, - additional_info={"relationship_description": f"{child_symbol} inherits from {parent_symbol}"} - ) - - def add_call_relationship(self, caller_symbol: str, callee_symbol: str, - confidence: float = 1.0, source_location: Optional[str] = None): - """Add a call relationship (caller calls callee).""" - self.add_relationship( - caller_symbol, callee_symbol, RelationshipType.CALL, - confidence=confidence, source_location=source_location, - additional_info={"relationship_description": f"{caller_symbol} calls {callee_symbol}"} - ) - - def add_import_relationship(self, importer_symbol: str, imported_symbol: str, - confidence: float = 1.0, source_location: Optional[str] = None): - """Add an import relationship (importer imports imported).""" - self.add_relationship( - importer_symbol, imported_symbol, RelationshipType.IMPORT, - confidence=confidence, source_location=source_location, - additional_info={"relationship_description": f"{importer_symbol} imports {imported_symbol}"} - ) - - def add_composition_relationship(self, composite_symbol: str, component_symbol: str, - confidence: float = 1.0, source_location: Optional[str] = None): - """Add a composition relationship (composite contains component).""" - self.add_relationship( - composite_symbol, component_symbol, RelationshipType.COMPOSITION, - confidence=confidence, source_location=source_location, - additional_info={"relationship_description": f"{composite_symbol} contains {component_symbol}"} - ) - - def get_statistics(self) -> Dict[str, Any]: - """Get comprehensive statistics about relationships.""" - total_symbols_with_relationships = len(self._relationships) - total_relationships = sum(len(rels) for rels in self._relationships.values()) - - return { - "total_symbols_with_relationships": total_symbols_with_relationships, - "total_relationships": total_relationships, - "relationships_by_type": dict(self._relationship_count_by_type), - "average_relationships_per_symbol": ( - total_relationships / total_symbols_with_relationships - if total_symbols_with_relationships > 0 else 0 - ), - "symbols_with_incoming_relationships": len(self._reverse_relationships) - } - - def validate_relationship_integrity(self) -> List[str]: - """ - Validate the integrity of all relationships. - - Returns: - List of validation warnings/errors - """ - issues = [] - - # Check for circular inheritance - for symbol_id in self._relationships: - chain = self.get_inheritance_chain(symbol_id) - if symbol_id in chain: - issues.append(f"Circular inheritance detected for {symbol_id}") - - # Check for self-references (except for specific types) - for symbol_id, relationships in self._relationships.items(): - for rel in relationships: - if rel.target_symbol == symbol_id and rel.relationship_type not in [ - RelationshipType.REFERENCE, RelationshipType.CALL - ]: - issues.append(f"Self-reference detected: {symbol_id} -> {rel.relationship_type.value}") - - # Check confidence levels - for relationships in self._relationships.values(): - for rel in relationships: - if rel.confidence < 0.5: - issues.append(f"Low confidence relationship: {rel.source_symbol} -> {rel.target_symbol} ({rel.confidence})") - - return issues - - def _validate_symbol_id(self, symbol_id: str) -> bool: - """Validate symbol ID format.""" - return bool(symbol_id and isinstance(symbol_id, str) and len(symbol_id.strip()) > 0) - - def clear(self): - """Clear all relationships.""" - self._relationships.clear() - self._reverse_relationships.clear() - for rel_type in RelationshipType: - self._relationship_count_by_type[rel_type] = 0 - logger.debug("Cleared all relationships") - - def export_relationships(self) -> Dict[str, Any]: - """Export all relationships for serialization.""" - exported = {} - for symbol_id, relationships in self._relationships.items(): - exported[symbol_id] = [] - for rel in relationships: - exported[symbol_id].append({ - "target_symbol": rel.target_symbol, - "relationship_type": rel.relationship_type.value, - "confidence": rel.confidence, - "source_location": rel.source_location, - "additional_info": rel.additional_info - }) - return exported \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/standard_framework.py b/src/code_index_mcp/scip/framework/standard_framework.py deleted file mode 100644 index f8c7605..0000000 --- a/src/code_index_mcp/scip/framework/standard_framework.py +++ /dev/null @@ -1,354 +0,0 @@ -"""SCIP Standard Framework - SCIP standard framework enforcing compliance.""" - -import logging -import os -from abc import ABC, abstractmethod -from pathlib import Path -from typing import List, Optional, Dict, Any - -from .types import SCIPSymbolContext, SCIPSymbolExtractor -from .symbol_generator import SCIPSymbolGenerator -from .position_calculator import SCIPPositionCalculator -from .base.enum_mapper import BaseEnumMapper -from .compliance_validator import SCIPComplianceValidator -from ..proto import scip_pb2 - - -logger = logging.getLogger(__name__) - - -class SCIPStandardFramework(ABC): - """SCIP standard framework - enforces compliance across all language strategies.""" - - def __init__(self, language: str, project_root: str, version: str = "HEAD"): - """ - Initialize SCIP standard framework. - - Args: - language: Programming language (e.g., 'python', 'javascript') - project_root: Absolute path to project root - version: Project version for symbol generation - """ - self.language = language.lower() - self.project_root = Path(project_root).resolve() - self.version = version - - # Core components - mandatory initialization - self._symbol_generator = self._create_symbol_generator() - self._position_calculator = SCIPPositionCalculator() - self._enum_mapper = self._create_enum_mapper() - self._validator = SCIPComplianceValidator() - - logger.debug(f"Initialized SCIP framework for {language} project: {self.project_root.name}") - - def _create_symbol_generator(self) -> SCIPSymbolGenerator: - """Create SCIP standard symbol generator.""" - return SCIPSymbolGenerator( - scheme=f"scip-{self.language}", - package_manager="local", - package_name=self.project_root.name, - version=self.version - ) - - @abstractmethod - def _create_enum_mapper(self) -> BaseEnumMapper: - """Subclasses must implement language-specific enum mapping.""" - raise NotImplementedError("Subclasses must implement _create_enum_mapper") - - def process_file(self, file_path: str, extractor: SCIPSymbolExtractor) -> scip_pb2.Document: - """ - Standardized file processing pipeline - enforces compliance. - - Args: - file_path: Path to file to process - extractor: Symbol extractor implementation - - Returns: - SCIP Document with full compliance validation - - Raises: - ValueError: If input validation fails - RuntimeError: If processing fails or compliance validation fails - """ - - # 1. Validate input - self._validate_file_input(file_path) - - # 2. Create document base structure - document = self._create_document_base(file_path) - - # 3. Read content and create context - content = self._read_file_safe(file_path) - context = SCIPSymbolContext( - file_path=file_path, - content=content, - scope_stack=[], - imports={} - ) - - # 4. Extract symbols and generate SCIP elements - occurrences, symbols = self._extract_scip_elements(context, extractor) - - # 5. Validate and add to document - document.occurrences.extend(self._validate_occurrences(occurrences)) - document.symbols.extend(self._validate_symbols(symbols)) - - # 6. Final compliance check - if not self._validator.validate_document(document): - validation_summary = self._validator.get_validation_summary() - raise RuntimeError(f"Document failed SCIP compliance validation: {validation_summary['error_messages']}") - - logger.debug(f"Successfully processed {file_path} with {len(document.occurrences)} occurrences and {len(document.symbols)} symbols") - return document - - def process_files(self, file_paths: List[str], extractors: Dict[str, SCIPSymbolExtractor]) -> List[scip_pb2.Document]: - """ - Process multiple files with appropriate extractors. - - Args: - file_paths: List of file paths to process - extractors: Mapping of file extensions to extractors - - Returns: - List of SCIP documents - """ - documents = [] - - for file_path in file_paths: - try: - # Determine appropriate extractor - file_ext = Path(file_path).suffix.lower() - extractor = extractors.get(file_ext) - - if not extractor: - logger.warning(f"No extractor available for {file_ext}, skipping {file_path}") - continue - - # Process file - document = self.process_file(file_path, extractor) - documents.append(document) - - except Exception as e: - logger.error(f"Failed to process {file_path}: {e}") - # Continue processing other files - continue - - logger.info(f"Processed {len(documents)} files successfully out of {len(file_paths)} total") - return documents - - def create_complete_index(self, file_paths: List[str], extractors: Dict[str, SCIPSymbolExtractor]) -> scip_pb2.Index: - """ - Create complete SCIP index with all 6 essential content categories. - - Args: - file_paths: List of file paths to index - extractors: Mapping of file extensions to extractors - - Returns: - Complete SCIP Index - """ - index = scip_pb2.Index() - - # 1. Create metadata (Category 1) - index.metadata.CopyFrom(self._create_metadata()) - - # 2. Process all documents (Category 2) - documents = self.process_files(file_paths, extractors) - index.documents.extend(documents) - - # 3. Extract external symbols (Category 6) - external_symbols = self._extract_external_symbols(documents) - index.external_symbols.extend(external_symbols) - - # 4. Validate complete index - if not self._validator.validate_index(index): - validation_summary = self._validator.get_validation_summary() - raise RuntimeError(f"Index failed SCIP compliance validation: {validation_summary['error_messages']}") - - logger.info(f"Created complete SCIP index with {len(documents)} documents and {len(external_symbols)} external symbols") - return index - - def _validate_file_input(self, file_path: str) -> None: - """Validate file input parameters.""" - if not file_path: - raise ValueError("File path cannot be empty") - - path = Path(file_path) - if not path.exists(): - raise ValueError(f"File does not exist: {file_path}") - - if not path.is_file(): - raise ValueError(f"Path is not a file: {file_path}") - - def _create_document_base(self, file_path: str) -> scip_pb2.Document: - """Create base document structure.""" - document = scip_pb2.Document() - - # Set relative path from project root - try: - relative_path = Path(file_path).relative_to(self.project_root) - document.relative_path = str(relative_path).replace('\\', '/') - except ValueError: - # File is outside project root, use absolute path - document.relative_path = str(Path(file_path)).replace('\\', '/') - - document.language = self.language - - return document - - def _create_metadata(self) -> scip_pb2.Metadata: - """Create SCIP metadata with standard compliance.""" - metadata = scip_pb2.Metadata() - metadata.version = scip_pb2.ProtocolVersion.UnspecifiedProtocolVersion - - # Tool information - metadata.tool_info.name = "code-index-mcp" - metadata.tool_info.version = "2.1.1" - metadata.tool_info.arguments.extend(["scip-indexing", self.language]) - - # Project information - metadata.project_root = str(self.project_root) - metadata.text_document_encoding = scip_pb2.UTF8 - - return metadata - - def _read_file_safe(self, file_path: str) -> str: - """Read file content with encoding detection.""" - encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] - - for encoding in encodings: - try: - with open(file_path, 'r', encoding=encoding) as f: - return f.read() - except UnicodeDecodeError: - continue - - raise RuntimeError(f"Could not decode {file_path} with any supported encoding") - - def _extract_scip_elements(self, context: SCIPSymbolContext, extractor: SCIPSymbolExtractor) -> tuple: - """Extract SCIP elements using provided extractor.""" - occurrences = [] - symbols = [] - - try: - # Extract symbol definitions - for symbol_desc in extractor.extract_symbols(context): - try: - # Create SCIP symbol - symbol_id = self._symbol_generator.create_local_symbol(symbol_desc) - - # Map to SCIP enums - symbol_kind = self._enum_mapper.validate_and_map_symbol_kind(symbol_desc.kind) - - # Create symbol information - symbol_info = scip_pb2.SymbolInformation() - symbol_info.symbol = symbol_id - symbol_info.display_name = symbol_desc.name - symbol_info.kind = symbol_kind - - symbols.append(symbol_info) - - except Exception as e: - logger.warning(f"Failed to create symbol for {symbol_desc.name}: {e}") - continue - - # Extract symbol references - for symbol_desc, position_info in extractor.extract_references(context): - try: - # Create SCIP symbol ID - symbol_id = self._symbol_generator.create_local_symbol(symbol_desc) - - # Create SCIP range - range_obj = scip_pb2.Range() - range_obj.start.extend([position_info.start_line, position_info.start_column]) - range_obj.end.extend([position_info.end_line, position_info.end_column]) - - # Map to SCIP enums - symbol_role = self._enum_mapper.validate_and_map_symbol_role("reference") - syntax_kind = self._enum_mapper.validate_and_map_syntax_kind("identifier") - - # Create occurrence - occurrence = scip_pb2.Occurrence() - occurrence.symbol = symbol_id - occurrence.symbol_roles = symbol_role - occurrence.syntax_kind = syntax_kind - occurrence.range.CopyFrom(range_obj) - - occurrences.append(occurrence) - - except Exception as e: - logger.warning(f"Failed to create occurrence for {symbol_desc.name}: {e}") - continue - - except Exception as e: - logger.error(f"Symbol extraction failed: {e}") - raise RuntimeError(f"Failed to extract symbols: {e}") - - return occurrences, symbols - - def _validate_occurrences(self, occurrences: List[scip_pb2.Occurrence]) -> List[scip_pb2.Occurrence]: - """Validate occurrences for SCIP compliance.""" - validated = [] - - for occurrence in occurrences: - try: - # Validate symbol ID - if not self._validator.validate_symbol_id(occurrence.symbol): - logger.warning(f"Invalid symbol ID in occurrence: {occurrence.symbol}") - continue - - # Basic validation passed - validated.append(occurrence) - - except Exception as e: - logger.warning(f"Occurrence validation failed: {e}") - continue - - logger.debug(f"Validated {len(validated)} out of {len(occurrences)} occurrences") - return validated - - def _validate_symbols(self, symbols: List[scip_pb2.SymbolInformation]) -> List[scip_pb2.SymbolInformation]: - """Validate symbols for SCIP compliance.""" - validated = [] - - for symbol in symbols: - try: - # Validate symbol ID - if not self._validator.validate_symbol_id(symbol.symbol): - logger.warning(f"Invalid symbol ID in symbol info: {symbol.symbol}") - continue - - # Basic validation passed - validated.append(symbol) - - except Exception as e: - logger.warning(f"Symbol validation failed: {e}") - continue - - logger.debug(f"Validated {len(validated)} out of {len(symbols)} symbols") - return validated - - def _extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract external symbols from processed documents.""" - external_symbols = [] - - # This is a placeholder implementation - # Subclasses should implement language-specific external symbol extraction - # based on import statements and dependencies - - return external_symbols - - def get_framework_info(self) -> dict: - """Get information about this framework instance.""" - return { - 'language': self.language, - 'project_root': str(self.project_root), - 'project_name': self.project_root.name, - 'version': self.version, - 'components': { - 'symbol_generator': self._symbol_generator.get_generator_info(), - 'enum_mapper': self._enum_mapper.get_enum_info(), - 'position_calculator': True, - 'compliance_validator': True - } - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/streaming_indexer.py b/src/code_index_mcp/scip/framework/streaming_indexer.py deleted file mode 100644 index b1b26d8..0000000 --- a/src/code_index_mcp/scip/framework/streaming_indexer.py +++ /dev/null @@ -1,429 +0,0 @@ -"""SCIP Streaming Indexer - Incremental and streaming index generation for large codebases.""" - -import logging -import json -import os -import time -from typing import Dict, List, Optional, Iterator, Callable, Any, Set -from dataclasses import dataclass, asdict -from pathlib import Path -from concurrent.futures import ThreadPoolExecutor, Future -from queue import Queue, Empty -import threading - -from .caching_system import SCIPCacheManager, BatchProcessor -from .index_factory import SCIPIndexFactory -from ..proto import scip_pb2 - -logger = logging.getLogger(__name__) - - -@dataclass -class IndexingProgress: - """Progress tracking for streaming indexing.""" - total_files: int - processed_files: int - failed_files: int - start_time: float - current_file: Optional[str] = None - error_messages: List[str] = None - - def __post_init__(self): - if self.error_messages is None: - self.error_messages = [] - - @property - def progress_percentage(self) -> float: - """Calculate progress percentage.""" - if self.total_files == 0: - return 100.0 - return (self.processed_files / self.total_files) * 100.0 - - @property - def elapsed_time(self) -> float: - """Get elapsed processing time.""" - return time.time() - self.start_time - - @property - def estimated_remaining_time(self) -> float: - """Estimate remaining processing time.""" - if self.processed_files == 0: - return 0.0 - - avg_time_per_file = self.elapsed_time / self.processed_files - remaining_files = self.total_files - self.processed_files - return avg_time_per_file * remaining_files - - -class StreamingIndexer: - """Streaming SCIP indexer for incremental and large-scale indexing.""" - - def __init__(self, - factory: SCIPIndexFactory, - cache_manager: Optional[SCIPCacheManager] = None, - max_workers: int = 4, - chunk_size: int = 100): - """Initialize streaming indexer.""" - self.factory = factory - self.cache_manager = cache_manager or SCIPCacheManager() - self.max_workers = max_workers - self.chunk_size = chunk_size - - # Progress tracking - self._progress: Optional[IndexingProgress] = None - self._progress_callbacks: List[Callable[[IndexingProgress], None]] = [] - - # Threading - self._stop_event = threading.Event() - self._executor: Optional[ThreadPoolExecutor] = None - - # Results queue for streaming output - self._results_queue: Queue = Queue() - - logger.debug(f"Initialized streaming indexer with {max_workers} workers") - - def add_progress_callback(self, callback: Callable[[IndexingProgress], None]) -> None: - """Add progress callback for monitoring.""" - self._progress_callbacks.append(callback) - - def index_files_streaming(self, - file_paths: List[str], - output_callback: Optional[Callable[[scip_pb2.Document], None]] = None - ) -> Iterator[scip_pb2.Document]: - """Stream index generation for files.""" - self._progress = IndexingProgress( - total_files=len(file_paths), - processed_files=0, - failed_files=0, - start_time=time.time() - ) - - # Start processing - self._executor = ThreadPoolExecutor(max_workers=self.max_workers) - - try: - # Submit files in chunks - for chunk_start in range(0, len(file_paths), self.chunk_size): - if self._stop_event.is_set(): - break - - chunk_end = min(chunk_start + self.chunk_size, len(file_paths)) - chunk_files = file_paths[chunk_start:chunk_end] - - # Submit chunk for processing - future = self._executor.submit(self._process_file_chunk, chunk_files) - - # Process results as they become available - try: - chunk_results = future.result(timeout=300) # 5 minute timeout per chunk - - for document in chunk_results: - if output_callback: - output_callback(document) - yield document - - # Update progress - self._progress.processed_files += 1 - self._notify_progress() - - except Exception as e: - logger.error(f"Chunk processing failed: {e}") - self._progress.failed_files += len(chunk_files) - self._progress.error_messages.append(str(e)) - self._notify_progress() - - finally: - if self._executor: - self._executor.shutdown(wait=True) - - logger.info(f"Streaming indexing completed. Processed: {self._progress.processed_files}, " - f"Failed: {self._progress.failed_files}") - - def create_incremental_index(self, - modified_files: List[str], - existing_index: Optional[scip_pb2.Index] = None - ) -> scip_pb2.Index: - """Create incremental index for modified files.""" - logger.info(f"Creating incremental index for {len(modified_files)} modified files") - - # Start with existing index or create new one - if existing_index: - updated_index = scip_pb2.Index() - updated_index.CopyFrom(existing_index) - else: - updated_index = scip_pb2.Index() - updated_index.metadata.CopyFrom(self.factory.create_metadata(self.factory.project_root)) - - # Track existing documents by path for replacement - existing_docs_by_path = {doc.relative_path: doc for doc in updated_index.documents} - - # Process modified files - new_documents = [] - for file_path in modified_files: - try: - # Check cache first - cached_doc = self.cache_manager.get_document_cache(file_path) - if cached_doc: - new_documents.append(cached_doc) - logger.debug(f"Using cached document for {file_path}") - continue - - # Read and process file - content = self._read_file(file_path) - if content is None: - logger.warning(f"Could not read file: {file_path}") - continue - - # Create new document - document = self.factory.create_document(file_path, content) - new_documents.append(document) - - # Cache the document - self.cache_manager.cache_document(file_path, document) - - except Exception as e: - logger.error(f"Failed to process {file_path}: {e}") - continue - - # Replace or add documents in the index - updated_documents = [] - relative_paths_processed = set() - - for doc in new_documents: - updated_documents.append(doc) - relative_paths_processed.add(doc.relative_path) - - # Add unchanged documents from existing index - if existing_index: - for doc in existing_index.documents: - if doc.relative_path not in relative_paths_processed: - updated_documents.append(doc) - - # Update the index - updated_index.documents[:] = updated_documents - - # Extract external symbols from all documents - external_symbols = self.factory.extract_external_symbols(updated_documents) - updated_index.external_symbols[:] = external_symbols - - logger.info(f"Incremental index created with {len(updated_documents)} documents") - return updated_index - - def save_index_streaming(self, - index: scip_pb2.Index, - output_path: str, - compress: bool = True) -> None: - """Save index with streaming compression for large indexes.""" - logger.info(f"Saving index to {output_path} (compress={compress})") - - try: - if compress: - # Use compression for large indexes - import gzip - with gzip.open(output_path, 'wb') as f: - f.write(index.SerializeToString()) - else: - with open(output_path, 'wb') as f: - f.write(index.SerializeToString()) - - logger.info(f"Index saved successfully to {output_path}") - - except Exception as e: - logger.error(f"Failed to save index: {e}") - raise - - def load_index_streaming(self, input_path: str) -> scip_pb2.Index: - """Load index with streaming decompression.""" - logger.info(f"Loading index from {input_path}") - - try: - if input_path.endswith('.gz'): - import gzip - with gzip.open(input_path, 'rb') as f: - data = f.read() - else: - with open(input_path, 'rb') as f: - data = f.read() - - index = scip_pb2.Index() - index.ParseFromString(data) - - logger.info(f"Index loaded successfully with {len(index.documents)} documents") - return index - - except Exception as e: - logger.error(f"Failed to load index: {e}") - raise - - def watch_and_update(self, - watch_directory: str, - output_path: str, - update_interval: float = 5.0) -> None: - """Watch directory for changes and update index incrementally.""" - logger.info(f"Starting file watcher for {watch_directory}") - - last_update = time.time() - known_files = set() - last_index = None - - while not self._stop_event.is_set(): - try: - # Scan for changes - current_files = set() - modified_files = [] - - for ext in self.factory.get_supported_extensions(): - pattern = f"**/*{ext}" - for file_path in Path(watch_directory).rglob(pattern): - if file_path.is_file(): - current_files.add(str(file_path)) - - # Check if file is new or modified - if str(file_path) not in known_files or \ - file_path.stat().st_mtime > last_update: - modified_files.append(str(file_path)) - - # Update index if there are changes - if modified_files: - logger.info(f"Detected {len(modified_files)} modified files") - - # Create incremental index - updated_index = self.create_incremental_index(modified_files, last_index) - - # Save updated index - self.save_index_streaming(updated_index, output_path) - - last_index = updated_index - known_files = current_files - last_update = time.time() - - # Sleep before next check - time.sleep(update_interval) - - except Exception as e: - logger.error(f"Error in file watcher: {e}") - time.sleep(update_interval) - - def stop(self) -> None: - """Stop streaming indexer.""" - self._stop_event.set() - if self._executor: - self._executor.shutdown(wait=False) - logger.info("Streaming indexer stopped") - - def get_progress(self) -> Optional[IndexingProgress]: - """Get current indexing progress.""" - return self._progress - - def _process_file_chunk(self, file_paths: List[str]) -> List[scip_pb2.Document]: - """Process a chunk of files.""" - documents = [] - - for file_path in file_paths: - if self._stop_event.is_set(): - break - - try: - self._progress.current_file = file_path - self._notify_progress() - - # Check cache first - cached_doc = self.cache_manager.get_document_cache(file_path) - if cached_doc: - documents.append(cached_doc) - continue - - # Read and process file - content = self._read_file(file_path) - if content is None: - logger.warning(f"Could not read file: {file_path}") - continue - - # Create document - document = self.factory.create_document(file_path, content) - documents.append(document) - - # Cache the document - self.cache_manager.cache_document(file_path, document) - - except Exception as e: - logger.error(f"Failed to process {file_path}: {e}") - continue - - return documents - - def _notify_progress(self) -> None: - """Notify all progress callbacks.""" - if self._progress: - for callback in self._progress_callbacks: - try: - callback(self._progress) - except Exception as e: - logger.warning(f"Progress callback failed: {e}") - - def _read_file(self, file_path: str) -> Optional[str]: - """Read file content with encoding detection.""" - encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252'] - - for encoding in encodings: - try: - with open(file_path, 'r', encoding=encoding) as f: - return f.read() - except UnicodeDecodeError: - continue - except (OSError, PermissionError, FileNotFoundError) as e: - logger.warning(f"Could not read {file_path}: {e}") - return None - - logger.warning(f"Could not decode {file_path} with any supported encoding") - return None - - -class IndexMerger: - """Utility for merging multiple SCIP indexes.""" - - @staticmethod - def merge_indexes(indexes: List[scip_pb2.Index], - output_metadata: Optional[scip_pb2.Metadata] = None) -> scip_pb2.Index: - """Merge multiple SCIP indexes into one.""" - if not indexes: - raise ValueError("No indexes provided for merging") - - logger.info(f"Merging {len(indexes)} indexes") - - merged_index = scip_pb2.Index() - - # Use provided metadata or first index's metadata - if output_metadata: - merged_index.metadata.CopyFrom(output_metadata) - else: - merged_index.metadata.CopyFrom(indexes[0].metadata) - - # Collect all documents and external symbols - all_documents = [] - all_external_symbols = [] - seen_document_paths = set() - seen_external_symbols = set() - - for index in indexes: - # Add documents (avoid duplicates by path) - for doc in index.documents: - if doc.relative_path not in seen_document_paths: - all_documents.append(doc) - seen_document_paths.add(doc.relative_path) - else: - logger.warning(f"Duplicate document path: {doc.relative_path}") - - # Add external symbols (avoid duplicates by symbol ID) - for ext_symbol in index.external_symbols: - if ext_symbol.symbol not in seen_external_symbols: - all_external_symbols.append(ext_symbol) - seen_external_symbols.add(ext_symbol.symbol) - - merged_index.documents.extend(all_documents) - merged_index.external_symbols.extend(all_external_symbols) - - logger.info(f"Merged index contains {len(all_documents)} documents " - f"and {len(all_external_symbols)} external symbols") - - return merged_index \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/symbol_generator.py b/src/code_index_mcp/scip/framework/symbol_generator.py deleted file mode 100644 index 74fde92..0000000 --- a/src/code_index_mcp/scip/framework/symbol_generator.py +++ /dev/null @@ -1,144 +0,0 @@ -"""SCIP Symbol Generator - Strict format compliance for SCIP symbol ID generation.""" - -import re -import logging -from typing import Optional -from .types import SCIPSymbolDescriptor - - -logger = logging.getLogger(__name__) - - -class SCIPSymbolGenerator: - """SCIP standard symbol generator - strict format compliance.""" - - # SCIP symbol format validation patterns - SCHEME_PATTERN = re.compile(r'^[a-zA-Z][a-zA-Z0-9\-_]*$') - LOCAL_ID_PATTERN = re.compile(r'^[^\s]+$') - GLOBAL_SYMBOL_PATTERN = re.compile(r'^[^\s]+\s+[^\s]+\s+[^\s]+(\s+[^\s]+)?$') - - def __init__(self, scheme: str, package_manager: str, package_name: str, version: str): - """Initialize symbol generator with validation.""" - self._validate_scheme(scheme) - self._validate_package_info(package_manager, package_name, version) - - self.scheme = scheme - self.package = f"{package_manager} {package_name} {version}" - - def create_local_symbol(self, descriptor: SCIPSymbolDescriptor) -> str: - """Create local symbol ID - enforced SCIP format.""" - local_id = descriptor.to_scip_descriptor() - - # Validate local ID format - if not self._is_valid_local_id(local_id): - raise ValueError(f"Invalid local symbol ID: {local_id}") - - return f"local {local_id}" - - def create_global_symbol(self, descriptor: SCIPSymbolDescriptor) -> str: - """Create global symbol ID - complete SCIP format.""" - descriptor_str = descriptor.to_scip_descriptor() - - symbol_id = f"{self.scheme} {self.package} {descriptor_str}" - - # Validate global symbol format - if not self._is_valid_global_symbol(symbol_id): - raise ValueError(f"Invalid global symbol ID: {symbol_id}") - - return symbol_id - - def _validate_scheme(self, scheme: str) -> None: - """Validate scheme format against SCIP standards.""" - if not scheme: - raise ValueError("Scheme cannot be empty") - - if not self.SCHEME_PATTERN.match(scheme): - raise ValueError(f"Invalid scheme format: {scheme}. Must match pattern: {self.SCHEME_PATTERN.pattern}") - - if ' ' in scheme.replace(' ', ''): # Allow double space escaping - raise ValueError(f"Scheme cannot contain spaces: {scheme}") - - def _validate_package_info(self, package_manager: str, package_name: str, version: str) -> None: - """Validate package information components.""" - if not package_manager: - raise ValueError("Package manager cannot be empty") - if not package_name: - raise ValueError("Package name cannot be empty") - - # Version can be empty for local projects - for component in [package_manager, package_name, version]: - if component and (' ' in component): - raise ValueError(f"Package component cannot contain spaces: {component}") - - def _is_valid_local_id(self, local_id: str) -> bool: - """Validate local ID format compliance.""" - if not local_id: - return False - - # Check for leading/trailing spaces - if local_id.startswith(' ') or local_id.endswith(' '): - return False - - # Check basic pattern compliance - return self.LOCAL_ID_PATTERN.match(local_id) is not None - - def _is_valid_global_symbol(self, symbol_id: str) -> bool: - """Validate global symbol format compliance.""" - if not symbol_id: - return False - - # Split into components - parts = symbol_id.split(' ') - if len(parts) < 4: - return False - - # Validate each part is non-empty - return all(part.strip() for part in parts) - - def validate_symbol_id(self, symbol_id: str) -> bool: - """Validate any symbol ID against SCIP grammar.""" - if not symbol_id: - return False - - if symbol_id.startswith('local '): - return self._is_valid_local_id(symbol_id[6:]) - else: - return self._is_valid_global_symbol(symbol_id) - - def parse_symbol_id(self, symbol_id: str) -> Optional[dict]: - """Parse symbol ID into components for analysis.""" - if not self.validate_symbol_id(symbol_id): - return None - - if symbol_id.startswith('local '): - return { - 'type': 'local', - 'local_id': symbol_id[6:], - 'scheme': None, - 'package': None, - 'descriptor': symbol_id[6:] - } - else: - parts = symbol_id.split(' ', 3) - if len(parts) >= 4: - return { - 'type': 'global', - 'scheme': parts[0], - 'manager': parts[1], - 'package': parts[2], - 'descriptor': parts[3] - } - - return None - - def get_generator_info(self) -> dict: - """Get information about this generator instance.""" - return { - 'scheme': self.scheme, - 'package': self.package, - 'validation_patterns': { - 'scheme': self.SCHEME_PATTERN.pattern, - 'local_id': self.LOCAL_ID_PATTERN.pattern, - 'global_symbol': self.GLOBAL_SYMBOL_PATTERN.pattern - } - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/types.py b/src/code_index_mcp/scip/framework/types.py deleted file mode 100644 index f2da49f..0000000 --- a/src/code_index_mcp/scip/framework/types.py +++ /dev/null @@ -1,79 +0,0 @@ -"""SCIP Framework Types - Core type definitions for SCIP standard compliance.""" - -from dataclasses import dataclass -from typing import List, Dict, Protocol, Iterator, Tuple, Optional -from abc import ABC, abstractmethod - - -@dataclass(frozen=True) -class SCIPSymbolDescriptor: - """SCIP symbol descriptor - immutable data structure for symbol information.""" - name: str - kind: str # function, class, variable, etc. - scope_path: List[str] - descriptor_suffix: str # (). # (param) etc. - - def to_scip_descriptor(self) -> str: - """Convert to SCIP standard descriptor format.""" - scope = ".".join(self.scope_path) if self.scope_path else "" - full_path = f"{scope}.{self.name}" if scope else self.name - return f"{full_path}{self.descriptor_suffix}" - - -@dataclass(frozen=True) -class SCIPPositionInfo: - """SCIP position information - immutable position data with validation.""" - start_line: int - start_column: int - end_line: int - end_column: int - - def validate(self) -> bool: - """Validate position information for SCIP compliance.""" - return ( - self.start_line <= self.end_line and - (self.start_line < self.end_line or self.start_column <= self.end_column) and - all(x >= 0 for x in [self.start_line, self.start_column, self.end_line, self.end_column]) - ) - - -@dataclass -class SCIPSymbolContext: - """Context information for symbol extraction and processing.""" - file_path: str - content: str - scope_stack: List[str] - imports: Dict[str, str] - - def with_scope(self, scope_name: str) -> 'SCIPSymbolContext': - """Create new context with additional scope.""" - return SCIPSymbolContext( - file_path=self.file_path, - content=self.content, - scope_stack=self.scope_stack + [scope_name], - imports=self.imports.copy() - ) - - -# Alias for compatibility -SCIPContext = SCIPSymbolContext - -# Import and alias Relationship type -from .relationship_manager import SymbolRelationship -Relationship = SymbolRelationship - - -class SCIPSymbolExtractor(Protocol): - """Symbol extractor protocol - mandatory interface for symbol extraction.""" - - def extract_symbols(self, context: SCIPSymbolContext) -> Iterator[SCIPSymbolDescriptor]: - """Extract symbol definitions from context.""" - ... - - def extract_references(self, context: SCIPSymbolContext) -> Iterator[Tuple[SCIPSymbolDescriptor, SCIPPositionInfo]]: - """Extract symbol references with position information.""" - ... - - def extract_relationships(self, context: SCIPSymbolContext) -> Iterator[Tuple[str, str, str]]: - """Extract symbol relationships (source, target, relationship_type).""" - ... \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/unified_api.py b/src/code_index_mcp/scip/framework/unified_api.py deleted file mode 100644 index 2e955a7..0000000 --- a/src/code_index_mcp/scip/framework/unified_api.py +++ /dev/null @@ -1,456 +0,0 @@ -"""SCIP Framework Unified API - Single entry point for all SCIP framework functionality.""" - -import logging -import os -from typing import Dict, List, Optional, Set, Any, Callable, Iterator -from pathlib import Path -from dataclasses import dataclass - -from .index_factory import SCIPIndexFactory -from .python import create_python_scip_factory, PythonSCIPIndexFactory -from .javascript import create_javascript_scip_factory, JavaScriptSCIPIndexFactory -from .java import create_java_scip_factory, JavaSCIPIndexFactory -from .fallback import create_fallback_scip_factory, FallbackSCIPIndexFactory -from .caching_system import SCIPCacheManager, BatchProcessor -from .streaming_indexer import StreamingIndexer, IndexingProgress, IndexMerger -from .compliance_validator import SCIPComplianceValidator -from ..proto import scip_pb2 - -logger = logging.getLogger(__name__) - - -@dataclass -class SCIPConfig: - """Configuration for SCIP framework.""" - project_root: str - cache_enabled: bool = True - cache_dir: Optional[str] = None - max_workers: int = 4 - batch_size: int = 50 - streaming_chunk_size: int = 100 - validate_compliance: bool = True - supported_languages: Optional[Set[str]] = None - exclude_patterns: Optional[List[str]] = None - - def __post_init__(self): - if self.supported_languages is None: - self.supported_languages = {'python', 'javascript', 'typescript', 'java', 'fallback'} - - if self.exclude_patterns is None: - self.exclude_patterns = [ - '__pycache__', '.git', 'node_modules', '.vscode', - '.idea', '*.pyc', '*.pyo', '*.class' - ] - - -class SCIPFrameworkAPI: - """Unified API for SCIP framework - single entry point for all functionality.""" - - def __init__(self, config: SCIPConfig): - """Initialize SCIP framework with configuration.""" - self.config = config - - # Initialize core components - self.cache_manager = None - if config.cache_enabled: - self.cache_manager = SCIPCacheManager( - cache_dir=config.cache_dir, - max_memory_entries=1000 - ) - - self.validator = SCIPComplianceValidator() if config.validate_compliance else None - - # Language-specific factories - self._factories: Dict[str, SCIPIndexFactory] = {} - self._init_factories() - - # Streaming components - self._streaming_indexers: Dict[str, StreamingIndexer] = {} - - logger.info(f"Initialized SCIP Framework API for project: {config.project_root}") - logger.info(f"Supported languages: {config.supported_languages}") - - def detect_project_languages(self, scan_depth: int = 3) -> Set[str]: - """Automatically detect programming languages in the project.""" - detected_languages = set() - project_path = Path(self.config.project_root) - - # Language detection by file extensions - language_extensions = { - 'python': {'.py', '.pyw', '.pyx'}, - 'javascript': {'.js', '.jsx', '.mjs', '.cjs'}, - 'typescript': {'.ts', '.tsx'}, - 'java': {'.java'}, - 'fallback': set() # Fallback handles everything else - } - - # Scan project files - for depth in range(scan_depth + 1): - pattern = '*/' * depth + '*' - - for file_path in project_path.glob(pattern): - if file_path.is_file(): - file_ext = file_path.suffix.lower() - - for lang, extensions in language_extensions.items(): - if file_ext in extensions and lang in self.config.supported_languages: - detected_languages.add(lang) - - logger.info(f"Detected languages: {detected_languages}") - return detected_languages - - def create_complete_index(self, - languages: Optional[Set[str]] = None, - progress_callback: Optional[Callable[[IndexingProgress], None]] = None - ) -> scip_pb2.Index: - """Create complete SCIP index for the project.""" - if languages is None: - languages = self.detect_project_languages() - - logger.info(f"Creating complete index for languages: {languages}") - - # Collect all files by language - files_by_language = self._collect_files_by_language(languages) - - # Create index with metadata - index = scip_pb2.Index() - - # Use first available factory for metadata (they should be consistent) - first_factory = next(iter(self._factories.values())) - index.metadata.CopyFrom(first_factory.create_metadata(self.config.project_root)) - - # Process files by language - all_documents = [] - all_external_symbols = [] - - for language, file_paths in files_by_language.items(): - if language not in self._factories: - logger.warning(f"No factory available for language: {language}") - continue - - logger.info(f"Processing {len(file_paths)} {language} files") - - # Get streaming indexer for this language - streaming_indexer = self._get_streaming_indexer(language) - if progress_callback: - streaming_indexer.add_progress_callback(progress_callback) - - # Process files with streaming - language_documents = list(streaming_indexer.index_files_streaming(file_paths)) - all_documents.extend(language_documents) - - # Extract external symbols - factory = self._factories[language] - external_symbols = factory.extract_external_symbols(language_documents) - all_external_symbols.extend(external_symbols) - - # Add all documents and external symbols to index - index.documents.extend(all_documents) - index.external_symbols.extend(all_external_symbols) - - # Validate if requested - if self.validator: - is_valid = self.validator.validate_index(index) - if not is_valid: - logger.warning("Generated index failed compliance validation") - validation_summary = self.validator.get_validation_summary() - logger.warning(f"Validation errors: {validation_summary['error_messages']}") - - logger.info(f"Complete index created with {len(all_documents)} documents " - f"and {len(all_external_symbols)} external symbols") - - return index - - def create_incremental_index(self, - modified_files: List[str], - existing_index_path: Optional[str] = None - ) -> scip_pb2.Index: - """Create incremental index for modified files.""" - logger.info(f"Creating incremental index for {len(modified_files)} files") - - # Load existing index if provided - existing_index = None - if existing_index_path and os.path.exists(existing_index_path): - try: - streaming_indexer = next(iter(self._streaming_indexers.values())) - existing_index = streaming_indexer.load_index_streaming(existing_index_path) - logger.info(f"Loaded existing index with {len(existing_index.documents)} documents") - except Exception as e: - logger.warning(f"Failed to load existing index: {e}") - - # Group files by language - files_by_language = self._group_files_by_language(modified_files) - - # Create incremental updates for each language - language_indexes = [] - for language, file_paths in files_by_language.items(): - if language not in self._factories: - continue - - streaming_indexer = self._get_streaming_indexer(language) - lang_index = streaming_indexer.create_incremental_index(file_paths, existing_index) - language_indexes.append(lang_index) - - # Merge language indexes - if len(language_indexes) == 1: - return language_indexes[0] - elif len(language_indexes) > 1: - return IndexMerger.merge_indexes(language_indexes) - else: - # No valid files to process - return existing_index or scip_pb2.Index() - - def save_index(self, - index: scip_pb2.Index, - output_path: str, - compress: bool = True) -> None: - """Save SCIP index to file.""" - streaming_indexer = self._get_any_streaming_indexer() - streaming_indexer.save_index_streaming(index, output_path, compress) - - def load_index(self, input_path: str) -> scip_pb2.Index: - """Load SCIP index from file.""" - streaming_indexer = self._get_any_streaming_indexer() - return streaming_indexer.load_index_streaming(input_path) - - def validate_index(self, index: scip_pb2.Index) -> Dict[str, Any]: - """Validate SCIP index compliance.""" - if not self.validator: - return {"validation_enabled": False} - - is_valid = self.validator.validate_index(index) - return { - "is_valid": is_valid, - "validation_enabled": True, - **self.validator.get_validation_summary() - } - - def get_cache_statistics(self) -> Dict[str, Any]: - """Get cache performance statistics.""" - if not self.cache_manager: - return {"cache_enabled": False} - - return { - "cache_enabled": True, - **self.cache_manager.get_cache_statistics() - } - - def clear_cache(self) -> None: - """Clear all caches.""" - if self.cache_manager: - self.cache_manager.invalidate_all_cache() - logger.info("Cache cleared") - - def start_file_watcher(self, - output_path: str, - update_interval: float = 5.0) -> None: - """Start file watcher for automatic index updates.""" - # Use Python factory's streaming indexer for watching - # (could be enhanced to support multiple languages) - streaming_indexer = self._get_streaming_indexer('python') - streaming_indexer.watch_and_update( - self.config.project_root, - output_path, - update_interval - ) - - def stop_all_watchers(self) -> None: - """Stop all file watchers and streaming indexers.""" - for indexer in self._streaming_indexers.values(): - indexer.stop() - logger.info("All watchers stopped") - - def analyze_symbol_relationships(self, index: scip_pb2.Index) -> Dict[str, Any]: - """Analyze symbol relationships in the index.""" - relationship_stats = { - "total_symbols": len(index.external_symbols), - "documents_with_symbols": 0, - "symbols_per_document": {}, - "symbol_types": {}, - "relationship_patterns": [] - } - - # Analyze documents - for doc in index.documents: - symbol_count = len(doc.symbols) - occurrence_count = len(doc.occurrences) - - if symbol_count > 0: - relationship_stats["documents_with_symbols"] += 1 - - relationship_stats["symbols_per_document"][doc.relative_path] = { - "symbols": symbol_count, - "occurrences": occurrence_count - } - - # Analyze symbol types in document - for symbol in doc.symbols: - symbol_kind_name = self._get_symbol_kind_name(symbol.kind) - if symbol_kind_name not in relationship_stats["symbol_types"]: - relationship_stats["symbol_types"][symbol_kind_name] = 0 - relationship_stats["symbol_types"][symbol_kind_name] += 1 - - return relationship_stats - - def export_index_json(self, index: scip_pb2.Index, output_path: str) -> None: - """Export index to JSON format for analysis.""" - from google.protobuf.json_format import MessageToDict - - try: - index_dict = MessageToDict(index) - - import json - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(index_dict, f, indent=2, ensure_ascii=False) - - logger.info(f"Index exported to JSON: {output_path}") - - except Exception as e: - logger.error(f"Failed to export index to JSON: {e}") - raise - - def get_framework_info(self) -> Dict[str, Any]: - """Get comprehensive framework information.""" - return { - "config": { - "project_root": self.config.project_root, - "cache_enabled": self.config.cache_enabled, - "max_workers": self.config.max_workers, - "batch_size": self.config.batch_size, - "supported_languages": list(self.config.supported_languages), - "validate_compliance": self.config.validate_compliance - }, - "factories": list(self._factories.keys()), - "streaming_indexers": list(self._streaming_indexers.keys()), - "cache_statistics": self.get_cache_statistics(), - "detected_languages": list(self.detect_project_languages()) - } - - def _init_factories(self) -> None: - """Initialize language-specific factories.""" - if 'python' in self.config.supported_languages: - self._factories['python'] = create_python_scip_factory(self.config.project_root) - - if 'javascript' in self.config.supported_languages or 'typescript' in self.config.supported_languages: - self._factories['javascript'] = create_javascript_scip_factory(self.config.project_root) - self._factories['typescript'] = self._factories['javascript'] # Same factory - - if 'java' in self.config.supported_languages: - self._factories['java'] = create_java_scip_factory(self.config.project_root) - - if 'fallback' in self.config.supported_languages: - self._factories['fallback'] = create_fallback_scip_factory(self.config.project_root) - - def _get_streaming_indexer(self, language: str) -> StreamingIndexer: - """Get or create streaming indexer for language.""" - if language not in self._streaming_indexers: - if language not in self._factories: - raise ValueError(f"No factory available for language: {language}") - - factory = self._factories[language] - self._streaming_indexers[language] = StreamingIndexer( - factory=factory, - cache_manager=self.cache_manager, - max_workers=self.config.max_workers, - chunk_size=self.config.streaming_chunk_size - ) - - return self._streaming_indexers[language] - - def _get_any_streaming_indexer(self) -> StreamingIndexer: - """Get any available streaming indexer.""" - if not self._streaming_indexers: - # Create one for the first available language - first_language = next(iter(self._factories.keys())) - return self._get_streaming_indexer(first_language) - - return next(iter(self._streaming_indexers.values())) - - def _collect_files_by_language(self, languages: Set[str]) -> Dict[str, List[str]]: - """Collect all project files grouped by language.""" - files_by_language = {lang: [] for lang in languages} - - project_path = Path(self.config.project_root) - - # Language to extensions mapping - language_extensions = { - 'python': {'.py', '.pyw', '.pyx'}, - 'javascript': {'.js', '.jsx', '.mjs', '.cjs'}, - 'typescript': {'.ts', '.tsx'}, - 'java': {'.java'} - } - - # Scan all files - for file_path in project_path.rglob('*'): - if not file_path.is_file(): - continue - - # Skip excluded patterns - if self._should_exclude_file(str(file_path)): - continue - - file_ext = file_path.suffix.lower() - - # Categorize by language - for lang in languages: - if lang in language_extensions: - if file_ext in language_extensions[lang]: - files_by_language[lang].append(str(file_path)) - break - - # Log file counts - for lang, files in files_by_language.items(): - if files: - logger.info(f"Found {len(files)} {lang} files") - - return files_by_language - - def _group_files_by_language(self, file_paths: List[str]) -> Dict[str, List[str]]: - """Group given files by language.""" - files_by_language = {} - - language_extensions = { - 'python': {'.py', '.pyw', '.pyx'}, - 'javascript': {'.js', '.jsx', '.mjs', '.cjs'}, - 'typescript': {'.ts', '.tsx'}, - 'java': {'.java'} - } - - for file_path in file_paths: - file_ext = Path(file_path).suffix.lower() - - for lang, extensions in language_extensions.items(): - if file_ext in extensions and lang in self.config.supported_languages: - if lang not in files_by_language: - files_by_language[lang] = [] - files_by_language[lang].append(file_path) - break - - return files_by_language - - def _should_exclude_file(self, file_path: str) -> bool: - """Check if file should be excluded based on patterns.""" - path_str = str(file_path) - - for pattern in self.config.exclude_patterns: - if pattern in path_str: - return True - - return False - - def _get_symbol_kind_name(self, symbol_kind: int) -> str: - """Get human-readable symbol kind name.""" - # Use enum mapper from any factory - if self._factories: - factory = next(iter(self._factories.values())) - if hasattr(factory, '_enum_mapper'): - return factory._enum_mapper.get_symbol_kind_name(symbol_kind) or f"Unknown({symbol_kind})" - - return f"SymbolKind({symbol_kind})" - - -# Convenience function for quick setup -def create_scip_framework(project_root: str, **kwargs) -> SCIPFrameworkAPI: - """Create SCIP framework with default configuration.""" - config = SCIPConfig(project_root=project_root, **kwargs) - return SCIPFrameworkAPI(config) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/__init__.py b/src/code_index_mcp/scip/framework/zig/__init__.py deleted file mode 100644 index e4a0910..0000000 --- a/src/code_index_mcp/scip/framework/zig/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Zig SCIP framework module.""" - -from .factory import ZigSCIPIndexFactory, create_zig_scip_factory -from .enum_mapper import ZigEnumMapper -from .relationship_extractor import ZigRelationshipExtractor -from .tree_sitter_analyzer import ZigTreeSitterAnalyzer - -__all__ = [ - 'ZigSCIPIndexFactory', - 'create_zig_scip_factory', - 'ZigEnumMapper', - 'ZigRelationshipExtractor', - 'ZigTreeSitterAnalyzer' -] \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/enum_mapper.py b/src/code_index_mcp/scip/framework/zig/enum_mapper.py deleted file mode 100644 index c4fb191..0000000 --- a/src/code_index_mcp/scip/framework/zig/enum_mapper.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Zig enum mapper implementation.""" - -from typing import Dict -from ..base.enum_mapper import BaseEnumMapper -from ...proto import scip_pb2 - - -class ZigEnumMapper(BaseEnumMapper): - """Zig-specific enum mapper for SCIP compliance.""" - - # Zig symbol kind mappings - SYMBOL_KIND_MAP = { - 'function': scip_pb2.Function, - 'method': scip_pb2.Method, - 'struct': scip_pb2.Struct, - 'union': scip_pb2.Struct, - 'enum': scip_pb2.Enum, - 'field': scip_pb2.Field, - 'variable': scip_pb2.Variable, - 'parameter': scip_pb2.Parameter, - 'constant': scip_pb2.Constant, - 'type': scip_pb2.Type, - 'namespace': scip_pb2.Namespace, - 'module': scip_pb2.Module, - 'local_variable': scip_pb2.Variable, - 'global_variable': scip_pb2.Variable, - 'error_set': scip_pb2.Type, - 'test_declaration': scip_pb2.Function, - 'comptime_declaration': scip_pb2.Function, - } - - # Zig syntax kind mappings - SYNTAX_KIND_MAP = { - 'function_declaration': scip_pb2.IdentifierFunctionDefinition, - 'struct_declaration': scip_pb2.IdentifierType, - 'union_declaration': scip_pb2.IdentifierType, - 'enum_declaration': scip_pb2.IdentifierType, - 'field_declaration': scip_pb2.IdentifierAttribute, - 'variable_declaration': scip_pb2.IdentifierLocal, - 'parameter_declaration': scip_pb2.IdentifierParameter, - 'constant_declaration': scip_pb2.IdentifierConstant, - 'type_declaration': scip_pb2.IdentifierType, - 'test_declaration': scip_pb2.IdentifierFunctionDefinition, - 'comptime_declaration': scip_pb2.IdentifierFunctionDefinition, - 'identifier': scip_pb2.Identifier, - 'keyword': scip_pb2.IdentifierKeyword, - 'string_literal': scip_pb2.StringLiteral, - 'numeric_literal': scip_pb2.NumericLiteral, - 'boolean_literal': scip_pb2.BooleanLiteral, - 'comment': scip_pb2.Comment, - 'punctuation': scip_pb2.PunctuationDelimiter, - } - - # Zig symbol role mappings (official SCIP naming) - SYMBOL_ROLE_MAP = { - 'definition': scip_pb2.Definition, - 'import': scip_pb2.Import, - 'write': scip_pb2.Write, # Official SCIP naming - 'read': scip_pb2.Read, # Official SCIP naming - 'generated': scip_pb2.Generated, - 'test': scip_pb2.Test, - 'type': scip_pb2.Type, # Add missing Type role - 'reference': scip_pb2.Read, # Default reference is read access - } - - def map_symbol_kind(self, language_kind: str) -> int: - """Map Zig symbol type to SCIP SymbolKind.""" - kind = self.SYMBOL_KIND_MAP.get(language_kind, scip_pb2.UnspecifiedSymbolKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SymbolKind'): - raise ValueError(f"Invalid SymbolKind: {kind} for language_kind: {language_kind}") - - return kind - - def map_syntax_kind(self, language_syntax: str) -> int: - """Map Zig syntax element to SCIP SyntaxKind.""" - kind = self.SYNTAX_KIND_MAP.get(language_syntax, scip_pb2.UnspecifiedSyntaxKind) - - # Validate enum value - if not self.validate_enum_value(kind, 'SyntaxKind'): - raise ValueError(f"Invalid SyntaxKind: {kind} for language_syntax: {language_syntax}") - - return kind - - def map_symbol_role(self, language_role: str) -> int: - """Map Zig symbol role to SCIP SymbolRole.""" - role = self.SYMBOL_ROLE_MAP.get(language_role, scip_pb2.Read) - - # Validate enum value - if not self.validate_enum_value(role, 'SymbolRole'): - raise ValueError(f"Invalid SymbolRole: {role} for language_role: {language_role}") - - return role - - def get_zig_node_symbol_kind(self, node_type: str) -> str: - """ - Map Zig tree-sitter node type to internal symbol kind string. - - Args: - node_type: Zig tree-sitter node type (e.g., 'function_declaration', 'struct_declaration') - - Returns: - Internal symbol kind string for use with map_symbol_kind() - """ - node_kind_map = { - 'function_declaration': 'function', - 'struct_declaration': 'struct', - 'union_declaration': 'union', - 'enum_declaration': 'enum', - 'field_declaration': 'field', - 'variable_declaration': 'variable', - 'parameter_declaration': 'parameter', - 'constant_declaration': 'constant', - 'type_declaration': 'type', - 'test_declaration': 'test_declaration', - 'comptime_declaration': 'comptime_declaration', - 'error_set_declaration': 'error_set', - 'container_field': 'field', - 'builtin_call_expr': 'function', - } - - return node_kind_map.get(node_type, 'variable') - - def get_zig_node_syntax_kind(self, node_type: str, context: str = None) -> str: - """ - Map Zig tree-sitter node type to internal syntax kind string. - - Args: - node_type: Zig tree-sitter node type - context: Additional context for disambiguation - - Returns: - Internal syntax kind string for use with map_syntax_kind() - """ - node_syntax_map = { - 'function_declaration': 'function_declaration', - 'struct_declaration': 'struct_declaration', - 'union_declaration': 'union_declaration', - 'enum_declaration': 'enum_declaration', - 'field_declaration': 'field_declaration', - 'variable_declaration': 'variable_declaration', - 'parameter_declaration': 'parameter_declaration', - 'constant_declaration': 'constant_declaration', - 'type_declaration': 'type_declaration', - 'test_declaration': 'test_declaration', - 'comptime_declaration': 'comptime_declaration', - 'identifier': 'identifier', - 'string_literal': 'string_literal', - 'integer_literal': 'numeric_literal', - 'float_literal': 'numeric_literal', - 'builtin_identifier': 'keyword', - 'boolean_literal': 'boolean_literal', - } - - return node_syntax_map.get(node_type, 'identifier') - - def get_zig_node_symbol_role(self, node_type: str, context: str = None) -> str: - """ - Map Zig tree-sitter node type to internal symbol role string. - - Args: - node_type: Zig tree-sitter node type - context: Additional context (e.g., 'in_assignment', 'in_call') - - Returns: - Internal symbol role string for use with map_symbol_role() - """ - if context == 'definition': - return 'definition' - elif context == 'assignment': - return 'write' - elif context == 'import': - return 'import' - elif context == 'test': - return 'test' - elif node_type in ['function_declaration', 'struct_declaration', 'union_declaration', - 'enum_declaration', 'field_declaration', 'variable_declaration', - 'constant_declaration', 'type_declaration', 'test_declaration']: - return 'definition' - else: - return 'reference' - - def is_valid_zig_symbol_kind(self, symbol_kind: str) -> bool: - """Check if symbol kind is valid for Zig.""" - return symbol_kind in self.SYMBOL_KIND_MAP - - def is_valid_zig_syntax_kind(self, syntax_kind: str) -> bool: - """Check if syntax kind is valid for Zig.""" - return syntax_kind in self.SYNTAX_KIND_MAP - - def is_valid_zig_symbol_role(self, symbol_role: str) -> bool: - """Check if symbol role is valid for Zig.""" - return symbol_role in self.SYMBOL_ROLE_MAP - - def get_all_zig_symbol_kinds(self) -> list: - """Get all available Zig symbol kinds.""" - return list(self.SYMBOL_KIND_MAP.keys()) - - def get_all_zig_syntax_kinds(self) -> list: - """Get all available Zig syntax kinds.""" - return list(self.SYNTAX_KIND_MAP.keys()) - - def get_all_zig_symbol_roles(self) -> list: - """Get all available Zig symbol roles.""" - return list(self.SYMBOL_ROLE_MAP.keys()) - - def get_zig_specific_kinds(self) -> Dict[str, str]: - """Get Zig-specific symbol kinds.""" - return { - 'error_set': 'error_set', - 'test_declaration': 'test_declaration', - 'comptime_declaration': 'comptime_declaration', - 'builtin_function': 'function', - 'global_variable': 'global_variable', - 'local_variable': 'local_variable', - } \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/factory.py b/src/code_index_mcp/scip/framework/zig/factory.py deleted file mode 100644 index afef434..0000000 --- a/src/code_index_mcp/scip/framework/zig/factory.py +++ /dev/null @@ -1,388 +0,0 @@ -"""Zig SCIP Index Factory implementation.""" - -import os -from pathlib import Path -from typing import Set, List, Iterator, Optional -from ..base.index_factory import SCIPIndexFactory -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..base.enum_mapper import BaseEnumMapper -from ..symbol_generator import SCIPSymbolGenerator -from ..position_calculator import SCIPPositionCalculator -from ..types import SCIPContext, SCIPSymbolDescriptor -from .relationship_extractor import ZigRelationshipExtractor -from .enum_mapper import ZigEnumMapper -from .tree_sitter_analyzer import ZigTreeSitterAnalyzer -from ...proto import scip_pb2 - -import tree_sitter -from tree_sitter_zig import language as zig_language - - -class ZigSCIPIndexFactory(SCIPIndexFactory): - """Zig-specific SCIP Index factory implementation with constructor injection.""" - - def __init__(self, - project_root: str, - symbol_generator: SCIPSymbolGenerator, - relationship_extractor: BaseRelationshipExtractor, - enum_mapper: BaseEnumMapper, - position_calculator: SCIPPositionCalculator): - """Initialize Zig factory with required components via constructor injection.""" - super().__init__(project_root, symbol_generator, relationship_extractor, - enum_mapper, position_calculator) - self.tree_analyzer = ZigTreeSitterAnalyzer() - - def get_language(self) -> str: - """Return language identifier.""" - return "zig" - - def get_supported_extensions(self) -> Set[str]: - """Return supported file extensions.""" - return {'.zig', '.zon'} - - def _extract_symbols(self, context: SCIPContext) -> Iterator[scip_pb2.SymbolInformation]: - """Extract Zig symbol definitions using tree-sitter analysis.""" - try: - tree = self.tree_analyzer.parse(context.content, context.file_path) - - for node in self.tree_analyzer.walk(tree): - if self.tree_analyzer.is_symbol_definition(node): - symbol_info = self._create_symbol_from_tree_node(node, context) - if symbol_info: - yield symbol_info - - except SyntaxError as e: - # Handle syntax errors gracefully - pass - - def _extract_occurrences(self, context: SCIPContext) -> Iterator[scip_pb2.Occurrence]: - """Extract Zig symbol occurrences.""" - try: - tree = self.tree_analyzer.parse(context.content, context.file_path) - - for node in self.tree_analyzer.walk(tree): - if (self.tree_analyzer.is_symbol_definition(node) or - self.tree_analyzer.is_symbol_reference(node)): - occurrence = self._create_occurrence_from_tree_node(node, context) - if occurrence: - yield occurrence - - except SyntaxError as e: - # Handle syntax errors gracefully - pass - - def extract_external_symbols(self, documents: List[scip_pb2.Document]) -> List[scip_pb2.SymbolInformation]: - """Extract Zig external symbols from imports.""" - external_symbols = [] - - for doc in documents: - try: - content = self._read_file(os.path.join(self.project_root, doc.relative_path)) - tree = self.tree_analyzer.parse(content, doc.relative_path) - - # Extract import statements - import_statements = self.tree_analyzer.extract_import_statements(tree) - for import_path in import_statements: - external_symbol = self._create_external_symbol_from_import(import_path) - if external_symbol: - external_symbols.append(external_symbol) - - except Exception as e: - # Skip problematic files - continue - - return external_symbols - - def build_cross_document_relationships(self, documents: List[scip_pb2.Document], full_index: scip_pb2.Index) -> int: - """ - Build Zig-specific cross-document relationships. - - This implementation provides basic cross-document relationship support - for Zig. A more sophisticated implementation would analyze @import statements - and module dependencies. - """ - # For now, use a simplified approach - # TODO: Implement proper Zig import analysis - return 0 # Placeholder - no relationships added yet - - def _create_symbol_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.SymbolInformation]: - """Create SCIP symbol information from tree-sitter node.""" - symbol_info = scip_pb2.SymbolInformation() - - symbol_name = self.tree_analyzer.get_symbol_name(node) - if not symbol_name: - return None - - if node.type == 'function_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="function", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('function') - - elif node.type == 'struct_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="struct", - scope_path=context.scope_stack, - descriptor_suffix="#" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('struct') - - elif node.type == 'union_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="union", - scope_path=context.scope_stack, - descriptor_suffix="#" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('union') - - elif node.type == 'enum_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="enum", - scope_path=context.scope_stack, - descriptor_suffix="#" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('enum') - - elif node.type == 'variable_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="variable", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('variable') - - elif node.type == 'constant_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="constant", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('constant') - - elif node.type == 'type_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="type", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('type') - - elif node.type == 'container_field': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="field", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('field') - - elif node.type == 'parameter_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="parameter", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('parameter') - - elif node.type == 'test_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="test_declaration", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('test_declaration') - - elif node.type == 'comptime_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="comptime_declaration", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('comptime_declaration') - - elif node.type == 'error_set_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="error_set", - scope_path=context.scope_stack, - descriptor_suffix="#" - ) - symbol_info.symbol = self.symbol_generator.create_local_symbol(descriptor) - symbol_info.display_name = symbol_name - symbol_info.kind = self.enum_mapper.map_symbol_kind('error_set') - - else: - return None - - return symbol_info - - def _create_occurrence_from_tree_node(self, node, context: SCIPContext) -> Optional[scip_pb2.Occurrence]: - """Create SCIP occurrence from tree-sitter node.""" - occurrence = scip_pb2.Occurrence() - - # Calculate position using position calculator - try: - position_info = self.position_calculator.calculate_positions_from_tree_node( - context.content, node - ) - - # Set range - occurrence.range.start.extend([position_info.start_line, position_info.start_column]) - occurrence.range.end.extend([position_info.end_line, position_info.end_column]) - - except Exception as e: - # Skip if position calculation fails - return None - - symbol_name = self.tree_analyzer.get_symbol_name(node) - if not symbol_name: - return None - - # Set symbol and roles based on node type - if node.type == 'function_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="function", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('function_declaration') - - elif node.type in ['struct_declaration', 'union_declaration', 'enum_declaration']: - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind=node.type.replace('_declaration', ''), - scope_path=context.scope_stack, - descriptor_suffix="#" - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind(f'{node.type}') - - elif node.type in ['variable_declaration', 'constant_declaration']: - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind=node.type.replace('_declaration', ''), - scope_path=context.scope_stack, - descriptor_suffix="" - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('definition') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind(f'{node.type}') - - elif node.type == 'test_declaration': - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="test_declaration", - scope_path=context.scope_stack, - descriptor_suffix="()." - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('test') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('test_declaration') - - elif node.type == 'identifier': - # Handle variable references - descriptor = SCIPSymbolDescriptor( - name=symbol_name, - kind="variable", - scope_path=context.scope_stack, - descriptor_suffix="" - ) - occurrence.symbol = self.symbol_generator.create_local_symbol(descriptor) - occurrence.symbol_roles = self.enum_mapper.map_symbol_role('reference') - occurrence.syntax_kind = self.enum_mapper.map_syntax_kind('identifier') - - else: - return None - - return occurrence - - def _create_external_symbol_from_import(self, import_path: str) -> Optional[scip_pb2.SymbolInformation]: - """Create external symbol from import statement.""" - symbol_info = scip_pb2.SymbolInformation() - - # Determine if it's a standard library, C library, or external import - if import_path.startswith('std'): - symbol_info.symbol = f"zig-std {import_path}" - symbol_info.display_name = import_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"Zig standard library: {import_path}") - elif import_path.startswith('c'): - symbol_info.symbol = f"c-lib {import_path}" - symbol_info.display_name = import_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"C library: {import_path}") - elif import_path.startswith('./') or import_path.startswith('../'): - symbol_info.symbol = f"local {import_path}" - symbol_info.display_name = import_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"Local module: {import_path}") - else: - symbol_info.symbol = f"zig-external {import_path}" - symbol_info.display_name = import_path - symbol_info.kind = self.enum_mapper.map_symbol_kind('module') - symbol_info.documentation.append(f"External Zig module: {import_path}") - - return symbol_info - - -def create_zig_scip_factory(project_root: str) -> ZigSCIPIndexFactory: - """ - Factory creator for Zig SCIP factory. - Ensures all required components are properly assembled via constructor injection. - """ - symbol_generator = SCIPSymbolGenerator( - scheme="scip-zig", - package_manager="zig", - package_name=Path(project_root).name, - version="HEAD" - ) - - relationship_extractor = ZigRelationshipExtractor() - enum_mapper = ZigEnumMapper() - position_calculator = SCIPPositionCalculator() - - return ZigSCIPIndexFactory( - project_root=project_root, - symbol_generator=symbol_generator, - relationship_extractor=relationship_extractor, # Guaranteed to be provided - enum_mapper=enum_mapper, - position_calculator=position_calculator - ) \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/relationship_extractor.py b/src/code_index_mcp/scip/framework/zig/relationship_extractor.py deleted file mode 100644 index 8bbad94..0000000 --- a/src/code_index_mcp/scip/framework/zig/relationship_extractor.py +++ /dev/null @@ -1,322 +0,0 @@ -"""Zig relationship extractor implementation.""" - -from typing import Iterator, Optional, List -from ..base.relationship_extractor import BaseRelationshipExtractor -from ..types import SCIPContext, Relationship -from ...core.relationship_types import InternalRelationshipType - -import tree_sitter -from tree_sitter_zig import language as zig_language - - -class ZigRelationshipExtractor(BaseRelationshipExtractor): - """Zig-specific relationship extractor using tree-sitter analysis.""" - - def __init__(self): - """Initialize the Zig relationship extractor.""" - lang = tree_sitter.Language(zig_language()) - self.parser = tree_sitter.Parser(lang) - - def extract_inheritance_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract inheritance relationships from Zig (limited, as Zig doesn't have traditional inheritance).""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - # Zig doesn't have traditional inheritance, but we can extract composition relationships - # where structs contain other struct types - for node in self._walk_tree(tree.root_node): - if node.type == 'struct_declaration': - struct_name = self._get_struct_name(node) - if not struct_name: - continue - - struct_symbol_id = self._create_struct_symbol_id(struct_name, context) - - # Look for embedded structs or type fields that reference other types - for field_node in self._walk_tree(node): - if field_node.type == 'container_field': - field_type = self._get_field_type(field_node, context.content) - if field_type and self._is_custom_type(field_type): - type_symbol_id = self._create_type_symbol_id(field_type, context) - yield Relationship( - source_symbol=struct_symbol_id, - target_symbol=type_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_call_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract function call relationships.""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type == 'function_declaration': - function_name = self._get_function_name(node) - if not function_name: - continue - - function_symbol_id = self._create_function_symbol_id(function_name, context) - - # Find function calls within this function - for call_node in self._walk_tree(node): - if call_node.type == 'call_expression': - target_function = self._get_call_target(call_node, context.content) - if target_function and target_function != function_name: - target_symbol_id = self._create_function_symbol_id(target_function, context) - yield Relationship( - source_symbol=function_symbol_id, - target_symbol=target_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - elif call_node.type == 'builtin_call_expr': - # Handle builtin functions like @import, @cInclude, etc. - builtin_name = self._get_builtin_name(call_node, context.content) - if builtin_name: - builtin_symbol_id = f"zig-builtin {builtin_name}" - yield Relationship( - source_symbol=function_symbol_id, - target_symbol=builtin_symbol_id, - relationship_type=InternalRelationshipType.CALLS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_import_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract import/dependency relationships.""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - file_symbol_id = self._create_file_symbol_id(context.file_path) - - for node in self._walk_tree(tree.root_node): - if node.type == 'builtin_call_expr': - builtin_name = self._get_builtin_name(node, context.content) - if builtin_name in ['@import', '@cImport', '@cInclude']: - import_path = self._get_import_path(node, context.content) - if import_path: - # Determine if it's a standard library, C library, or local import - if import_path.startswith('std'): - module_symbol_id = f"zig-std {import_path}" - elif builtin_name in ['@cImport', '@cInclude']: - module_symbol_id = f"c-lib {import_path}" - elif import_path.startswith('./') or import_path.startswith('../'): - module_symbol_id = f"local {import_path}" - else: - module_symbol_id = f"zig-external {import_path}" - - yield Relationship( - source_symbol=file_symbol_id, - target_symbol=module_symbol_id, - relationship_type=InternalRelationshipType.IMPORTS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_composition_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract composition relationships (struct fields, union fields).""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type in ['struct_declaration', 'union_declaration']: - container_name = self._get_container_name(node) - if not container_name: - continue - - container_symbol_id = self._create_container_symbol_id(container_name, node.type, context) - - # Find fields in this container - for field_node in self._walk_tree(node): - if field_node.type == 'container_field': - field_name = self._get_field_name(field_node, context.content) - if field_name: - field_symbol_id = self._create_field_symbol_id(field_name, container_symbol_id) - yield Relationship( - source_symbol=container_symbol_id, - target_symbol=field_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except Exception: - # Skip files with parsing errors - return - - def extract_interface_relationships(self, context: SCIPContext) -> Iterator[Relationship]: - """Extract interface relationships (Zig doesn't have interfaces, but has error sets and protocols).""" - try: - tree = self.parser.parse(bytes(context.content, 'utf8')) - - for node in self._walk_tree(tree.root_node): - if node.type == 'error_set_declaration': - error_set_name = self._get_error_set_name(node, context.content) - if not error_set_name: - continue - - error_set_symbol_id = self._create_error_set_symbol_id(error_set_name, context) - - # Find error values in this error set - for error_node in self._walk_tree(node): - if error_node.type == 'identifier': - error_name = self._get_node_text(error_node, context.content) - if error_name and error_name != error_set_name: - error_symbol_id = self._create_error_symbol_id(error_name, error_set_symbol_id) - yield Relationship( - source_symbol=error_set_symbol_id, - target_symbol=error_symbol_id, - relationship_type=InternalRelationshipType.CONTAINS - ) - - except Exception: - # Skip files with parsing errors - return - - def _walk_tree(self, node) -> Iterator: - """Walk tree-sitter tree nodes.""" - yield node - for child in node.children: - yield from self._walk_tree(child) - - def _get_node_text(self, node, content: str) -> str: - """Get text content of a tree-sitter node.""" - return content[node.start_byte:node.end_byte] - - def _get_struct_name(self, struct_node) -> Optional[str]: - """Extract struct name from struct declaration node.""" - for child in struct_node.children: - if child.type == 'identifier': - return child.text.decode('utf8') - return None - - def _get_function_name(self, function_node) -> Optional[str]: - """Extract function name from function declaration node.""" - for child in function_node.children: - if child.type == 'identifier': - return child.text.decode('utf8') - return None - - def _get_container_name(self, container_node) -> Optional[str]: - """Extract container name from struct/union declaration node.""" - for child in container_node.children: - if child.type == 'identifier': - return child.text.decode('utf8') - return None - - def _get_field_name(self, field_node, content: str) -> Optional[str]: - """Extract field name from container field node.""" - for child in field_node.children: - if child.type == 'identifier': - return self._get_node_text(child, content) - return None - - def _get_field_type(self, field_node, content: str) -> Optional[str]: - """Extract field type from container field node.""" - # Look for type information in the field - for child in field_node.children: - if child.type in ['type_expression', 'identifier']: - return self._get_node_text(child, content) - return None - - def _get_call_target(self, call_node, content: str) -> Optional[str]: - """Extract target function name from call expression.""" - for child in call_node.children: - if child.type == 'identifier': - return self._get_node_text(child, content) - elif child.type == 'field_expression': - # Handle method calls like obj.method() - for grandchild in child.children: - if grandchild.type == 'identifier': - return self._get_node_text(grandchild, content) - return None - - def _get_builtin_name(self, builtin_node, content: str) -> Optional[str]: - """Extract builtin function name from builtin call expression.""" - builtin_text = self._get_node_text(builtin_node, content) - if builtin_text.startswith('@'): - # Extract just the builtin name (e.g., "@import" from "@import(...)") - paren_index = builtin_text.find('(') - if paren_index > 0: - return builtin_text[:paren_index] - return builtin_text - return None - - def _get_import_path(self, import_node, content: str) -> Optional[str]: - """Extract import path from import expression.""" - # Look for string literal in the import call - for child in self._walk_tree(import_node): - if child.type == 'string_literal': - path_text = self._get_node_text(child, content) - # Remove quotes - return path_text.strip('"\'') - return None - - def _get_error_set_name(self, error_set_node, content: str) -> Optional[str]: - """Extract error set name from error set declaration.""" - for child in error_set_node.children: - if child.type == 'identifier': - return self._get_node_text(child, content) - return None - - def _is_custom_type(self, type_name: str) -> bool: - """Check if a type name represents a custom type (not a builtin).""" - builtin_types = { - 'i8', 'i16', 'i32', 'i64', 'i128', - 'u8', 'u16', 'u32', 'u64', 'u128', - 'f16', 'f32', 'f64', 'f128', - 'bool', 'void', 'noreturn', 'type', - 'anyerror', 'anyframe', 'anyopaque' - } - return type_name not in builtin_types and not type_name.startswith('*') - - def _create_struct_symbol_id(self, struct_name: str, context: SCIPContext) -> str: - """Create symbol ID for struct.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{struct_name}" if scope_path else struct_name - return f"local {local_id}#" - - def _create_function_symbol_id(self, function_name: str, context: SCIPContext) -> str: - """Create symbol ID for function.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{function_name}" if scope_path else function_name - return f"local {local_id}()." - - def _create_container_symbol_id(self, container_name: str, container_type: str, context: SCIPContext) -> str: - """Create symbol ID for struct/union container.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{container_name}" if scope_path else container_name - return f"local {local_id}#" - - def _create_type_symbol_id(self, type_name: str, context: SCIPContext) -> str: - """Create symbol ID for type.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{type_name}" if scope_path else type_name - return f"local {local_id}#" - - def _create_field_symbol_id(self, field_name: str, container_symbol_id: str) -> str: - """Create symbol ID for field.""" - # Extract container name from container symbol ID - container_name = container_symbol_id.replace("local ", "").replace("#", "") - return f"local {container_name}.{field_name}" - - def _create_error_set_symbol_id(self, error_set_name: str, context: SCIPContext) -> str: - """Create symbol ID for error set.""" - scope_path = ".".join(context.scope_stack) if context.scope_stack else "" - local_id = f"{scope_path}.{error_set_name}" if scope_path else error_set_name - return f"local {local_id}#" - - def _create_error_symbol_id(self, error_name: str, error_set_symbol_id: str) -> str: - """Create symbol ID for error value.""" - # Extract error set name from error set symbol ID - error_set_name = error_set_symbol_id.replace("local ", "").replace("#", "") - return f"local {error_set_name}.{error_name}" - - def _create_file_symbol_id(self, file_path: str) -> str: - """Create symbol ID for file.""" - return f"local {file_path}" \ No newline at end of file diff --git a/src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py b/src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py deleted file mode 100644 index 1b8fec0..0000000 --- a/src/code_index_mcp/scip/framework/zig/tree_sitter_analyzer.py +++ /dev/null @@ -1,357 +0,0 @@ -"""Zig tree-sitter analyzer implementation.""" - -from typing import Iterator, Optional, Set, List, Dict, Any -from ..types import SCIPContext -from ..base.language_analyzer import BaseLanguageAnalyzer - -import tree_sitter -from tree_sitter_zig import language as zig_language - - -class ZigTreeSitterAnalyzer(BaseLanguageAnalyzer): - """Zig analyzer using tree-sitter for AST parsing.""" - - def __init__(self): - """Initialize the Zig tree-sitter analyzer.""" - lang = tree_sitter.Language(zig_language()) - self.parser = tree_sitter.Parser(lang) - self._processed_nodes: Set[int] = set() - - def parse(self, content: str, filename: str = ""): - """Parse Zig source code into tree-sitter AST.""" - try: - return self.parser.parse(bytes(content, 'utf8')) - except Exception as e: - raise SyntaxError(f"Zig syntax error in {filename}: {e}") - - def walk(self, tree) -> Iterator: - """Walk tree-sitter tree nodes, avoiding duplicates.""" - for node in self._walk_node(tree.root_node): - node_id = id(node) - if node_id not in self._processed_nodes: - self._processed_nodes.add(node_id) - yield node - - def _walk_node(self, node) -> Iterator: - """Recursively walk tree nodes.""" - yield node - for child in node.children: - yield from self._walk_node(child) - - def is_symbol_definition(self, node) -> bool: - """Check if tree-sitter node represents a symbol definition.""" - return node.type in { - 'function_declaration', - 'struct_declaration', - 'union_declaration', - 'enum_declaration', - 'variable_declaration', - 'constant_declaration', - 'type_declaration', - 'container_field', - 'parameter_declaration', - 'test_declaration', - 'comptime_declaration', - 'error_set_declaration', - } - - def is_symbol_reference(self, node) -> bool: - """Check if tree-sitter node represents a symbol reference.""" - return node.type in { - 'identifier', - 'call_expression', - 'field_expression', - 'builtin_call_expr', - } - - def get_symbol_name(self, node) -> Optional[str]: - """Extract symbol name from tree-sitter node.""" - if node.type in ['function_declaration', 'struct_declaration', 'union_declaration', - 'enum_declaration', 'variable_declaration', 'constant_declaration', - 'type_declaration', 'test_declaration', 'comptime_declaration']: - # Look for identifier child - for child in node.children: - if child.type == 'identifier': - return child.text.decode('utf8') - - elif node.type == 'container_field': - # Field in struct/union/enum - for child in node.children: - if child.type == 'identifier': - return child.text.decode('utf8') - - elif node.type == 'parameter_declaration': - # Function parameter - for child in node.children: - if child.type == 'identifier': - return child.text.decode('utf8') - - elif node.type == 'identifier': - return node.text.decode('utf8') - - return None - - def get_node_position(self, node) -> tuple: - """Get position information from tree-sitter node.""" - start_line = node.start_point[0] - start_col = node.start_point[1] - end_line = node.end_point[0] - end_col = node.end_point[1] - - return (start_line, start_col, end_line, end_col) - - def extract_function_info(self, tree) -> List[Dict[str, Any]]: - """Extract function information from the AST.""" - functions = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'function_declaration': - function_info = { - 'name': self.get_symbol_name(node), - 'type': 'function', - 'position': self.get_node_position(node), - 'is_public': self._is_public_function(node), - 'is_extern': self._is_extern_function(node), - 'return_type': self._extract_return_type(node), - 'parameters': self._extract_function_parameters(node), - } - functions.append(function_info) - - return functions - - def extract_struct_info(self, tree) -> List[Dict[str, Any]]: - """Extract struct information from the AST.""" - structs = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'struct_declaration': - struct_info = { - 'name': self.get_symbol_name(node), - 'type': 'struct', - 'position': self.get_node_position(node), - 'is_public': self._is_public_declaration(node), - 'fields': self._extract_struct_fields(node), - } - structs.append(struct_info) - - return structs - - def extract_union_info(self, tree) -> List[Dict[str, Any]]: - """Extract union information from the AST.""" - unions = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'union_declaration': - union_info = { - 'name': self.get_symbol_name(node), - 'type': 'union', - 'position': self.get_node_position(node), - 'is_public': self._is_public_declaration(node), - 'fields': self._extract_union_fields(node), - } - unions.append(union_info) - - return unions - - def extract_enum_info(self, tree) -> List[Dict[str, Any]]: - """Extract enum information from the AST.""" - enums = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'enum_declaration': - enum_info = { - 'name': self.get_symbol_name(node), - 'type': 'enum', - 'position': self.get_node_position(node), - 'is_public': self._is_public_declaration(node), - 'values': self._extract_enum_values(node), - } - enums.append(enum_info) - - return enums - - def extract_variable_info(self, tree) -> List[Dict[str, Any]]: - """Extract variable information from the AST.""" - variables = [] - - for node in self._walk_node(tree.root_node): - if node.type in ['variable_declaration', 'constant_declaration']: - variable_info = { - 'name': self.get_symbol_name(node), - 'type': 'constant' if node.type == 'constant_declaration' else 'variable', - 'position': self.get_node_position(node), - 'is_public': self._is_public_declaration(node), - 'variable_type': self._extract_variable_type(node), - 'is_mutable': node.type == 'variable_declaration', - } - variables.append(variable_info) - - return variables - - def extract_test_info(self, tree) -> List[Dict[str, Any]]: - """Extract test declaration information from the AST.""" - tests = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'test_declaration': - test_info = { - 'name': self.get_symbol_name(node) or self._extract_test_name(node), - 'type': 'test', - 'position': self.get_node_position(node), - } - tests.append(test_info) - - return tests - - def extract_import_statements(self, tree) -> List[str]: - """Extract import statements from the AST.""" - imports = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'builtin_call_expr': - builtin_text = node.text.decode('utf8') - if builtin_text.startswith('@import'): - import_path = self._extract_import_path(node) - if import_path: - imports.append(import_path) - - return imports - - def extract_error_set_info(self, tree) -> List[Dict[str, Any]]: - """Extract error set information from the AST.""" - error_sets = [] - - for node in self._walk_node(tree.root_node): - if node.type == 'error_set_declaration': - error_set_info = { - 'name': self.get_symbol_name(node), - 'type': 'error_set', - 'position': self.get_node_position(node), - 'errors': self._extract_error_values(node), - } - error_sets.append(error_set_info) - - return error_sets - - def _is_public_declaration(self, node) -> bool: - """Check if a declaration is public.""" - # Look for 'pub' keyword in parent or siblings - parent = node.parent - if parent: - for child in parent.children: - if child.type == 'keyword' and child.text.decode('utf8') == 'pub': - return True - return False - - def _is_public_function(self, node) -> bool: - """Check if a function is public.""" - return self._is_public_declaration(node) - - def _is_extern_function(self, node) -> bool: - """Check if a function is extern.""" - # Look for 'extern' keyword - parent = node.parent - if parent: - for child in parent.children: - if child.type == 'keyword' and child.text.decode('utf8') == 'extern': - return True - return False - - def _extract_return_type(self, function_node) -> Optional[str]: - """Extract return type from function declaration.""" - # Look for return type after the parameter list - for child in function_node.children: - if child.type in ['type_expression', 'identifier']: - return child.text.decode('utf8') - return None - - def _extract_function_parameters(self, function_node) -> List[Dict[str, str]]: - """Extract parameter information from function declaration.""" - parameters = [] - - for child in function_node.children: - if child.type == 'parameter_list': - for param_child in child.children: - if param_child.type == 'parameter_declaration': - param_name = self.get_symbol_name(param_child) - param_type = self._extract_parameter_type(param_child) - if param_name: - parameters.append({ - 'name': param_name, - 'type': param_type or 'unknown' - }) - - return parameters - - def _extract_parameter_type(self, param_node) -> Optional[str]: - """Extract parameter type from parameter declaration.""" - for child in param_node.children: - if child.type in ['type_expression', 'identifier']: - return child.text.decode('utf8') - return None - - def _extract_struct_fields(self, struct_node) -> List[str]: - """Extract field names from struct declaration.""" - fields = [] - - for child in struct_node.children: - if child.type == 'container_declaration': - for field_child in child.children: - if field_child.type == 'container_field': - field_name = self.get_symbol_name(field_child) - if field_name: - fields.append(field_name) - - return fields - - def _extract_union_fields(self, union_node) -> List[str]: - """Extract field names from union declaration.""" - return self._extract_struct_fields(union_node) # Same logic - - def _extract_enum_values(self, enum_node) -> List[str]: - """Extract enum value names from enum declaration.""" - values = [] - - for child in enum_node.children: - if child.type == 'container_declaration': - for value_child in child.children: - if value_child.type == 'container_field': - value_name = self.get_symbol_name(value_child) - if value_name: - values.append(value_name) - - return values - - def _extract_variable_type(self, var_node) -> Optional[str]: - """Extract variable type from variable declaration.""" - for child in var_node.children: - if child.type in ['type_expression', 'identifier']: - return child.text.decode('utf8') - return None - - def _extract_test_name(self, test_node) -> Optional[str]: - """Extract test name from test declaration.""" - # Test name is usually in a string literal - for child in test_node.children: - if child.type == 'string_literal': - return child.text.decode('utf8').strip('"\'') - return None - - def _extract_import_path(self, import_node) -> Optional[str]: - """Extract import path from @import call.""" - for child in self._walk_node(import_node): - if child.type == 'string_literal': - return child.text.decode('utf8').strip('"\'') - return None - - def _extract_error_values(self, error_set_node) -> List[str]: - """Extract error values from error set declaration.""" - errors = [] - - for child in error_set_node.children: - if child.type == 'error_set': - for error_child in child.children: - if error_child.type == 'identifier': - errors.append(error_child.text.decode('utf8')) - - return errors \ No newline at end of file diff --git a/src/code_index_mcp/scip/language_manager.py b/src/code_index_mcp/scip/language_manager.py deleted file mode 100644 index 118ad73..0000000 --- a/src/code_index_mcp/scip/language_manager.py +++ /dev/null @@ -1,522 +0,0 @@ -"""SCIP Language Manager - Direct factory management without strategy layer.""" - -import logging -import os -from pathlib import Path -from typing import Dict, List, Optional, Set, Callable, Any - -from .framework.types import SCIPContext -from .framework.base.index_factory import SCIPIndexFactory -from .proto import scip_pb2 - -# Import all language factory creators -from .framework.python import create_python_scip_factory -from .framework.javascript import create_javascript_scip_factory -from .framework.java import create_java_scip_factory -from .framework.objective_c import create_objective_c_scip_factory -from .framework.zig import create_zig_scip_factory -from .framework.fallback import create_fallback_scip_factory - -logger = logging.getLogger(__name__) - - -class LanguageNotSupportedException(Exception): - """Exception raised when a language is not supported.""" - pass - - -class SCIPLanguageManager: - """ - Direct language management for SCIP indexing without strategy abstraction layer. - - This manager directly handles language detection, factory selection, and file processing - without the overhead of the strategy pattern. It provides a cleaner, more efficient - approach to SCIP index generation. - """ - - def __init__(self, project_root: str): - """Initialize the language manager for a specific project.""" - self.project_root = project_root - - # Language factory creators mapping - self._factory_creators: Dict[str, Callable[[str], SCIPIndexFactory]] = { - 'python': create_python_scip_factory, - 'javascript': create_javascript_scip_factory, - 'typescript': create_javascript_scip_factory, # Same as JavaScript - 'java': create_java_scip_factory, - 'objective_c': create_objective_c_scip_factory, - 'zig': create_zig_scip_factory, - 'fallback': create_fallback_scip_factory - } - - # Language priority for detection conflicts - self._language_priority = { - 'python': 90, - 'javascript': 85, - 'typescript': 85, - 'java': 80, - 'objective_c': 75, - 'zig': 70, - 'fallback': 10 # Always lowest priority - } - - # Extension to language mapping - self._extension_mapping = { - # Python - '.py': 'python', - '.pyw': 'python', - '.pyx': 'python', - '.pyi': 'python', - - # JavaScript/TypeScript - '.js': 'javascript', - '.jsx': 'javascript', - '.mjs': 'javascript', - '.cjs': 'javascript', - '.ts': 'typescript', - '.tsx': 'typescript', - - # Java - '.java': 'java', - - # Objective-C - '.m': 'objective_c', - '.mm': 'objective_c', - '.h': 'objective_c', # Could be C/C++ too, but we'll handle with priority - - # Zig - '.zig': 'zig', - '.zon': 'zig', - } - - # Factory cache to avoid recreating - self._factory_cache: Dict[str, SCIPIndexFactory] = {} - - logger.info(f"Initialized SCIP Language Manager for project: {project_root}") - logger.info(f"Supported languages: {list(self._factory_creators.keys())}") - - def detect_language(self, file_path: str) -> str: - """ - Detect the programming language for a given file. - - Args: - file_path: Path to the file - - Returns: - Language identifier string - """ - extension = Path(file_path).suffix.lower() - - # Direct mapping for most cases - if extension in self._extension_mapping: - return self._extension_mapping[extension] - - # Special handling for ambiguous extensions - if extension == '.h': - # Could be C, C++, or Objective-C - # For now, default to objective_c, but could add content-based detection - return 'objective_c' - - # Default to fallback for unknown extensions - return 'fallback' - - def get_factory(self, language: str) -> SCIPIndexFactory: - """ - Get or create a factory for the specified language. - - Args: - language: Language identifier - - Returns: - SCIP Index Factory for the language - - Raises: - LanguageNotSupportedException: If language is not supported - """ - if language not in self._factory_creators: - raise LanguageNotSupportedException(f"Language '{language}' is not supported") - - # Check cache first - if language not in self._factory_cache: - factory_creator = self._factory_creators[language] - self._factory_cache[language] = factory_creator(self.project_root) - logger.debug(f"Created new {language} factory for project {self.project_root}") - - return self._factory_cache[language] - - def get_factory_for_file(self, file_path: str) -> SCIPIndexFactory: - """ - Get the appropriate factory for a specific file. - - Args: - file_path: Path to the file - - Returns: - SCIP Index Factory for the file's language - """ - language = self.detect_language(file_path) - return self.get_factory(language) - - def process_file(self, file_path: str) -> Optional[scip_pb2.Document]: - """ - Process a single file and generate SCIP document. - - Args: - file_path: Path to the file to process - - Returns: - SCIP Document or None if processing failed - """ - try: - # Get appropriate factory - factory = self.get_factory_for_file(file_path) - - # Read file content - content = self._read_file_content(file_path) - if not content: - return None - - # Create context - relative_path = os.path.relpath(file_path, self.project_root) - context = SCIPContext( - file_path=relative_path, - content=content, - scope_stack=[], - imports={} - ) - - # Generate document - document = factory.create_document(file_path, content) - - if document: - logger.debug(f"Successfully processed {relative_path} with {len(document.symbols)} symbols") - - return document - - except Exception as e: - logger.error(f"Failed to process file {file_path}: {e}") - return None - - def process_files(self, file_paths: List[str]) -> List[scip_pb2.Document]: - """ - Process multiple files and generate SCIP documents. - - Args: - file_paths: List of file paths to process - - Returns: - List of SCIP Documents - """ - documents = [] - processed_count = 0 - error_count = 0 - - # Group files by language for efficiency - files_by_language = self._group_files_by_language(file_paths) - - for language, files in files_by_language.items(): - if not files: - continue - - logger.info(f"Processing {len(files)} {language} files") - - try: - factory = self.get_factory(language) - - for i, file_path in enumerate(files, 1): - document = self.process_file(file_path) - if document: - documents.append(document) - processed_count += 1 - else: - error_count += 1 - - # Progress logging - if i % 10 == 0 or i == len(files): - relative_path = os.path.relpath(file_path, self.project_root) - logger.debug(f"{language} progress: {i}/{len(files)} files, last: {relative_path}") - - except Exception as e: - logger.error(f"Failed to process {language} files: {e}") - error_count += len(files) - continue - - logger.info(f"Processing complete: {processed_count} documents generated, {error_count} errors") - return documents - - def create_complete_index(self, file_paths: Optional[List[str]] = None) -> scip_pb2.Index: - """ - Create a complete SCIP index for the project. - - Args: - file_paths: Optional list of specific files to process. If None, auto-discover. - - Returns: - Complete SCIP Index - """ - if file_paths is None: - file_paths = self._discover_project_files() - - logger.info(f"Creating complete SCIP index for {len(file_paths)} files") - - # Create index with metadata - index = scip_pb2.Index() - - # Use any factory to create metadata (they should be consistent) - try: - fallback_factory = self.get_factory('fallback') - index.metadata.CopyFrom(fallback_factory.create_metadata(self.project_root)) - except Exception as e: - logger.warning(f"Failed to create metadata: {e}") - - # Process all files - documents = self.process_files(file_paths) - index.documents.extend(documents) - - # Extract external symbols - all_external_symbols = [] - files_by_language = self._group_files_by_language(file_paths) - - for language, files in files_by_language.items(): - if not files: - continue - - try: - factory = self.get_factory(language) - language_documents = [doc for doc in documents if self._get_document_language(doc) == language] - external_symbols = factory.extract_external_symbols(language_documents) - all_external_symbols.extend(external_symbols) - except Exception as e: - logger.warning(f"Failed to extract external symbols for {language}: {e}") - - index.external_symbols.extend(all_external_symbols) - - # Build cross-document relationships after all documents are processed - logger.info("Building cross-document relationships...") - self._build_cross_document_relationships(index) - - logger.info(f"Complete index created with {len(documents)} documents and {len(all_external_symbols)} external symbols") - return index - - def _build_cross_document_relationships(self, index: scip_pb2.Index) -> None: - """ - Build cross-document relationships using language-specific processing. - - This method delegates relationship building to individual language factories - to handle language-specific module systems and import semantics correctly. - """ - logger.info("Building cross-document relationships using language-specific processing...") - - # Group documents by language for language-specific processing - files_by_language = self._group_documents_by_language(index.documents) - - total_relationships_added = 0 - - for language, documents in files_by_language.items(): - if not documents: - continue - - try: - logger.info(f"Processing cross-document relationships for {len(documents)} {language} files") - factory = self.get_factory(language) - - # Delegate to language-specific implementation - relationships_added = factory.build_cross_document_relationships(documents, index) - total_relationships_added += relationships_added - - logger.info(f"Added {relationships_added} relationships for {language} files") - - except Exception as e: - logger.warning(f"Failed to build cross-document relationships for {language}: {e}") - # Fallback to legacy unified processing for this language - self._build_cross_document_relationships_legacy(index, documents) - - logger.info(f"Total cross-document relationships added: {total_relationships_added}") - - def _build_cross_document_relationships_legacy(self, index: scip_pb2.Index, documents_filter: List[scip_pb2.Document] = None) -> None: - """ - Legacy unified cross-document relationship building as fallback. - - This is the original implementation kept for fallback purposes. - """ - logger.info("Using legacy cross-document relationship building") - - # Use provided documents or all documents in index - documents_to_process = documents_filter if documents_filter else index.documents - - # Step 1: Build global symbol registry - symbol_registry = {} - for doc in documents_to_process: - for symbol_info in doc.symbols: - symbol_id = symbol_info.symbol - symbol_registry[symbol_id] = (doc, symbol_info) - - # Also register without suffix for function symbols - if symbol_info.kind == 11: # SymbolKind.Function - if symbol_id.endswith('().'): - base_id = symbol_id[:-3] # Remove '().' - symbol_registry[base_id] = (doc, symbol_info) - - logger.debug(f"Built legacy symbol registry with {len(symbol_registry)} entries") - - # Step 2: Analyze occurrences to build relationships - relationships_added = 0 - for source_doc in documents_to_process: - for occurrence in source_doc.occurrences: - # Skip if not a reference (we want ReadAccess = 8) - if not (occurrence.symbol_roles & 8): - continue - - # Skip if it's also a definition (Definition = 1) - if occurrence.symbol_roles & 1: - continue - - target_symbol_id = occurrence.symbol - - # Find the target symbol being referenced - target_entry = symbol_registry.get(target_symbol_id) - if not target_entry: - continue - - target_doc, target_symbol_info = target_entry - - # Skip self-references within same symbol - source_symbol_id = self._find_containing_symbol(occurrence, source_doc) - if not source_symbol_id or source_symbol_id == target_symbol_id: - continue - - # Create relationship (target is called by source) - # Only add if it's a function being called - if target_symbol_info.kind == 11: # SymbolKind.Function - relationship = scip_pb2.Relationship() - relationship.symbol = source_symbol_id - relationship.is_reference = True - relationship.is_implementation = False - relationship.is_type_definition = False - relationship.is_definition = False - - # Check if this relationship already exists to avoid duplicates - already_exists = any( - rel.symbol == source_symbol_id - for rel in target_symbol_info.relationships - ) - - if not already_exists: - target_symbol_info.relationships.append(relationship) - relationships_added += 1 - - logger.info(f"Added {relationships_added} legacy cross-document relationships") - - def _find_containing_symbol(self, occurrence, document) -> Optional[str]: - """ - Find which symbol contains this occurrence based on position. - - Args: - occurrence: The occurrence to locate - document: The document containing the occurrence - - Returns: - Symbol ID of the containing symbol, or None if not found - """ - if not occurrence.range or not occurrence.range.start: - return None - - occurrence_line = occurrence.range.start[0] if len(occurrence.range.start) > 0 else 0 - - # Find the symbol that contains this occurrence - best_symbol = None - for symbol_info in document.symbols: - # We need to determine if the occurrence is within this symbol's scope - # This is a simplified approach - ideally we'd have proper scope ranges - # For now, we'll use a heuristic based on symbol type - - # If it's a module-level symbol (no parent), it could contain the occurrence - if not best_symbol: - best_symbol = symbol_info.symbol - - # If no containing symbol found, use file-level context - if not best_symbol and document.relative_path: - file_name = document.relative_path.replace('\\', '/').split('/')[-1] - return f"local {file_name}#" - - return best_symbol - - def get_supported_languages(self) -> Set[str]: - """Get all supported languages.""" - return set(self._factory_creators.keys()) - - def get_language_statistics(self, file_paths: List[str]) -> Dict[str, int]: - """Get statistics about language distribution in file list.""" - stats = {} - for file_path in file_paths: - language = self.detect_language(file_path) - stats[language] = stats.get(language, 0) + 1 - return stats - - def _read_file_content(self, file_path: str) -> Optional[str]: - """Read file content safely.""" - try: - with open(file_path, 'r', encoding='utf-8', errors='replace') as f: - return f.read() - except Exception as e: - logger.warning(f"Failed to read file {file_path}: {e}") - return None - - def _group_files_by_language(self, file_paths: List[str]) -> Dict[str, List[str]]: - """Group files by their detected language.""" - groups = {} - for file_path in file_paths: - language = self.detect_language(file_path) - if language not in groups: - groups[language] = [] - groups[language].append(file_path) - return groups - - def _group_documents_by_language(self, documents: List[scip_pb2.Document]) -> Dict[str, List[scip_pb2.Document]]: - """Group SCIP documents by their language.""" - groups = {} - for doc in documents: - language = self._get_document_language(doc) - if language not in groups: - groups[language] = [] - groups[language].append(doc) - return groups - - def _discover_project_files(self) -> List[str]: - """Auto-discover files in the project directory.""" - files = [] - project_path = Path(self.project_root) - - # Common exclude patterns - exclude_patterns = { - '.git', '__pycache__', 'node_modules', '.vscode', '.idea', - '.pytest_cache', '.mypy_cache', 'dist', 'build' - } - - for file_path in project_path.rglob('*'): - if file_path.is_file(): - # Skip excluded directories - if any(part in exclude_patterns for part in file_path.parts): - continue - - # Only include files with known extensions or force fallback - extension = file_path.suffix.lower() - if extension in self._extension_mapping or extension: - files.append(str(file_path)) - - logger.info(f"Discovered {len(files)} files in project") - return files - - def _get_document_language(self, document: scip_pb2.Document) -> str: - """Extract language from document.""" - if hasattr(document, 'language') and document.language: - return document.language - - # Fallback: detect from file path - return self.detect_language(document.relative_path) if document.relative_path else 'fallback' - - -# Convenience function for quick usage -def create_language_manager(project_root: str) -> SCIPLanguageManager: - """Create a new SCIP Language Manager for the given project.""" - return SCIPLanguageManager(project_root) \ No newline at end of file diff --git a/src/code_index_mcp/scip/proto/__init__.py b/src/code_index_mcp/scip/proto/__init__.py deleted file mode 100644 index 479e6fc..0000000 --- a/src/code_index_mcp/scip/proto/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""SCIP Protocol Buffer definitions and utilities.""" \ No newline at end of file diff --git a/src/code_index_mcp/scip/proto/scip.proto b/src/code_index_mcp/scip/proto/scip.proto deleted file mode 100644 index 9519306..0000000 --- a/src/code_index_mcp/scip/proto/scip.proto +++ /dev/null @@ -1,265 +0,0 @@ -// SCIP (Source Code Intelligence Protocol) schema definition. -// This is a direct copy from: https://github.com/sourcegraph/scip/blob/main/scip.proto - -syntax = "proto3"; - -package scip; - -option go_package = "github.com/sourcegraph/scip/bindings/go/scip/v1"; -option java_package = "com.sourcegraph.scip_java"; - -// An Index message payload represents a complete SCIP index for a workspace -// rooted at a single directory. An Index payload may have a large memory -// footprint and it's recommended to emit and consume an Index payload one -// field value at a time. To permit such streaming usage, the `metadata` and -// `documents` fields should preferably come first and each `documents` field -// should be emitted as a separate message. -// -// To reduce the memory footprint of Index messages, all Symbol values that -// are referenced from `documents` should be de-duplicated and stored in the -// `external_symbols` field. When consuming Index messages, the client should -// construct a symbol table from these `external_symbols` to correctly resolve -// Symbol references that appear in `documents`. -message Index { - Metadata metadata = 1; - repeated Document documents = 2; - repeated SymbolInformation external_symbols = 3; -} - -// ProtocolVersion specifies the protocol version that should be used to -// interpret this SCIP index. Different versions of the protocol may not -// be backwards compatible with each other. -enum ProtocolVersion { - UnspecifiedProtocolVersion = 0; -} - -// Metadata contains information about the producer of the SCIP index. -message Metadata { - ProtocolVersion version = 1; - ToolInfo tool_info = 2; - string project_root = 3; - TextDocumentEncoding text_document_encoding = 4; -} - -enum TextDocumentEncoding { - UnspecifiedTextDocumentEncoding = 0; - // Use UTF-8 encoding where a 'character' corresponds to a Unicode scalar - // value and a 'character offset' corresponds to a byte offset in the - // underlying byte array. - UTF8 = 1; - // Use UTF-16 encoding where a 'character' corresponds to a Unicode code unit - // (which may be a high or low surrogate), and a 'character offset' - // corresponds to the Unicode code unit offset in the underlying byte array. - UTF16 = 2; - // Use UTF-32 encoding where a 'character' corresponds to a Unicode scalar - // value and a 'character offset' corresponds to a byte offset in the - // underlying byte array. - UTF32 = 3; -} - -// Information about the tool that produced the SCIP index. -message ToolInfo { - string name = 1; - string version = 2; - repeated string arguments = 3; -} - -// A Document represents the metadata about one source file on disk. -message Document { - string relative_path = 1; - string language = 2; - repeated Occurrence occurrences = 3; - repeated SymbolInformation symbols = 4; - // Optional: the text contents of this document. - string text = 5; - // Used to indicate the encoding used for the text. Should be UTF-8 - // if unspecified, to be compatible with editors and the JVM ecosystem. - PositionEncoding position_encoding = 6; -} - -enum PositionEncoding { - UnspecifiedPositionEncoding = 0; - // The position encoding where columns are measured in UTF-8 byte - // offsets. This is the default encoding if unspecified. - UTF8Bytes = 1; - // The position encoding where columns are measured in UTF-16 code - // units. This encoding is supported by the Language Server Protocol - // and is part of many Microsoft/web ecosystems. - UTF16CodeUnits = 2; - // The position encoding where columns are measured in UTF-32 Unicode - // scalar values (also known as Unicode codepoints). This encoding is - // supported by some text editors like Emacs and the Neovim ecosystem. - UTF32CodeUnits = 3; -} - -// An Occurrence associates source positions with symbols. -message Occurrence { - Range range = 1; - string symbol = 2; - int32 symbol_roles = 3; - SyntaxKind syntax_kind = 4; - repeated Diagnostic diagnostics = 5; - repeated string enclosing_range = 6; -} - -enum SyntaxKind { - UnspecifiedSyntaxKind = 0; - Comment = 1; - PunctuationDelimiter = 2; - PunctuationBracket = 3; - Keyword = 4; - // IdentifierKeyword corresponds to identifiers that are treated as keywords. - // This is needed for languages such as Go where built-in functions like - // `println` are identifiers but have special meaning. - IdentifierKeyword = 5; - IdentifierOperator = 6; - Identifier = 7; - IdentifierBuiltin = 8; - IdentifierNull = 9; - IdentifierConstant = 10; - IdentifierMutableGlobal = 11; - IdentifierParameter = 12; - IdentifierLocal = 13; - IdentifierShadowed = 14; - IdentifierNamespace = 15; - IdentifierFunction = 16; - IdentifierFunctionDefinition = 17; - IdentifierMacro = 18; - IdentifierMacroDefinition = 19; - IdentifierType = 20; - IdentifierBuiltinType = 21; - IdentifierAttribute = 22; - RegexEscape = 23; - RegexRepeated = 24; - RegexWildcard = 25; - RegexDelimiter = 26; - RegexJoin = 27; - StringLiteral = 28; - StringLiteralEscape = 29; - StringLiteralSpecial = 30; - StringLiteralKey = 31; - CharacterLiteral = 32; - NumericLiteral = 33; - BooleanLiteral = 34; - Tag = 35; - TagAttribute = 36; - TagDelimiter = 37; -} - -// A Range represents source positions. -message Range { - repeated int32 start = 1; - repeated int32 end = 2; -} - -// A Diagnostic is a message associated with source positions. -message Diagnostic { - Severity severity = 1; - string code = 2; - string message = 3; - string source = 4; - repeated DiagnosticTag tags = 5; -} - -enum Severity { - UnspecifiedSeverity = 0; - Error = 1; - Warning = 2; - Information = 3; - Hint = 4; -} - -enum DiagnosticTag { - UnspecifiedDiagnosticTag = 0; - Unnecessary = 1; - Deprecated = 2; -} - -// SymbolInformation provides rich metadata about symbols in the index. -message SymbolInformation { - string symbol = 1; - repeated string documentation = 2; - repeated Relationship relationships = 3; - SymbolKind kind = 4; - string display_name = 5; - string signature_documentation = 6; - repeated string enclosing_symbol = 7; -} - -enum SymbolKind { - UnspecifiedSymbolKind = 0; - Array = 1; - Boolean = 2; - Class = 3; - Constant = 4; - Constructor = 5; - Enum = 6; - EnumMember = 7; - Event = 8; - Field = 9; - File = 10; - Function = 11; - Interface = 12; - Key = 13; - Method = 14; - Module = 15; - Namespace = 16; - Null = 17; - Number = 18; - Object = 19; - Operator = 20; - Package = 21; - Parameter = 22; - Property = 23; - String = 24; - Struct = 25; - TypeParameter = 26; - Unit = 27; - Value = 28; - Variable = 29; - // Language-specific symbol kinds. Use the `display_name` field to give - // the symbol a generic name. - AssociatedType = 30; - SelfParameter = 31; - UnknownKind = 32; - Trait = 33; - Union = 34; - Macro = 35; -} - -// Represents a relationship between symbols. -message Relationship { - string symbol = 1; - bool is_reference = 2; - bool is_implementation = 3; - bool is_type_definition = 4; - bool is_definition = 5; -} - -// Symbol roles encode the relationship a symbol has to its containing document. -// A symbol can have multiple roles. For example, a function that is being defined -// can have both the `definition` role and the `import` role if it's imported from -// another package. -enum SymbolRole { - UnspecifiedSymbolRole = 0; - // Is the symbol defined here? If yes, this is considered a symbol definition. - Definition = 1; - // Is the symbol imported here? For example, the symbol `fmt` is imported in - // the Go code `import "fmt"`. - Import = 2; - // Is the symbol written here? For example, the symbol `variable` is written - // in the Go code `variable := value`. - Write = 4; - // Is the symbol read here? This is the default role for a symbol that is - // being referenced. - Read = 8; - // Is the symbol generated here? For example, in the Go code `type Foo struct { Name string }`, - // the symbol `Name` has the role `Generated | Read` for the getter function `func (x Foo) Name() string`. - Generated = 16; - // Is the symbol tested here? For example, in the Go code `func TestSomething(t *testing.T) { t.Errorf("got %s") }`, - // the symbols `TestSomething` and `t.Errorf` have the role `Test`. - Test = 32; - // Is the symbol for a type reference? For example, in the Go code `var x []User`, - // the symbol `User` has the role `Type | Read`. - Type = 64; -} \ No newline at end of file diff --git a/src/code_index_mcp/scip/proto/scip_pb2.py b/src/code_index_mcp/scip/proto/scip_pb2.py deleted file mode 100644 index 06f63d9..0000000 --- a/src/code_index_mcp/scip/proto/scip_pb2.py +++ /dev/null @@ -1,69 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# NO CHECKED-IN PROTOBUF GENCODE -# source: code_index_mcp/scip/proto/scip.proto -# Protobuf Python Version: 6.31.1 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import runtime_version as _runtime_version -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder -_runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, - 6, - 31, - 1, - '', - 'code_index_mcp/scip/proto/scip.proto' -) -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n$code_index_mcp/scip/proto/scip.proto\x12\x04scip\"\x7f\n\x05Index\x12 \n\x08metadata\x18\x01 \x01(\x0b\x32\x0e.scip.Metadata\x12!\n\tdocuments\x18\x02 \x03(\x0b\x32\x0e.scip.Document\x12\x31\n\x10\x65xternal_symbols\x18\x03 \x03(\x0b\x32\x17.scip.SymbolInformation\"\xa7\x01\n\x08Metadata\x12&\n\x07version\x18\x01 \x01(\x0e\x32\x15.scip.ProtocolVersion\x12!\n\ttool_info\x18\x02 \x01(\x0b\x32\x0e.scip.ToolInfo\x12\x14\n\x0cproject_root\x18\x03 \x01(\t\x12:\n\x16text_document_encoding\x18\x04 \x01(\x0e\x32\x1a.scip.TextDocumentEncoding\"<\n\x08ToolInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12\x11\n\targuments\x18\x03 \x03(\t\"\xc5\x01\n\x08\x44ocument\x12\x15\n\rrelative_path\x18\x01 \x01(\t\x12\x10\n\x08language\x18\x02 \x01(\t\x12%\n\x0boccurrences\x18\x03 \x03(\x0b\x32\x10.scip.Occurrence\x12(\n\x07symbols\x18\x04 \x03(\x0b\x32\x17.scip.SymbolInformation\x12\x0c\n\x04text\x18\x05 \x01(\t\x12\x31\n\x11position_encoding\x18\x06 \x01(\x0e\x32\x16.scip.PositionEncoding\"\xb5\x01\n\nOccurrence\x12\x1a\n\x05range\x18\x01 \x01(\x0b\x32\x0b.scip.Range\x12\x0e\n\x06symbol\x18\x02 \x01(\t\x12\x14\n\x0csymbol_roles\x18\x03 \x01(\x05\x12%\n\x0bsyntax_kind\x18\x04 \x01(\x0e\x32\x10.scip.SyntaxKind\x12%\n\x0b\x64iagnostics\x18\x05 \x03(\x0b\x32\x10.scip.Diagnostic\x12\x17\n\x0f\x65nclosing_range\x18\x06 \x03(\t\"#\n\x05Range\x12\r\n\x05start\x18\x01 \x03(\x05\x12\x0b\n\x03\x65nd\x18\x02 \x03(\x05\"\x80\x01\n\nDiagnostic\x12 \n\x08severity\x18\x01 \x01(\x0e\x32\x0e.scip.Severity\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\x12\x0e\n\x06source\x18\x04 \x01(\t\x12!\n\x04tags\x18\x05 \x03(\x0e\x32\x13.scip.DiagnosticTag\"\xd6\x01\n\x11SymbolInformation\x12\x0e\n\x06symbol\x18\x01 \x01(\t\x12\x15\n\rdocumentation\x18\x02 \x03(\t\x12)\n\rrelationships\x18\x03 \x03(\x0b\x32\x12.scip.Relationship\x12\x1e\n\x04kind\x18\x04 \x01(\x0e\x32\x10.scip.SymbolKind\x12\x14\n\x0c\x64isplay_name\x18\x05 \x01(\t\x12\x1f\n\x17signature_documentation\x18\x06 \x01(\t\x12\x18\n\x10\x65nclosing_symbol\x18\x07 \x03(\t\"\x82\x01\n\x0cRelationship\x12\x0e\n\x06symbol\x18\x01 \x01(\t\x12\x14\n\x0cis_reference\x18\x02 \x01(\x08\x12\x19\n\x11is_implementation\x18\x03 \x01(\x08\x12\x1a\n\x12is_type_definition\x18\x04 \x01(\x08\x12\x15\n\ris_definition\x18\x05 \x01(\x08*1\n\x0fProtocolVersion\x12\x1e\n\x1aUnspecifiedProtocolVersion\x10\x00*[\n\x14TextDocumentEncoding\x12#\n\x1fUnspecifiedTextDocumentEncoding\x10\x00\x12\x08\n\x04UTF8\x10\x01\x12\t\n\x05UTF16\x10\x02\x12\t\n\x05UTF32\x10\x03*j\n\x10PositionEncoding\x12\x1f\n\x1bUnspecifiedPositionEncoding\x10\x00\x12\r\n\tUTF8Bytes\x10\x01\x12\x12\n\x0eUTF16CodeUnits\x10\x02\x12\x12\n\x0eUTF32CodeUnits\x10\x03*\xc8\x06\n\nSyntaxKind\x12\x19\n\x15UnspecifiedSyntaxKind\x10\x00\x12\x0b\n\x07\x43omment\x10\x01\x12\x18\n\x14PunctuationDelimiter\x10\x02\x12\x16\n\x12PunctuationBracket\x10\x03\x12\x0b\n\x07Keyword\x10\x04\x12\x15\n\x11IdentifierKeyword\x10\x05\x12\x16\n\x12IdentifierOperator\x10\x06\x12\x0e\n\nIdentifier\x10\x07\x12\x15\n\x11IdentifierBuiltin\x10\x08\x12\x12\n\x0eIdentifierNull\x10\t\x12\x16\n\x12IdentifierConstant\x10\n\x12\x1b\n\x17IdentifierMutableGlobal\x10\x0b\x12\x17\n\x13IdentifierParameter\x10\x0c\x12\x13\n\x0fIdentifierLocal\x10\r\x12\x16\n\x12IdentifierShadowed\x10\x0e\x12\x17\n\x13IdentifierNamespace\x10\x0f\x12\x16\n\x12IdentifierFunction\x10\x10\x12 \n\x1cIdentifierFunctionDefinition\x10\x11\x12\x13\n\x0fIdentifierMacro\x10\x12\x12\x1d\n\x19IdentifierMacroDefinition\x10\x13\x12\x12\n\x0eIdentifierType\x10\x14\x12\x19\n\x15IdentifierBuiltinType\x10\x15\x12\x17\n\x13IdentifierAttribute\x10\x16\x12\x0f\n\x0bRegexEscape\x10\x17\x12\x11\n\rRegexRepeated\x10\x18\x12\x11\n\rRegexWildcard\x10\x19\x12\x12\n\x0eRegexDelimiter\x10\x1a\x12\r\n\tRegexJoin\x10\x1b\x12\x11\n\rStringLiteral\x10\x1c\x12\x17\n\x13StringLiteralEscape\x10\x1d\x12\x18\n\x14StringLiteralSpecial\x10\x1e\x12\x14\n\x10StringLiteralKey\x10\x1f\x12\x14\n\x10\x43haracterLiteral\x10 \x12\x12\n\x0eNumericLiteral\x10!\x12\x12\n\x0e\x42ooleanLiteral\x10\"\x12\x07\n\x03Tag\x10#\x12\x10\n\x0cTagAttribute\x10$\x12\x10\n\x0cTagDelimiter\x10%*V\n\x08Severity\x12\x17\n\x13UnspecifiedSeverity\x10\x00\x12\t\n\x05\x45rror\x10\x01\x12\x0b\n\x07Warning\x10\x02\x12\x0f\n\x0bInformation\x10\x03\x12\x08\n\x04Hint\x10\x04*N\n\rDiagnosticTag\x12\x1c\n\x18UnspecifiedDiagnosticTag\x10\x00\x12\x0f\n\x0bUnnecessary\x10\x01\x12\x0e\n\nDeprecated\x10\x02*\xf1\x03\n\nSymbolKind\x12\x19\n\x15UnspecifiedSymbolKind\x10\x00\x12\t\n\x05\x41rray\x10\x01\x12\x0b\n\x07\x42oolean\x10\x02\x12\t\n\x05\x43lass\x10\x03\x12\x0c\n\x08\x43onstant\x10\x04\x12\x0f\n\x0b\x43onstructor\x10\x05\x12\x08\n\x04\x45num\x10\x06\x12\x0e\n\nEnumMember\x10\x07\x12\t\n\x05\x45vent\x10\x08\x12\t\n\x05\x46ield\x10\t\x12\x08\n\x04\x46ile\x10\n\x12\x0c\n\x08\x46unction\x10\x0b\x12\r\n\tInterface\x10\x0c\x12\x07\n\x03Key\x10\r\x12\n\n\x06Method\x10\x0e\x12\n\n\x06Module\x10\x0f\x12\r\n\tNamespace\x10\x10\x12\x08\n\x04Null\x10\x11\x12\n\n\x06Number\x10\x12\x12\n\n\x06Object\x10\x13\x12\x0c\n\x08Operator\x10\x14\x12\x0b\n\x07Package\x10\x15\x12\r\n\tParameter\x10\x16\x12\x0c\n\x08Property\x10\x17\x12\n\n\x06String\x10\x18\x12\n\n\x06Struct\x10\x19\x12\x11\n\rTypeParameter\x10\x1a\x12\x08\n\x04Unit\x10\x1b\x12\t\n\x05Value\x10\x1c\x12\x0c\n\x08Variable\x10\x1d\x12\x12\n\x0e\x41ssociatedType\x10\x1e\x12\x11\n\rSelfParameter\x10\x1f\x12\x0f\n\x0bUnknownKind\x10 \x12\t\n\x05Trait\x10!\x12\t\n\x05Union\x10\"\x12\t\n\x05Macro\x10#*{\n\nSymbolRole\x12\x19\n\x15UnspecifiedSymbolRole\x10\x00\x12\x0e\n\nDefinition\x10\x01\x12\n\n\x06Import\x10\x02\x12\t\n\x05Write\x10\x04\x12\x08\n\x04Read\x10\x08\x12\r\n\tGenerated\x10\x10\x12\x08\n\x04Test\x10 \x12\x08\n\x04Type\x10@BL\n\x19\x63om.sourcegraph.scip_javaZ/github.com/sourcegraph/scip/bindings/go/scip/v1b\x06proto3') - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'code_index_mcp.scip.proto.scip_pb2', _globals) -if not _descriptor._USE_C_DESCRIPTORS: - _globals['DESCRIPTOR']._loaded_options = None - _globals['DESCRIPTOR']._serialized_options = b'\n\031com.sourcegraph.scip_javaZ/github.com/sourcegraph/scip/bindings/go/scip/v1' - _globals['_PROTOCOLVERSION']._serialized_start=1309 - _globals['_PROTOCOLVERSION']._serialized_end=1358 - _globals['_TEXTDOCUMENTENCODING']._serialized_start=1360 - _globals['_TEXTDOCUMENTENCODING']._serialized_end=1451 - _globals['_POSITIONENCODING']._serialized_start=1453 - _globals['_POSITIONENCODING']._serialized_end=1559 - _globals['_SYNTAXKIND']._serialized_start=1562 - _globals['_SYNTAXKIND']._serialized_end=2402 - _globals['_SEVERITY']._serialized_start=2404 - _globals['_SEVERITY']._serialized_end=2490 - _globals['_DIAGNOSTICTAG']._serialized_start=2492 - _globals['_DIAGNOSTICTAG']._serialized_end=2570 - _globals['_SYMBOLKIND']._serialized_start=2573 - _globals['_SYMBOLKIND']._serialized_end=3070 - _globals['_SYMBOLROLE']._serialized_start=3072 - _globals['_SYMBOLROLE']._serialized_end=3195 - _globals['_INDEX']._serialized_start=46 - _globals['_INDEX']._serialized_end=173 - _globals['_METADATA']._serialized_start=176 - _globals['_METADATA']._serialized_end=343 - _globals['_TOOLINFO']._serialized_start=345 - _globals['_TOOLINFO']._serialized_end=405 - _globals['_DOCUMENT']._serialized_start=408 - _globals['_DOCUMENT']._serialized_end=605 - _globals['_OCCURRENCE']._serialized_start=608 - _globals['_OCCURRENCE']._serialized_end=789 - _globals['_RANGE']._serialized_start=791 - _globals['_RANGE']._serialized_end=826 - _globals['_DIAGNOSTIC']._serialized_start=829 - _globals['_DIAGNOSTIC']._serialized_end=957 - _globals['_SYMBOLINFORMATION']._serialized_start=960 - _globals['_SYMBOLINFORMATION']._serialized_end=1174 - _globals['_RELATIONSHIP']._serialized_start=1177 - _globals['_RELATIONSHIP']._serialized_end=1307 -# @@protoc_insertion_point(module_scope) diff --git a/src/code_index_mcp/server.py b/src/code_index_mcp/server.py index da36810..5892c0a 100644 --- a/src/code_index_mcp/server.py +++ b/src/code_index_mcp/server.py @@ -24,7 +24,6 @@ from .services import ( SearchService, FileService, SettingsService, FileWatcherService ) -from .indexing.unified_index_manager import UnifiedIndexManager from .services.settings_service import manage_temp_directory from .services.file_discovery_service import FileDiscoveryService from .services.project_management_service import ProjectManagementService diff --git a/src/code_index_mcp/services/base_service.py b/src/code_index_mcp/services/base_service.py index f9931e7..a29e6bf 100644 --- a/src/code_index_mcp/services/base_service.py +++ b/src/code_index_mcp/services/base_service.py @@ -132,9 +132,9 @@ def index_provider(self): @property def index_manager(self): """ - Convenient access to the unified index manager. + Convenient access to the index manager. Returns: - The UnifiedIndexManager instance, or None if not available + The index manager instance, or None if not available """ return self.helper.index_manager diff --git a/src/code_index_mcp/services/code_intelligence_service.py b/src/code_index_mcp/services/code_intelligence_service.py index fec4b3f..77ff894 100644 --- a/src/code_index_mcp/services/code_intelligence_service.py +++ b/src/code_index_mcp/services/code_intelligence_service.py @@ -1,33 +1,32 @@ """ Code Intelligence Service - Business logic for code analysis and understanding. -This service handles the business logic for analyzing code files, extracting -intelligence, and providing comprehensive code insights. It composes technical -tools to achieve business goals. +This service handles the business logic for analyzing code files using the new +JSON-based indexing system optimized for LLM consumption. """ +import logging import os from typing import Dict, Any +logger = logging.getLogger(__name__) + from .base_service import BaseService from ..tools.filesystem import FileSystemTool +from ..indexing import get_index_manager class CodeIntelligenceService(BaseService): """ - Business service for code analysis and intelligence. + Business service for code analysis and intelligence using JSON indexing. - This service orchestrates code analysis workflows by composing - technical tools to achieve business goals like understanding code - structure, extracting insights, and providing comprehensive analysis. + This service provides comprehensive code analysis using the optimized + JSON-based indexing system for fast LLM-friendly responses. """ def __init__(self, ctx): super().__init__(ctx) self._filesystem_tool = FileSystemTool() - # Use new enhanced symbol analyzer instead of legacy SCIPQueryTool - from ..tools.scip.scip_symbol_analyzer import SCIPSymbolAnalyzer - self._symbol_analyzer = SCIPSymbolAnalyzer() def analyze_file(self, file_path: str) -> Dict[str, Any]: """ @@ -49,11 +48,24 @@ def analyze_file(self, file_path: str) -> Dict[str, Any]: # Business validation self._validate_analysis_request(file_path) - # Use enhanced SCIP analysis - analysis = self._perform_enhanced_scip_analysis(file_path) + # Use the global index manager + index_manager = get_index_manager() + + # Debug logging + logger.info(f"Getting file summary for: {file_path}") + logger.info(f"Index manager state - Project path: {index_manager.project_path}") + logger.info(f"Index manager state - Has builder: {index_manager.index_builder is not None}") + if index_manager.index_builder: + logger.info(f"Index manager state - Has index: {index_manager.index_builder.in_memory_index is not None}") + + # Get file summary from JSON index + summary = index_manager.get_file_summary(file_path) + logger.info(f"Summary result: {summary is not None}") + + if not summary: + raise ValueError(f"File not found in index: {file_path}") - # Direct conversion to output format (no intermediate transformations) - return analysis.to_dict() + return summary def _validate_analysis_request(self, file_path: str) -> None: """ @@ -65,47 +77,23 @@ def _validate_analysis_request(self, file_path: str) -> None: Raises: ValueError: If validation fails """ - # Business rule: Project must be set up - self._require_project_setup() + # Business rule: Project must be set up OR auto-initialization must be possible + if self.base_path: + # Standard validation if project is set up in context + self._require_valid_file_path(file_path) + full_path = os.path.join(self.base_path, file_path) + if not os.path.exists(full_path): + raise ValueError(f"File does not exist: {file_path}") + else: + # Allow proceeding if auto-initialization might work + # The index manager will handle project discovery + logger.info("Project not set in context, relying on index auto-initialization") + + # Basic file path validation only + if not file_path or '..' in file_path: + raise ValueError(f"Invalid file path: {file_path}") - # Business rule: File path must be valid - self._require_valid_file_path(file_path) - # Business rule: File must exist - full_path = os.path.join(self.base_path, file_path) - if not os.path.exists(full_path): - raise ValueError(f"File does not exist: {file_path}") - - def _get_scip_tool(self): - """Get SCIP tool instance from the index manager.""" - if self.index_manager: - # Access the SCIP tool from unified index manager - return self.index_manager._get_scip_tool() - return None - - def _perform_enhanced_scip_analysis(self, file_path: str): - """ - Enhanced SCIP analysis using the new symbol analyzer. - - Args: - file_path: File path to analyze - - Returns: - FileAnalysis object with accurate symbol information - """ - # Get SCIP tool for index access - scip_tool = self._get_scip_tool() - if not scip_tool: - raise RuntimeError("SCIP tool is not available for file analysis") - - # Get raw SCIP index - scip_index = scip_tool.get_raw_index() - if not scip_index: - raise RuntimeError("SCIP index is not available for file analysis") - - # Use enhanced analyzer for accurate symbol analysis - return self._symbol_analyzer.analyze_file(file_path, scip_index) - diff --git a/src/code_index_mcp/services/file_discovery_service.py b/src/code_index_mcp/services/file_discovery_service.py index 0f03011..478beea 100644 --- a/src/code_index_mcp/services/file_discovery_service.py +++ b/src/code_index_mcp/services/file_discovery_service.py @@ -1,17 +1,15 @@ """ File Discovery Service - Business logic for intelligent file discovery. -This service handles the business logic for finding files in a project, -including pattern matching, relevance scoring, and result optimization. -It composes technical tools to achieve business goals. +This service handles the business logic for finding files using the new +JSON-based indexing system optimized for LLM consumption. """ from typing import Dict, Any, List, Optional from dataclasses import dataclass from .base_service import BaseService -from ..tools.filesystem import FileMatchingTool -from ..utils import ValidationHelper +from ..indexing import get_index_manager @dataclass @@ -26,24 +24,19 @@ class FileDiscoveryResult: class FileDiscoveryService(BaseService): """ - Business service for intelligent file discovery. + Business service for intelligent file discovery using JSON indexing. - This service orchestrates file discovery workflows by composing - technical tools to achieve business goals like finding relevant - files, optimizing search results, and providing meaningful metadata. + This service provides fast file discovery using the optimized JSON + indexing system for efficient LLM-oriented responses. """ def __init__(self, ctx): super().__init__(ctx) - self._matcher_tool = FileMatchingTool() + self._index_manager = get_index_manager() def find_files(self, pattern: str, max_results: Optional[int] = None) -> List[str]: """ - Find files matching the given pattern using intelligent discovery. - - This is the main business method that orchestrates the file discovery - workflow, ensuring the index is available, applying business rules, - and optimizing results for the user. + Find files matching the given pattern using JSON indexing. Args: pattern: Glob pattern to search for (e.g., "*.py", "test_*.js") @@ -58,14 +51,14 @@ def find_files(self, pattern: str, max_results: Optional[int] = None) -> List[st # Business validation self._validate_discovery_request(pattern) - # Business logic: Ensure index is ready - self._ensure_index_available() - - # Business workflow: Execute discovery - discovery_result = self._execute_discovery_workflow(pattern, max_results) - - # Business result formatting - return self._format_discovery_result(discovery_result) + # Get files from JSON index + files = self._index_manager.find_files(pattern) + + # Apply max_results limit if specified + if max_results and len(files) > max_results: + files = files[:max_results] + + return files def _validate_discovery_request(self, pattern: str) -> None: """ @@ -83,213 +76,3 @@ def _validate_discovery_request(self, pattern: str) -> None: # Validate pattern if not pattern or not pattern.strip(): raise ValueError("Search pattern cannot be empty") - - # Business rule: Validate glob pattern - error = ValidationHelper.validate_glob_pattern(pattern) - if error: - raise ValueError(f"Invalid search pattern: {error}") - - def _ensure_index_available(self) -> None: - """ - Business logic to ensure index is available for discovery. - - Now uses unified index manager instead of direct SCIP tool access. - - Raises: - RuntimeError: If index cannot be made available - """ - # Business rule: Check if unified index manager is available - if not self.index_manager: - raise RuntimeError("Index manager not available. Please initialize project first.") - - # Business rule: Check if index provider is available - provider = self.index_provider - if provider and provider.is_available(): - return - - # Business logic: Initialize or refresh index - try: - if not self.index_manager.initialize(): - raise RuntimeError("Failed to initialize index manager") - - # Update context with file count - provider = self.index_provider - if provider: - file_count = len(provider.get_file_list()) - self.helper.update_file_count(file_count) - - except Exception as e: - raise RuntimeError(f"Failed to ensure index availability: {e}") from e - - def _execute_discovery_workflow(self, pattern: str, max_results: Optional[int]) -> FileDiscoveryResult: - """ - Execute the core file discovery business workflow. - - Args: - pattern: Search pattern - max_results: Maximum results limit - - Returns: - FileDiscoveryResult with discovery data - """ - # Get all indexed files through unified interface - provider = self.index_provider - if not provider: - raise RuntimeError("Index provider not available. Please initialize project first.") - - all_files = provider.get_file_list() - - # Apply pattern matching using technical tool - matched_files = self._matcher_tool.match_glob_pattern(all_files, pattern) - - # Business logic: Apply relevance sorting - sorted_files = self._matcher_tool.sort_by_relevance(matched_files, pattern) - - # Business logic: Apply result limits if specified - if max_results: - limited_files = self._matcher_tool.limit_results(sorted_files, max_results) - else: - limited_files = sorted_files - - # Business logic: Determine search strategy used - search_strategy = self._determine_search_strategy(pattern, len(all_files), len(matched_files)) - - # Extract file paths for result - file_paths = [file_info.relative_path for file_info in limited_files] - - # Gather business metadata - metadata = self._gather_discovery_metadata(all_files, matched_files, limited_files, pattern) - - return FileDiscoveryResult( - files=file_paths, - total_count=len(matched_files), - pattern_used=pattern, - search_strategy=search_strategy, - metadata=metadata - ) - - def _determine_search_strategy(self, pattern: str, total_files: int, matched_files: int) -> str: - """ - Business logic to determine what search strategy was most effective. - - Args: - pattern: Search pattern used - total_files: Total files in index - matched_files: Number of files matched - - Returns: - String describing the search strategy - """ - is_glob_pattern = '*' in pattern or '?' in pattern - - if is_glob_pattern: - # Glob pattern strategy determination - if matched_files == 0: - strategy = "glob_pattern_no_matches" - elif matched_files < 10: - strategy = "glob_pattern_focused" - elif matched_files > total_files * 0.5: # More than 50% of files matched - strategy = "glob_pattern_very_broad" - else: - strategy = "glob_pattern_broad" - else: - # Exact filename strategy determination - if matched_files == 0: - strategy = "exact_filename_not_found" - elif matched_files == 1: - strategy = "exact_filename_found" - else: - strategy = "exact_filename_multiple_matches" - - return strategy - - def _get_project_metadata_from_index_manager(self) -> Dict[str, Any]: - """ - Get project metadata from unified index manager. - - Returns: - Dictionary with project metadata, or default values if not available - """ - if self.index_manager: - try: - status = self.index_manager.get_index_status() - if status and status.get('metadata'): - metadata = status['metadata'] - return { - 'project_root': metadata.get('project_root', self.base_path), - 'total_files': status.get('file_count', 0), - 'tool_version': metadata.get('tool_version', 'unified-manager'), - 'languages': [] # Languages info not available in current IndexMetadata - } - elif status: - # Fallback to status info - return { - 'project_root': self.base_path, - 'total_files': status.get('file_count', 0), - 'tool_version': 'unified-manager', - 'languages': [] - } - except (AttributeError, KeyError, TypeError): - pass # Fall through to default if metadata access fails - - # Fallback to default metadata if index manager not available - return { - 'project_root': self.base_path, - 'total_files': 0, - 'tool_version': 'unknown', - 'languages': [] - } - - def _gather_discovery_metadata(self, all_files, matched_files, limited_files, pattern: str) -> Dict[str, Any]: - """ - Gather business metadata about the discovery operation. - - Args: - all_files: All files in index - matched_files: Files that matched the pattern - limited_files: Final limited result set - pattern: Search pattern used - - Returns: - Dictionary with business metadata - """ - # Get project metadata from unified index manager - project_metadata = self._get_project_metadata_from_index_manager() - - # Calculate business metrics - match_ratio = len(matched_files) / len(all_files) if all_files else 0 - - # Analyze file types in results - file_languages = {} - for file_info in matched_files: - lang = file_info.language - file_languages[lang] = file_languages.get(lang, 0) + 1 - - # Analyze pattern characteristics - pattern_type = 'glob' if ('*' in pattern or '?' in pattern) else 'exact' - pattern_complexity = 'simple' if pattern.count('*') <= 1 else 'complex' - - return { - 'total_indexed_files': len(all_files), - 'total_matches': len(matched_files), - 'returned_results': len(limited_files), - 'match_ratio': round(match_ratio, 3), - 'languages_found': file_languages, - 'project_languages': project_metadata.get('languages', []), - 'search_efficiency': 'high' if match_ratio < 0.1 else 'medium' if match_ratio < 0.5 else 'low', - 'pattern_type': pattern_type, - 'pattern_complexity': pattern_complexity, - 'original_pattern': pattern - } - - def _format_discovery_result(self, discovery_result: FileDiscoveryResult) -> List[str]: - """ - Format the discovery result according to business requirements. - - Args: - discovery_result: Raw discovery result - - Returns: - Simple list of file paths - """ - return discovery_result.files diff --git a/src/code_index_mcp/services/index_management_service.py b/src/code_index_mcp/services/index_management_service.py index 7c9f42b..e4714a3 100644 --- a/src/code_index_mcp/services/index_management_service.py +++ b/src/code_index_mcp/services/index_management_service.py @@ -2,7 +2,7 @@ Index Management Service - Business logic for index lifecycle management. This service handles the business logic for index rebuilding, status monitoring, -and index-related operations. It composes technical tools to achieve business goals. +and index-related operations using the new JSON-based indexing system. """ import time import logging @@ -13,8 +13,7 @@ logger = logging.getLogger(__name__) from .base_service import BaseService -from ..tools.scip import SCIPIndexTool -from ..tools.config import ProjectConfigTool +from ..indexing import get_index_manager @dataclass @@ -30,23 +29,17 @@ class IndexManagementService(BaseService): """ Business service for index lifecycle management. - This service orchestrates index management workflows by composing - technical tools to achieve business goals like rebuilding indexes, - monitoring index status, and managing index lifecycle. + This service orchestrates index management workflows using the new + JSON-based indexing system for optimal LLM performance. """ def __init__(self, ctx): super().__init__(ctx) - self._scip_tool = SCIPIndexTool() - self._config_tool = ProjectConfigTool() + self._index_manager = get_index_manager() def rebuild_index(self) -> str: """ - Rebuild the project index using business logic. - - This is the main business method that orchestrates the index - rebuild workflow, ensuring proper validation, cleanup, and - state management. + Rebuild the project index using the new JSON indexing system. Returns: Success message with rebuild information @@ -78,29 +71,20 @@ def get_rebuild_status(self) -> Dict[str, Any]: 'is_rebuilding': False } - # Get index availability status - try to load existing index first - if not self._scip_tool.is_index_available(): - self._scip_tool.load_existing_index(self.base_path) - is_available = self._scip_tool.is_index_available() - - # Get basic status information - status = { - 'status': 'ready' if is_available else 'needs_rebuild', - 'index_available': is_available, - 'is_rebuilding': False, # We don't track background rebuilds in this simplified version - 'project_path': self.base_path + # Get index stats from the new JSON system + stats = self._index_manager.get_index_stats() + + return { + 'status': 'ready' if stats.get('status') == 'loaded' else 'needs_rebuild', + 'index_available': stats.get('status') == 'loaded', + 'is_rebuilding': False, + 'project_path': self.base_path, + 'file_count': stats.get('indexed_files', 0), + 'total_symbols': stats.get('total_symbols', 0), + 'symbol_types': stats.get('symbol_types', {}), + 'languages': stats.get('languages', []) } - # Add file count if index is available - if is_available: - try: - status['file_count'] = self._scip_tool.get_file_count() - status['metadata'] = self._scip_tool.get_project_metadata() - except Exception as e: - status['error'] = f"Failed to get index metadata: {e}" - - return status - def _validate_rebuild_request(self) -> None: """ Validate the index rebuild request according to business rules. @@ -118,20 +102,19 @@ def _execute_rebuild_workflow(self) -> IndexRebuildResult: Returns: IndexRebuildResult with rebuild data """ - start_time = time.time() - # Business step 1: Clear existing index state - self._clear_existing_index() - - # Business step 2: Rebuild index using technical tool - file_count = self._rebuild_index_data() + # Set project path in index manager + if not self._index_manager.set_project_path(self.base_path): + raise RuntimeError("Failed to set project path in index manager") - # Business step 3: Update system state - self._update_index_state(file_count) + # Rebuild the index + if not self._index_manager.refresh_index(): + raise RuntimeError("Failed to rebuild index") - # Business step 4: Save updated configuration - self._save_rebuild_metadata() + # Get stats for result + stats = self._index_manager.get_index_stats() + file_count = stats.get('indexed_files', 0) rebuild_time = time.time() - start_time @@ -142,96 +125,6 @@ def _execute_rebuild_workflow(self) -> IndexRebuildResult: message=f"Index rebuilt successfully with {file_count} files" ) - def _clear_existing_index(self) -> None: - """Business logic to clear existing index state.""" - - # Clear unified index manager - self.helper.clear_index_cache() - - # No logging - - def _rebuild_index_data(self) -> int: - """ - Business logic to rebuild index data using technical tools. - - Returns: - Number of files indexed - - Raises: - RuntimeError: If rebuild fails - """ - try: - # Business logic: Manual rebuild through unified manager - if not self.index_manager: - raise RuntimeError("Index manager not available") - - # Force rebuild - success = self.index_manager.refresh_index(force=True) - if not success: - raise RuntimeError("Index rebuild failed") - - # Get file count from provider - provider = self.index_provider - if provider: - file_count = len(provider.get_file_list()) - - # Save the rebuilt index - if not self.index_manager.save_index(): - logger.warning("Manual rebuild: Index built but save failed") - - return file_count - else: - raise RuntimeError("No index provider available after rebuild") - - except Exception as e: - raise RuntimeError(f"Failed to rebuild index: {e}") from e - - def _update_index_state(self, file_count: int) -> None: - """Business logic to update system state after rebuild.""" - # No logging - - # Update context with new file count - self.helper.update_file_count(file_count) - - # No logging - - def _save_rebuild_metadata(self) -> None: - """Business logic to save SCIP index and metadata.""" - - try: - # Initialize config tool if needed - if not self._config_tool.get_project_path(): - self._config_tool.initialize_settings(self.base_path) - - # Get the SCIP index from the tool - scip_index = self._scip_tool.get_raw_index() - if scip_index is None: - raise RuntimeError("No SCIP index available to save") - - # Save the actual SCIP protobuf index - settings = self._config_tool._settings - settings.save_scip_index(scip_index) - # Also save legacy JSON metadata for compatibility - index_data = { - 'index_metadata': { - 'version': '4.0-scip', - 'source_format': 'scip', - 'last_rebuilt': time.time(), - 'rebuild_trigger': 'manual' - }, - 'project_metadata': self._scip_tool.get_project_metadata() - } - - # Save metadata (legacy format) - self._config_tool.save_index_data(index_data) - - # Update project configuration - config = self._config_tool.create_default_config(self.base_path) - config['last_indexed'] = time.time() - self._config_tool.save_project_config(config) - - except Exception: - pass def _format_rebuild_result(self, result: IndexRebuildResult) -> str: """ diff --git a/src/code_index_mcp/services/project_management_service.py b/src/code_index_mcp/services/project_management_service.py index c18e1a9..1aa0706 100644 --- a/src/code_index_mcp/services/project_management_service.py +++ b/src/code_index_mcp/services/project_management_service.py @@ -2,7 +2,7 @@ Project Management Service - Business logic for project lifecycle management. This service handles the business logic for project initialization, configuration, -and lifecycle management. It composes technical tools to achieve business goals. +and lifecycle management using the new JSON-based indexing system. """ import json import logging @@ -11,21 +11,12 @@ from contextlib import contextmanager from .base_service import BaseService -from ..tools.config import ProjectConfigTool from ..utils.response_formatter import ResponseFormatter from ..constants import SUPPORTED_EXTENSIONS -from ..indexing.unified_index_manager import UnifiedIndexManager +from ..indexing import get_index_manager logger = logging.getLogger(__name__) -# Optional SCIP tools import -try: - from ..tools.scip import SCIPIndexTool - SCIP_AVAILABLE = True -except ImportError: - SCIPIndexTool = None - SCIP_AVAILABLE = False - @dataclass class ProjectInitializationResult: @@ -49,8 +40,10 @@ class ProjectManagementService(BaseService): def __init__(self, ctx): super().__init__(ctx) + # Use the global singleton index manager + self._index_manager = get_index_manager() + from ..tools.config import ProjectConfigTool self._config_tool = ProjectConfigTool() - self._scip_tool = SCIPIndexTool() if SCIP_AVAILABLE else None # Import FileWatcherTool locally to avoid circular import from ..tools.monitoring import FileWatcherTool self._watcher_tool = FileWatcherTool(ctx) @@ -111,22 +104,25 @@ def _execute_initialization_workflow(self, path: str) -> ProjectInitializationRe Returns: ProjectInitializationResult with initialization data """ + # Business step 1: Initialize config tool + self._config_tool.initialize_settings(path) + # Normalize path for consistent processing normalized_path = self._config_tool.normalize_project_path(path) - # Business step 1: Cleanup existing project state + # Business step 2: Cleanup existing project state self._cleanup_existing_project() - # Business step 2: Initialize project configuration - self._initialize_project_configuration(normalized_path) + # Business step 3: Initialize JSON index manager + index_result = self._initialize_json_index_manager(normalized_path) - # Business step 3: Initialize unified index manager - index_result = self._initialize_index_manager(normalized_path) + # Business step 3.1: Store index manager in context for other services + self.helper.update_index_manager(self._index_manager) # Business step 4: Setup file monitoring monitoring_result = self._setup_file_monitoring(normalized_path) - # Business step 5: Update system state + # Business step 4: Update system state self._update_project_state(normalized_path, index_result['file_count']) # Business step 6: Get search capabilities info @@ -150,25 +146,12 @@ def _cleanup_existing_project(self) -> None: # Clear existing index cache self.helper.clear_index_cache() - # Clear SCIP tool state - self._scip_tool.clear_index() - - def _initialize_project_configuration(self, project_path: str) -> None: - """Business logic to initialize project configuration.""" - with self._noop_operation(): - - # Initialize settings using config tool - settings = self._config_tool.initialize_settings(project_path) + # Clear any existing index state + pass - # Update context with new settings - self.helper.update_settings(settings) - self.helper.update_base_path(project_path) - - self._config_tool.get_settings_path() - - def _initialize_index_manager(self, project_path: str) -> Dict[str, Any]: + def _initialize_json_index_manager(self, project_path: str) -> Dict[str, Any]: """ - Business logic to initialize unified index manager. + Business logic to initialize JSON index manager. Args: project_path: Project path @@ -176,44 +159,32 @@ def _initialize_index_manager(self, project_path: str) -> Dict[str, Any]: Returns: Dictionary with initialization results """ - with self._noop_operation(): - # Check if index needs rebuild before initialization - needs_rebuild = not self.helper.settings.is_latest_index() - - if needs_rebuild: - # Clean up legacy files - self.helper.settings.cleanup_legacy_files() - - # Force rebuild by ensuring fresh start - try: - from ..services.index_management_service import IndexManagementService - index_service = IndexManagementService(self._context) - index_service.rebuild_index() - except Exception: - # If rebuild fails, continue with normal initialization - pass - - # Create unified index manager - index_manager = UnifiedIndexManager(project_path, self.helper.settings) - - # Store in context - self.helper.update_index_manager(index_manager) - - # Initialize the manager (this will load existing or build new) - if index_manager.initialize(): - provider = index_manager.get_provider() - if provider: - file_count = len(provider.get_file_list()) - return { - 'file_count': file_count, - 'source': 'unified_manager' - } - - # Fallback if initialization fails - return { - 'file_count': 0, - 'source': 'failed' - } + # Set project path in index manager + if not self._index_manager.set_project_path(project_path): + raise RuntimeError(f"Failed to set project path: {project_path}") + + # Update context + self.helper.update_base_path(project_path) + + # Try to load existing index or build new one + if self._index_manager.load_index(): + source = "loaded_existing" + else: + if not self._index_manager.build_index(): + raise RuntimeError("Failed to build index") + source = "built_new" + + # Get stats + stats = self._index_manager.get_index_stats() + file_count = stats.get('indexed_files', 0) + + return { + 'file_count': file_count, + 'source': source, + 'total_symbols': stats.get('total_symbols', 0), + 'languages': stats.get('languages', []) + } + def _is_valid_existing_index(self, index_data: Dict[str, Any]) -> bool: """ @@ -261,53 +232,6 @@ def _load_existing_index(self, index_data: Dict[str, Any]) -> Dict[str, Any]: 'source': 'loaded_existing' } - def _build_new_index(self, project_path: str) -> Dict[str, Any]: - """ - Business logic to build new project index. - - Args: - project_path: Project path to index - - Returns: - Dictionary with build results - """ - - - try: - # Use SCIP tool to build index - file_count = self._scip_tool.build_index(project_path) - - # Save the new index using config tool - # Note: This is a simplified approach - in a full implementation, - # we would need to convert SCIP data to the expected format - index_data = { - 'index_metadata': { - 'version': '4.0-scip', - 'source_format': 'scip', - 'created_at': __import__('time').time() - }, - 'project_metadata': { - 'project_root': project_path, - 'total_files': file_count, - 'tool_version': 'scip-builder' - } - } - - self._config_tool.save_index_data(index_data) - - # Save project configuration - config = self._config_tool.create_default_config(project_path) - self._config_tool.save_project_config(config) - - # No logging - - return { - 'file_count': file_count, - 'source': 'built_new' - } - - except Exception as e: - raise ValueError(f"Failed to build project index: {e}") from e def _setup_file_monitoring(self, project_path: str) -> str: """ @@ -322,22 +246,20 @@ def _setup_file_monitoring(self, project_path: str) -> str: try: - # Create rebuild callback that uses our SCIP tool + # Create rebuild callback that uses the JSON index manager def rebuild_callback(): logger.info("File watcher triggered rebuild callback") try: logger.debug(f"Starting index rebuild for: {project_path}") - # Business logic: File changed, rebuild through unified manager - if self.helper.index_manager: - success = self.helper.index_manager.refresh_index(force=True) - if success: - provider = self.helper.index_manager.get_provider() - file_count = len(provider.get_file_list()) if provider else 0 - logger.info(f"File watcher rebuild completed successfully - indexed {file_count} files") - return True - - logger.warning("File watcher rebuild failed - no index manager available") - return False + # Business logic: File changed, rebuild using JSON index manager + if self._index_manager.refresh_index(): + stats = self._index_manager.get_index_stats() + file_count = stats.get('indexed_files', 0) + logger.info(f"File watcher rebuild completed successfully - indexed {file_count} files") + return True + else: + logger.warning("File watcher rebuild failed") + return False except Exception as e: import traceback logger.error(f"File watcher rebuild failed: {e}") diff --git a/src/code_index_mcp/services/settings_service.py b/src/code_index_mcp/services/settings_service.py index 74b21ff..bd641c4 100644 --- a/src/code_index_mcp/services/settings_service.py +++ b/src/code_index_mcp/services/settings_service.py @@ -13,6 +13,7 @@ from ..utils import ResponseFormatter from ..constants import SETTINGS_DIR from ..project_settings import ProjectSettings +from ..indexing import get_index_manager def manage_temp_directory(action: str) -> Dict[str, Any]: @@ -34,7 +35,12 @@ def manage_temp_directory(action: str) -> Dict[str, Any]: if action not in ['create', 'check']: raise ValueError(f"Invalid action: {action}. Must be 'create' or 'check'") - temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR) + # Try to get the actual temp directory from index manager, fallback to default + try: + index_manager = get_index_manager() + temp_dir = index_manager.temp_dir if index_manager.temp_dir else os.path.join(tempfile.gettempdir(), SETTINGS_DIR) + except: + temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR) if action == 'create': existed_before = os.path.exists(temp_dir) @@ -118,13 +124,17 @@ def get_settings_info(self) -> Dict[str, Any]: Dictionary with settings directory, config, stats, and status information """ temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR) + + # Get the actual index directory from the index manager + index_manager = get_index_manager() + actual_temp_dir = index_manager.temp_dir if index_manager.temp_dir else temp_dir # Check if base_path is set if not self.base_path: return ResponseFormatter.settings_info_response( settings_directory="", - temp_directory=temp_dir, - temp_directory_exists=os.path.exists(temp_dir), + temp_directory=actual_temp_dir, + temp_directory_exists=os.path.exists(actual_temp_dir), config={}, stats={}, exists=False, @@ -136,13 +146,13 @@ def get_settings_info(self) -> Dict[str, Any]: # Get config and stats config = self.settings.load_config() if self.settings else {} stats = self.settings.get_stats() if self.settings else {} - settings_directory = self.settings.settings_path if self.settings else "" + settings_directory = actual_temp_dir exists = os.path.exists(settings_directory) if settings_directory else False return ResponseFormatter.settings_info_response( settings_directory=settings_directory, - temp_directory=temp_dir, - temp_directory_exists=os.path.exists(temp_dir), + temp_directory=actual_temp_dir, + temp_directory_exists=os.path.exists(actual_temp_dir), config=config, stats=stats, exists=exists diff --git a/src/code_index_mcp/tools/__init__.py b/src/code_index_mcp/tools/__init__.py index 7242df9..f69d664 100644 --- a/src/code_index_mcp/tools/__init__.py +++ b/src/code_index_mcp/tools/__init__.py @@ -6,14 +6,11 @@ business layer to achieve business goals. """ -from .scip import SCIPIndexTool, SCIPSymbolAnalyzer from .filesystem import FileMatchingTool, FileSystemTool from .config import ProjectConfigTool, SettingsTool from .monitoring import FileWatcherTool __all__ = [ - 'SCIPIndexTool', - 'SCIPSymbolAnalyzer', 'FileMatchingTool', 'FileSystemTool', 'ProjectConfigTool', diff --git a/src/code_index_mcp/tools/config/project_config_tool.py b/src/code_index_mcp/tools/config/project_config_tool.py index 304b974..cf78da2 100644 --- a/src/code_index_mcp/tools/config/project_config_tool.py +++ b/src/code_index_mcp/tools/config/project_config_tool.py @@ -98,10 +98,10 @@ def save_index_data(self, index_data: Dict[str, Any]) -> None: def check_index_version(self) -> bool: """ - Check if index is the latest version. + Check if JSON index is the latest version. Returns: - True if latest SCIP index exists, False if needs rebuild + True if JSON index exists and is recent, False if needs rebuild Raises: RuntimeError: If settings not initialized @@ -109,7 +109,17 @@ def check_index_version(self) -> bool: if not self._settings: raise RuntimeError("Settings not initialized") - return self._settings.is_latest_index() + # Check if JSON index exists and is fresh + from ...indexing import get_index_manager + index_manager = get_index_manager() + + # Set project path if available + if self._settings.base_path: + index_manager.set_project_path(self._settings.base_path) + stats = index_manager.get_index_stats() + return stats.get('status') == 'loaded' + + return False def cleanup_legacy_files(self) -> None: """ diff --git a/src/code_index_mcp/tools/filesystem/file_matching_tool.py b/src/code_index_mcp/tools/filesystem/file_matching_tool.py index 8e66b92..22ebdf6 100644 --- a/src/code_index_mcp/tools/filesystem/file_matching_tool.py +++ b/src/code_index_mcp/tools/filesystem/file_matching_tool.py @@ -9,7 +9,14 @@ from typing import List, Set from pathlib import Path -from ..scip.scip_index_tool import FileInfo +# FileInfo defined locally for file matching operations +from dataclasses import dataclass + +@dataclass +class FileInfo: + """File information structure.""" + relative_path: str + language: str class FileMatchingTool: diff --git a/src/code_index_mcp/tools/scip/__init__.py b/src/code_index_mcp/tools/scip/__init__.py deleted file mode 100644 index d2e86d3..0000000 --- a/src/code_index_mcp/tools/scip/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -SCIP Tools - Technical components for SCIP operations. -""" - -from .scip_index_tool import SCIPIndexTool -from .scip_symbol_analyzer import SCIPSymbolAnalyzer - -__all__ = ['SCIPIndexTool', 'SCIPSymbolAnalyzer'] diff --git a/src/code_index_mcp/tools/scip/analyzers/__init__.py b/src/code_index_mcp/tools/scip/analyzers/__init__.py deleted file mode 100644 index eac5859..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Language-specific SCIP symbol analyzers. - -This package provides the modular language analyzer system that replaces the -monolithic SCIPSymbolAnalyzer, following the refactoring plan for better -maintainability and extensibility. - -Key Components: -- LanguageAnalyzer: Abstract base class for all language analyzers -- PythonAnalyzer: Python-specific import and symbol analysis -- ZigAnalyzer: Zig-specific import and symbol analysis -- ObjectiveCAnalyzer: Objective-C framework and symbol analysis -- JavaScriptAnalyzer: JavaScript/TypeScript analysis -- LanguageAnalyzerFactory: Factory for creating appropriate analyzers -- FallbackAnalyzer: Generic analyzer for unsupported languages - -Usage: - from .factory import get_analyzer - - # Get analyzer for Python file - analyzer = get_analyzer(language='python') - - # Get analyzer based on file extension - analyzer = get_analyzer(file_path='main.py') - - # Extract imports - analyzer.extract_imports(document, imports, symbol_parser) -""" - -from .base import LanguageAnalyzer, BaseLanguageAnalyzer, FallbackAnalyzer -from .python_analyzer import PythonAnalyzer -from .zig_analyzer import ZigAnalyzer -from .objc_analyzer import ObjectiveCAnalyzer -from .javascript_analyzer import JavaScriptAnalyzer -from .factory import ( - LanguageAnalyzerFactory, - get_analyzer_factory, - get_analyzer, - register_custom_analyzer, - get_supported_languages -) - -__all__ = [ - # Base classes - 'LanguageAnalyzer', - 'BaseLanguageAnalyzer', - 'FallbackAnalyzer', - - # Language-specific analyzers - 'PythonAnalyzer', - 'ZigAnalyzer', - 'ObjectiveCAnalyzer', - 'JavaScriptAnalyzer', - - # Factory and utilities - 'LanguageAnalyzerFactory', - 'get_analyzer_factory', - 'get_analyzer', - 'register_custom_analyzer', - 'get_supported_languages' -] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/analyzers/base.py b/src/code_index_mcp/tools/scip/analyzers/base.py deleted file mode 100644 index 3aa5280..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/base.py +++ /dev/null @@ -1,324 +0,0 @@ -""" -Base interfaces and common utilities for language-specific SCIP analyzers. - -This module provides the abstract base classes and shared functionality for the -modular language analyzer system, following the SCIP Symbol Analyzer refactoring plan. -""" - -import logging -from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Any, Set -from ..symbol_definitions import ImportGroup, LocationInfo - -logger = logging.getLogger(__name__) - - -class LanguageAnalyzer(ABC): - """ - Abstract base class for language-specific SCIP symbol analyzers. - - Each language analyzer handles language-specific logic for: - - Import extraction and classification - - Symbol metadata enrichment - - Dependency classification - - Standard library module detection - """ - - def __init__(self): - """Initialize the language analyzer.""" - self._cache: Dict[str, Any] = {} - self.language_name = self._get_language_name() - - @abstractmethod - def _get_language_name(self) -> str: - """Return the name of the language this analyzer handles.""" - pass - - @abstractmethod - def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: - """ - Extract import information from SCIP document. - - Args: - document: SCIP document containing symbols and occurrences - imports: ImportGroup to populate with extracted imports - symbol_parser: Optional SCIPSymbolManager for enhanced parsing - """ - pass - - @abstractmethod - def classify_dependency(self, module_name: str) -> str: - """ - Classify dependency as standard_library, third_party, or local. - - Args: - module_name: Name of the module/dependency to classify - - Returns: - Classification string: 'standard_library', 'third_party', or 'local' - """ - pass - - @abstractmethod - def extract_symbol_metadata(self, symbol_info, document) -> Dict[str, Any]: - """ - Extract language-specific symbol metadata. - - Args: - symbol_info: SCIP symbol information object - document: SCIP document containing the symbol - - Returns: - Dictionary with language-specific metadata - """ - pass - - @abstractmethod - def get_standard_library_modules(self) -> Set[str]: - """ - Return set of standard library module names for this language. - - Returns: - Set of standard library module names - """ - pass - - def normalize_import_path(self, raw_path: str) -> str: - """ - Normalize import path for consistent processing. - Default implementation returns the path as-is. - - Args: - raw_path: Raw import path from SCIP data - - Returns: - Normalized import path - """ - return raw_path.strip() - - def is_import_occurrence(self, occurrence) -> bool: - """ - Check if occurrence represents an import statement. - Default implementation checks for Import role (role = 2). - - Args: - occurrence: SCIP occurrence object - - Returns: - True if this occurrence is an import - """ - return hasattr(occurrence, 'symbol_roles') and (occurrence.symbol_roles & 2) - - def extract_module_from_symbol(self, symbol: str, descriptors: str = "") -> Optional[str]: - """ - Extract module name from SCIP symbol. - Default implementation for common patterns. - - Args: - symbol: SCIP symbol string - descriptors: SCIP descriptors if available - - Returns: - Module name or None if not extractable - """ - try: - if descriptors and '/' in descriptors: - # Extract from descriptors: module.py/symbol -> module - parts = descriptors.split('/') - if len(parts) >= 2: - file_part = parts[0] - if file_part.endswith('.py'): - return file_part[:-3].replace('/', '.') - return file_part.replace('/', '.') - - # Fallback: parse from symbol string - if symbol.startswith('external:'): - symbol_path = symbol[9:] - if '/' in symbol_path: - return symbol_path.split('/')[0] - elif '#' in symbol_path: - return symbol_path.split('#')[0] - return symbol_path.rstrip('.') - - except Exception as e: - logger.debug(f"Error extracting module from symbol {symbol}: {e}") - - return None - - -class AnalyzerCache: - """Shared caching system for analyzer results.""" - - def __init__(self): - self._symbol_cache: Dict[str, Dict[str, Any]] = {} - self._dependency_cache: Dict[str, str] = {} - self._module_cache: Dict[str, Set[str]] = {} - - def cache_symbol_metadata(self, symbol: str, metadata: Dict[str, Any]) -> None: - """Cache symbol metadata.""" - self._symbol_cache[symbol] = metadata - - def get_cached_symbol_metadata(self, symbol: str) -> Optional[Dict[str, Any]]: - """Retrieve cached symbol metadata.""" - return self._symbol_cache.get(symbol) - - def cache_dependency_classification(self, module: str, classification: str) -> None: - """Cache dependency classification result.""" - self._dependency_cache[module] = classification - - def get_cached_dependency_classification(self, module: str) -> Optional[str]: - """Retrieve cached dependency classification.""" - return self._dependency_cache.get(module) - - def cache_standard_library_modules(self, language: str, modules: Set[str]) -> None: - """Cache standard library modules for a language.""" - self._module_cache[language] = modules - - def get_cached_standard_library_modules(self, language: str) -> Optional[Set[str]]: - """Retrieve cached standard library modules.""" - return self._module_cache.get(language) - - -class BaseLanguageAnalyzer(LanguageAnalyzer): - """ - Base implementation providing common functionality for language analyzers. - - This class provides default implementations for common patterns while - requiring subclasses to implement language-specific logic. - """ - - def __init__(self): - super().__init__() - self._cache = AnalyzerCache() - self._standard_library_modules: Optional[Set[str]] = None - - def get_standard_library_modules(self) -> Set[str]: - """ - Get standard library modules with caching. - - Returns: - Set of standard library module names - """ - if self._standard_library_modules is None: - cached = self._cache.get_cached_standard_library_modules(self.language_name) - if cached is not None: - self._standard_library_modules = cached - else: - self._standard_library_modules = self._build_standard_library_modules() - self._cache.cache_standard_library_modules(self.language_name, self._standard_library_modules) - - return self._standard_library_modules - - @abstractmethod - def _build_standard_library_modules(self) -> Set[str]: - """Build the set of standard library modules for this language.""" - pass - - def classify_dependency(self, module_name: str) -> str: - """ - Classify dependency with caching support. - - Args: - module_name: Name of the module to classify - - Returns: - Classification string - """ - # Check cache first - cached = self._cache.get_cached_dependency_classification(module_name) - if cached is not None: - return cached - - # Perform classification - classification = self._classify_dependency_impl(module_name) - - # Cache result - self._cache.cache_dependency_classification(module_name, classification) - - return classification - - @abstractmethod - def _classify_dependency_impl(self, module_name: str) -> str: - """Implement the actual dependency classification logic.""" - pass - - def extract_symbol_metadata(self, symbol_info, document) -> Dict[str, Any]: - """ - Extract symbol metadata with caching. - - Args: - symbol_info: SCIP symbol information - document: SCIP document - - Returns: - Dictionary with symbol metadata - """ - symbol = getattr(symbol_info, 'symbol', '') - if not symbol: - return {} - - # Check cache - cached = self._cache.get_cached_symbol_metadata(symbol) - if cached is not None: - return cached - - # Extract metadata - metadata = self._extract_symbol_metadata_impl(symbol_info, document) - - # Cache result - self._cache.cache_symbol_metadata(symbol, metadata) - - return metadata - - @abstractmethod - def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: - """Implement language-specific symbol metadata extraction.""" - pass - - -class FallbackAnalyzer(BaseLanguageAnalyzer): - """ - Fallback analyzer for unsupported languages. - - Provides basic functionality when no language-specific analyzer is available. - """ - - def _get_language_name(self) -> str: - return "fallback" - - def _build_standard_library_modules(self) -> Set[str]: - """Fallback has no standard library modules.""" - return set() - - def _classify_dependency_impl(self, module_name: str) -> str: - """Basic classification for unknown languages.""" - if module_name.startswith('.'): - return 'local' - # Default to third_party for unknown languages - return 'third_party' - - def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: - """Basic import extraction using occurrence analysis.""" - try: - seen_modules = set() - - for occurrence in document.occurrences: - if not self.is_import_occurrence(occurrence): - continue - - symbol = occurrence.symbol - module_name = self.extract_module_from_symbol(symbol) - if module_name and module_name not in seen_modules: - classification = self.classify_dependency(module_name) - imports.add_import(module_name, classification) - seen_modules.add(module_name) - - except Exception as e: - logger.debug(f"Error in fallback import extraction: {e}") - - def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: - """Basic metadata extraction for fallback.""" - return { - 'source': 'fallback', - 'confidence': 'low' - } diff --git a/src/code_index_mcp/tools/scip/analyzers/factory.py b/src/code_index_mcp/tools/scip/analyzers/factory.py deleted file mode 100644 index 52c08b0..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/factory.py +++ /dev/null @@ -1,383 +0,0 @@ -""" -Language analyzer factory and registry. - -This module provides the factory pattern for creating language-specific analyzers -based on document language or file extension, following the SCIP Symbol Analyzer -refactoring plan. -""" - -import logging -from typing import Dict, Optional, Type, Set -from .base import LanguageAnalyzer, FallbackAnalyzer -from .python_analyzer import PythonAnalyzer -from .zig_analyzer import ZigAnalyzer -from .objc_analyzer import ObjectiveCAnalyzer -from .javascript_analyzer import JavaScriptAnalyzer - -logger = logging.getLogger(__name__) - - -class LanguageAnalyzerFactory: - """ - Factory for creating language-specific analyzers. - - This factory provides centralized management of language analyzers, - supporting dynamic registration and language detection based on - various criteria. - """ - - def __init__(self): - """Initialize the factory with default analyzers.""" - self._analyzers: Dict[str, Type[LanguageAnalyzer]] = {} - self._file_extension_map: Dict[str, str] = {} - self._language_aliases: Dict[str, str] = {} - self._analyzer_instances: Dict[str, LanguageAnalyzer] = {} - - # Register default analyzers - self._register_default_analyzers() - self._setup_file_extension_mapping() - self._setup_language_aliases() - - def _register_default_analyzers(self) -> None: - """Register all default language analyzers.""" - self.register_analyzer('python', PythonAnalyzer) - self.register_analyzer('zig', ZigAnalyzer) - self.register_analyzer('objective-c', ObjectiveCAnalyzer) - self.register_analyzer('javascript', JavaScriptAnalyzer) - self.register_analyzer('typescript', JavaScriptAnalyzer) # TypeScript uses JS analyzer - self.register_analyzer('fallback', FallbackAnalyzer) - - def _setup_file_extension_mapping(self) -> None: - """Setup mapping from file extensions to language names.""" - self._file_extension_map = { - # Python - '.py': 'python', - '.pyx': 'python', - '.pyi': 'python', - '.pyw': 'python', - - # Zig - '.zig': 'zig', - - # Objective-C - '.m': 'objective-c', - '.mm': 'objective-c', - '.h': 'objective-c', # Could be C/C++ too, but often ObjC in iOS/macOS projects - - # JavaScript/TypeScript - '.js': 'javascript', - '.jsx': 'javascript', - '.ts': 'typescript', - '.tsx': 'typescript', - '.mjs': 'javascript', - '.cjs': 'javascript', - - # Other languages that might be added later - '.java': 'java', - '.kt': 'kotlin', - '.swift': 'swift', - '.go': 'go', - '.rs': 'rust', - '.cpp': 'cpp', - '.cc': 'cpp', - '.cxx': 'cpp', - '.c': 'c', - '.cs': 'csharp', - '.rb': 'ruby', - '.php': 'php', - '.scala': 'scala', - '.clj': 'clojure', - '.sh': 'shell', - '.bash': 'shell', - '.zsh': 'shell', - '.fish': 'shell' - } - - def _setup_language_aliases(self) -> None: - """Setup aliases for language names.""" - self._language_aliases = { - # Python aliases - 'py': 'python', - 'python3': 'python', - - # JavaScript/TypeScript aliases - 'js': 'javascript', - 'jsx': 'javascript', - 'ts': 'typescript', - 'tsx': 'typescript', - 'ecmascript': 'javascript', - 'node': 'javascript', - 'nodejs': 'javascript', - - # Objective-C aliases - 'objc': 'objective-c', - 'obj-c': 'objective-c', - 'objective_c': 'objective-c', - 'objectivec': 'objective-c', - - # Other aliases - 'zigc': 'zig', - 'c++': 'cpp', - 'c#': 'csharp', - 'dotnet': 'csharp' - } - - def register_analyzer(self, language: str, analyzer_class: Type[LanguageAnalyzer]) -> None: - """ - Register a language analyzer. - - Args: - language: Language name (canonical form) - analyzer_class: Analyzer class to register - """ - self._analyzers[language.lower()] = analyzer_class - logger.debug(f"Registered analyzer for language: {language}") - - def get_analyzer(self, language: str = None, file_path: str = None) -> LanguageAnalyzer: - """ - Get appropriate analyzer for the given language or file. - - Args: - language: Language name (if known) - file_path: File path (for extension-based detection) - - Returns: - Language-specific analyzer or fallback analyzer - """ - detected_language = self._detect_language(language, file_path) - - # Return cached instance if available - if detected_language in self._analyzer_instances: - return self._analyzer_instances[detected_language] - - # Create new instance - analyzer_class = self._analyzers.get(detected_language) - if analyzer_class: - try: - analyzer = analyzer_class() - self._analyzer_instances[detected_language] = analyzer - return analyzer - except Exception as e: - logger.warning(f"Failed to create analyzer for {detected_language}: {e}") - - # Fallback to default analyzer - if 'fallback' not in self._analyzer_instances: - self._analyzer_instances['fallback'] = FallbackAnalyzer() - - return self._analyzer_instances['fallback'] - - def _detect_language(self, language: str = None, file_path: str = None) -> str: - """ - Detect language from various hints. - - Args: - language: Explicit language hint - file_path: File path for extension-based detection - - Returns: - Detected language name (normalized) - """ - # Method 1: Use explicit language if provided - if language: - normalized = self._normalize_language(language) - if normalized in self._analyzers: - return normalized - - # Method 2: Detect from file extension - if file_path: - file_extension = self._get_file_extension(file_path) - if file_extension in self._file_extension_map: - detected = self._file_extension_map[file_extension] - if detected in self._analyzers: - return detected - - # Method 3: Detect from file path patterns - if file_path: - path_based = self._detect_from_path_patterns(file_path) - if path_based and path_based in self._analyzers: - return path_based - - # Default to fallback - return 'fallback' - - def _normalize_language(self, language: str) -> str: - """ - Normalize language name using aliases. - - Args: - language: Raw language name - - Returns: - Normalized language name - """ - language_lower = language.lower().strip() - - # Check aliases first - if language_lower in self._language_aliases: - return self._language_aliases[language_lower] - - # Return as-is if no alias found - return language_lower - - def _get_file_extension(self, file_path: str) -> str: - """ - Extract file extension from path. - - Args: - file_path: File path - - Returns: - File extension (including dot) - """ - try: - if '.' in file_path: - return '.' + file_path.split('.')[-1].lower() - except Exception: - pass - return '' - - def _detect_from_path_patterns(self, file_path: str) -> Optional[str]: - """ - Detect language from file path patterns. - - Args: - file_path: File path - - Returns: - Detected language or None - """ - path_lower = file_path.lower() - - # JavaScript/TypeScript project patterns - if any(pattern in path_lower for pattern in ['node_modules', 'package.json', 'tsconfig']): - if any(ext in path_lower for ext in ['.ts', '.tsx']): - return 'typescript' - return 'javascript' - - # Python project patterns - if any(pattern in path_lower for pattern in ['__pycache__', 'requirements.txt', 'setup.py', '.py']): - return 'python' - - # Zig project patterns - if any(pattern in path_lower for pattern in ['build.zig', '.zig']): - return 'zig' - - # Objective-C project patterns - if any(pattern in path_lower for pattern in ['.xcodeproj', '.xcworkspace', 'podfile']): - return 'objective-c' - - return None - - def get_supported_languages(self) -> Set[str]: - """ - Get set of supported languages. - - Returns: - Set of supported language names - """ - return set(self._analyzers.keys()) - - def get_supported_extensions(self) -> Set[str]: - """ - Get set of supported file extensions. - - Returns: - Set of supported file extensions - """ - return set(self._file_extension_map.keys()) - - def is_language_supported(self, language: str) -> bool: - """ - Check if a language is supported. - - Args: - language: Language name to check - - Returns: - True if language is supported - """ - normalized = self._normalize_language(language) - return normalized in self._analyzers - - def clear_cache(self) -> None: - """Clear cached analyzer instances.""" - self._analyzer_instances.clear() - logger.debug("Cleared analyzer instance cache") - - def get_analyzer_info(self) -> Dict[str, Dict[str, any]]: - """ - Get information about registered analyzers. - - Returns: - Dictionary with analyzer information - """ - info = {} - for language, analyzer_class in self._analyzers.items(): - try: - analyzer = analyzer_class() - info[language] = { - 'class': analyzer_class.__name__, - 'supported_extensions': [ - ext for ext, lang in self._file_extension_map.items() - if lang == language - ], - 'aliases': [ - alias for alias, canonical in self._language_aliases.items() - if canonical == language - ], - 'standard_library_modules': len(analyzer.get_standard_library_modules()) - } - except Exception as e: - info[language] = { - 'class': analyzer_class.__name__, - 'error': str(e) - } - - return info - - -# Global factory instance -_factory_instance: Optional[LanguageAnalyzerFactory] = None - - -def get_analyzer_factory() -> LanguageAnalyzerFactory: - """ - Get the global analyzer factory instance. - - Returns: - Global LanguageAnalyzerFactory instance - """ - global _factory_instance - if _factory_instance is None: - _factory_instance = LanguageAnalyzerFactory() - return _factory_instance - - -def get_analyzer(language: str = None, file_path: str = None) -> LanguageAnalyzer: - """ - Convenience function to get a language analyzer. - - Args: - language: Language name (if known) - file_path: File path (for extension-based detection) - - Returns: - Appropriate language analyzer - """ - return get_analyzer_factory().get_analyzer(language, file_path) - - -def register_custom_analyzer(language: str, analyzer_class: Type[LanguageAnalyzer]) -> None: - """ - Register a custom language analyzer. - - Args: - language: Language name - analyzer_class: Custom analyzer class - """ - get_analyzer_factory().register_analyzer(language, analyzer_class) - - -def get_supported_languages() -> Set[str]: - """Get set of all supported languages.""" - return get_analyzer_factory().get_supported_languages() diff --git a/src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py deleted file mode 100644 index 72228c4..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/javascript_analyzer.py +++ /dev/null @@ -1,410 +0,0 @@ -""" -JavaScript/TypeScript language-specific SCIP symbol analyzer. - -This module handles JavaScript and TypeScript specific logic for import parsing, -dependency classification, and symbol metadata extraction. -""" - -import logging -from typing import Dict, List, Optional, Any, Set -from .base import BaseLanguageAnalyzer -from ..symbol_definitions import ImportGroup - -logger = logging.getLogger(__name__) - - -class JavaScriptAnalyzer(BaseLanguageAnalyzer): - """ - JavaScript/TypeScript language-specific SCIP symbol analyzer. - - Handles JavaScript and TypeScript specific import parsing, dependency - classification, and symbol metadata extraction. - """ - - def _get_language_name(self) -> str: - return "javascript" - - def _build_standard_library_modules(self) -> Set[str]: - """Build JavaScript/Node.js built-in modules set.""" - return { - # Node.js built-in modules - 'assert', 'async_hooks', 'buffer', 'child_process', 'cluster', - 'console', 'constants', 'crypto', 'dgram', 'dns', 'domain', - 'events', 'fs', 'http', 'http2', 'https', 'inspector', - 'module', 'net', 'os', 'path', 'perf_hooks', 'process', - 'punycode', 'querystring', 'readline', 'repl', 'stream', - 'string_decoder', 'timers', 'tls', 'trace_events', 'tty', - 'url', 'util', 'v8', 'vm', 'worker_threads', 'zlib', - - # Web APIs (for browser environment) - 'window', 'document', 'navigator', 'location', 'history', - 'localStorage', 'sessionStorage', 'fetch', 'XMLHttpRequest', - 'WebSocket', 'Worker', 'ServiceWorker', 'MessageChannel', - 'BroadcastChannel', 'AbortController', 'URL', 'URLSearchParams', - 'Blob', 'File', 'FileReader', 'FormData', 'Headers', - 'Request', 'Response', 'ReadableStream', 'WritableStream', - 'TransformStream', 'TextEncoder', 'TextDecoder', - 'Intl', 'JSON', 'Math', 'Date', 'RegExp', 'Promise', - 'Proxy', 'Reflect', 'Symbol', 'Map', 'Set', 'WeakMap', - 'WeakSet', 'ArrayBuffer', 'DataView', 'Int8Array', - 'Uint8Array', 'Int16Array', 'Uint16Array', 'Int32Array', - 'Uint32Array', 'Float32Array', 'Float64Array', 'BigInt64Array', - 'BigUint64Array' - } - - def _classify_dependency_impl(self, module_name: str) -> str: - """ - Classify JavaScript/TypeScript dependency based on module patterns. - - Args: - module_name: Module name to classify - - Returns: - Classification: 'standard_library', 'third_party', or 'local' - """ - # Local imports (relative paths) - if module_name.startswith('./') or module_name.startswith('../'): - return 'local' - - # Absolute local imports (no node_modules) - if module_name.startswith('/') or module_name.startswith('~'): - return 'local' - - # Check for common project patterns - if any(pattern in module_name for pattern in ['src/', 'lib/', 'app/', '@/']): - return 'local' - - # Node.js built-in modules - base_module = module_name.split('/')[0] - if base_module in self.get_standard_library_modules(): - return 'standard_library' - - # Check for common scoped packages (third-party) - if module_name.startswith('@'): - return 'third_party' - - # Common third-party indicators - third_party_indicators = { - 'react', 'vue', 'angular', 'jquery', 'lodash', 'moment', - 'express', 'koa', 'fastify', 'webpack', 'babel', 'eslint', - 'typescript', 'jest', 'mocha', 'chai', 'sinon', 'cypress', - 'puppeteer', 'playwright', 'storybook', 'next', 'nuxt', - 'gatsby', 'vite', 'rollup', 'parcel', 'styled-components', - 'emotion', 'material-ui', 'antd', 'bootstrap', 'tailwind' - } - - if base_module in third_party_indicators: - return 'third_party' - - # Everything else is likely third_party in JavaScript ecosystem - return 'third_party' - - def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: - """ - Extract JavaScript/TypeScript imports from SCIP document. - - Args: - document: SCIP document containing symbols and occurrences - imports: ImportGroup to populate with extracted imports - symbol_parser: Optional SCIPSymbolManager for enhanced parsing - """ - try: - seen_modules = set() - - if symbol_parser: - # Extract using symbol parser - for occurrence in document.occurrences: - if not self.is_import_occurrence(occurrence): - continue - - symbol_info = symbol_parser.parse_symbol(occurrence.symbol) - if not symbol_info: - continue - - # Handle different manager types - if symbol_info.manager == 'npm': - # npm packages - package_name = symbol_info.package or self._extract_package_from_descriptors(symbol_info.descriptors) - if package_name and package_name not in seen_modules: - classification = self.classify_dependency(package_name) - imports.add_import(package_name, classification) - seen_modules.add(package_name) - - elif symbol_info.manager in ['builtin', 'node']: - # Node.js built-ins - module_name = self._extract_module_from_descriptors(symbol_info.descriptors) - if module_name and module_name not in seen_modules: - imports.add_import(module_name, 'standard_library') - seen_modules.add(module_name) - - elif symbol_info.manager == 'local': - # Local imports - module_path = self._extract_local_module_path(symbol_info.descriptors) - if module_path and module_path not in seen_modules: - imports.add_import(module_path, 'local') - seen_modules.add(module_path) - - else: - # Fallback: basic extraction without symbol parser - self._extract_imports_fallback(document, imports, seen_modules) - - logger.debug(f"Extracted {len(seen_modules)} JavaScript imports") - - except Exception as e: - logger.debug(f"Error extracting JavaScript imports: {e}") - - def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: - """ - Extract JavaScript/TypeScript specific symbol metadata. - - Args: - symbol_info: SCIP symbol information - document: SCIP document - - Returns: - Dictionary with JavaScript/TypeScript specific metadata - """ - metadata = { - 'language': 'javascript', - 'source': 'javascript_analyzer' - } - - try: - # Extract type information (especially for TypeScript) - if hasattr(symbol_info, 'signature') and symbol_info.signature: - signature = symbol_info.signature - metadata['signature'] = signature - - # Parse TypeScript-specific patterns - if '=>' in signature: - metadata['is_arrow_function'] = True - - if 'async' in signature: - metadata['is_async'] = True - - if 'export' in signature: - metadata['is_exported'] = True - - if 'default' in signature: - metadata['is_default_export'] = True - - # Parse function parameters - if '(' in signature and ')' in signature: - params = self._parse_js_parameters(signature) - if params: - metadata['parameters'] = params - - # Parse return type (TypeScript) - if ':' in signature and '=>' not in signature: - parts = signature.split(':') - if len(parts) > 1: - type_part = parts[-1].strip() - metadata['type'] = type_part - - # Extract symbol characteristics - symbol = getattr(symbol_info, 'symbol', '') - if symbol: - metadata['is_class'] = self._is_js_class(symbol) - metadata['is_interface'] = self._is_ts_interface(symbol) - metadata['is_type'] = self._is_ts_type(symbol) - metadata['is_enum'] = self._is_ts_enum(symbol) - metadata['is_namespace'] = self._is_ts_namespace(symbol) - metadata['scope'] = self._classify_js_scope(symbol) - - # Extract JSDoc documentation - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - metadata['documentation'] = symbol_info.documentation - metadata['has_jsdoc'] = any('@' in line for line in symbol_info.documentation) - - except Exception as e: - logger.debug(f"Error extracting JavaScript metadata: {e}") - metadata['extraction_error'] = str(e) - - return metadata - - def _extract_package_from_descriptors(self, descriptors: str) -> Optional[str]: - """ - Extract package name from SCIP descriptors for JavaScript. - - Args: - descriptors: SCIP descriptors string - - Returns: - Package name or None - """ - try: - # Handle descriptors like 'react/' or 'lodash/map' - if '/' in descriptors: - package_part = descriptors.split('/')[0] - # Handle scoped packages like @types/node - if package_part.startswith('@'): - parts = descriptors.split('/') - if len(parts) >= 2: - return f"{parts[0]}/{parts[1]}" - return package_part - return descriptors.strip('/') - except Exception: - return None - - def _extract_local_module_path(self, descriptors: str) -> Optional[str]: - """ - Extract local module path from descriptors for JavaScript. - - Args: - descriptors: SCIP descriptors string - - Returns: - Module path or None - """ - try: - # Handle local JavaScript imports - if '/' in descriptors: - parts = descriptors.split('/') - if len(parts) >= 1: - file_part = parts[0] - # Remove common JavaScript extensions - for ext in ['.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs']: - if file_part.endswith(ext): - file_part = file_part[:-len(ext)] - break - return file_part - return None - except Exception: - return None - - def _extract_imports_fallback(self, document, imports: ImportGroup, seen_modules: Set[str]) -> None: - """Fallback import extraction without symbol parser.""" - try: - for occurrence in document.occurrences: - if not self.is_import_occurrence(occurrence): - continue - - symbol = occurrence.symbol - module_name = self.extract_module_from_symbol(symbol) - if module_name and module_name not in seen_modules: - classification = self.classify_dependency(module_name) - imports.add_import(module_name, classification) - seen_modules.add(module_name) - except Exception as e: - logger.debug(f"Error in JavaScript fallback import extraction: {e}") - - def _parse_js_parameters(self, signature: str) -> List[str]: - """ - Parse parameter names from JavaScript/TypeScript function signature. - - Args: - signature: Function signature string - - Returns: - List of parameter names - """ - try: - if '(' in signature and ')' in signature: - # Find the parameter section - start = signature.find('(') - end = signature.find(')', start) - if start < end: - param_section = signature[start + 1:end] - if not param_section.strip(): - return [] - - params = [] - # Split by comma, but be careful of nested parentheses and generics - current_param = "" - paren_depth = 0 - bracket_depth = 0 - - for char in param_section: - if char == '(': - paren_depth += 1 - elif char == ')': - paren_depth -= 1 - elif char == '<': - bracket_depth += 1 - elif char == '>': - bracket_depth -= 1 - elif char == ',' and paren_depth == 0 and bracket_depth == 0: - params.append(current_param.strip()) - current_param = "" - continue - - current_param += char - - if current_param.strip(): - params.append(current_param.strip()) - - # Extract just parameter names (before : or =) - param_names = [] - for param in params: - # Handle destructuring and rest parameters - param = param.strip() - if param.startswith('...'): - param = param[3:].strip() - - # Extract name before type annotation or default value - if ':' in param: - param = param.split(':')[0].strip() - elif '=' in param: - param = param.split('=')[0].strip() - - if param and not param.startswith('{') and not param.startswith('['): - param_names.append(param) - - return param_names - except Exception as e: - logger.debug(f"Error parsing JavaScript parameters: {e}") - - return [] - - def _is_js_class(self, symbol: str) -> bool: - """Check if symbol represents a JavaScript class.""" - try: - return 'class' in symbol.lower() or '/Class' in symbol - except Exception: - return False - - def _is_ts_interface(self, symbol: str) -> bool: - """Check if symbol represents a TypeScript interface.""" - try: - return 'interface' in symbol.lower() or '/Interface' in symbol - except Exception: - return False - - def _is_ts_type(self, symbol: str) -> bool: - """Check if symbol represents a TypeScript type alias.""" - try: - return 'type' in symbol.lower() and not 'typeof' in symbol.lower() - except Exception: - return False - - def _is_ts_enum(self, symbol: str) -> bool: - """Check if symbol represents a TypeScript enum.""" - try: - return 'enum' in symbol.lower() or '/Enum' in symbol - except Exception: - return False - - def _is_ts_namespace(self, symbol: str) -> bool: - """Check if symbol represents a TypeScript namespace.""" - try: - return 'namespace' in symbol.lower() or '/Namespace' in symbol - except Exception: - return False - - def _classify_js_scope(self, symbol: str) -> str: - """ - Classify JavaScript symbol scope. - - Args: - symbol: SCIP symbol string - - Returns: - Scope classification - """ - # Basic scope classification for JavaScript - if '//' in symbol or symbol.count('/') > 2: - return 'nested' - elif '/' in symbol: - return 'module' - else: - return 'global' \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py deleted file mode 100644 index 6de9c5c..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/objc_analyzer.py +++ /dev/null @@ -1,366 +0,0 @@ -""" -Objective-C language-specific SCIP symbol analyzer. - -This module handles Objective-C specific logic extracted from the monolithic -SCIPSymbolAnalyzer, including framework detection and system library classification. -""" - -import logging -from typing import Dict, List, Optional, Any, Set -from .base import BaseLanguageAnalyzer -from ..symbol_definitions import ImportGroup - -logger = logging.getLogger(__name__) - - -class ObjectiveCAnalyzer(BaseLanguageAnalyzer): - """ - Objective-C language-specific SCIP symbol analyzer. - - Handles Objective-C specific framework imports, system library detection, - and symbol metadata extraction. - """ - - def _get_language_name(self) -> str: - return "objective-c" - - def _build_standard_library_modules(self) -> Set[str]: - """Build comprehensive Objective-C system frameworks set.""" - return { - # Core frameworks (iOS and macOS) - 'Foundation', 'CoreFoundation', 'CoreData', 'CoreGraphics', - 'QuartzCore', 'CoreAnimation', 'CoreImage', 'CoreText', - 'Security', 'SystemConfiguration', 'CFNetwork', - - # UI frameworks - 'UIKit', 'AppKit', 'Cocoa', 'SwiftUI', - - # Media frameworks - 'AVFoundation', 'AVKit', 'AudioToolbox', 'AudioUnit', - 'VideoToolbox', 'MediaPlayer', 'Photos', 'PhotosUI', - 'CoreAudio', 'CoreMIDI', 'CoreMedia', 'ImageIO', - - # Graphics and gaming - 'Metal', 'MetalKit', 'GameplayKit', 'SpriteKit', 'SceneKit', - 'GLKit', 'OpenGLES', 'CoreMotion', 'ARKit', 'RealityKit', - - # Location and maps - 'CoreLocation', 'MapKit', 'Contacts', 'ContactsUI', - - # Web and networking - 'WebKit', 'JavaScriptCore', 'NetworkExtension', - - # Data and storage - 'CloudKit', 'CoreSpotlight', 'EventKit', 'EventKitUI', - 'HealthKit', 'HealthKitUI', 'HomeKit', 'HomeKitUI', - - # Device and sensors - 'CoreBluetooth', 'ExternalAccessory', 'CoreNFC', - 'CoreTelephony', 'CallKit', 'PushKit', - - # Machine learning and AI - 'CoreML', 'Vision', 'NaturalLanguage', 'Speech', - 'SoundAnalysis', - - # Development tools - 'XCTest', 'os', 'Accelerate', 'simd', - - # Legacy frameworks - 'AddressBook', 'AddressBookUI', 'AssetsLibrary', - 'MobileCoreServices', 'Social', 'Accounts', - - # watchOS specific - 'WatchKit', 'ClockKit', 'WatchConnectivity', - - # tvOS specific - 'TVUIKit', 'TVMLKit', - - # macOS specific - 'Carbon', 'ApplicationServices', 'CoreServices', - 'IOKit', 'DiskArbitration', 'FSEvents', 'ServiceManagement', - 'LaunchServices', 'SearchKit', 'PreferencePanes', - 'InstantMessage', 'Automator', 'CalendarStore', - 'Collaboration', 'CoreWLAN', 'DiscRecording', - 'DiscRecordingUI', 'DVDPlayback', 'ExceptionHandling', - 'FWAUserLib', 'InstallerPlugins', 'IOBluetooth', - 'IOBluetoothUI', 'Kernel', 'LDAP', 'Message', - 'OpenDirectory', 'OSAKit', 'PubSub', 'QTKit', - 'Quartz', 'QuartzComposer', 'QuickLook', 'ScreenSaver', - 'ScriptingBridge', 'SyncServices', 'Tcl', 'Tk', - 'WebKit', 'XgridFoundation' - } - - def _classify_dependency_impl(self, module_name: str) -> str: - """ - Classify Objective-C dependency based on framework patterns. - - Args: - module_name: Framework/module name to classify - - Returns: - Classification: 'standard_library', 'third_party', or 'local' - """ - # Local imports (project-specific) - if any(pattern in module_name for pattern in ['.', '/', 'Private', 'Internal']): - return 'local' - - # System frameworks check - if module_name in self.get_standard_library_modules(): - return 'standard_library' - - # Third-party framework indicators - third_party_indicators = { - 'AFNetworking', 'Alamofire', 'SDWebImage', 'MBProgressHUD', - 'JSONModel', 'RestKit', 'Firebase', 'ReactiveCocoa', - 'Masonry', 'SnapKit', 'Realm', 'FMDB', 'SQLite', - 'GoogleAnalytics', 'Fabric', 'Crashlytics', 'TestFlight', - 'Facebook', 'Twitter', 'Instagram', 'Pods' - } - - for indicator in third_party_indicators: - if indicator in module_name: - return 'third_party' - - # CocoaPods/Carthage patterns - if any(pattern in module_name for pattern in ['Pod', 'Carthage', 'SPM']): - return 'third_party' - - # Default to standard_library for unknown frameworks - # (Objective-C tends to have many system frameworks) - return 'standard_library' - - def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: - """ - Extract Objective-C imports from SCIP document. - - Args: - document: SCIP document containing symbols and occurrences - imports: ImportGroup to populate with extracted imports - symbol_parser: Optional SCIPSymbolManager for enhanced parsing - """ - try: - seen_modules = set() - - # Method 1: Extract from occurrences with Import role - if symbol_parser: - for occurrence in document.occurrences: - if not self.is_import_occurrence(occurrence): - continue - - symbol_info = symbol_parser.parse_symbol(occurrence.symbol) - if not symbol_info: - continue - - # Handle based on manager type - if symbol_info.manager in ['system', 'framework']: - framework_name = symbol_info.package or self._extract_framework_from_descriptors(symbol_info.descriptors) - if framework_name and framework_name not in seen_modules: - imports.add_import(framework_name, 'standard_library') - seen_modules.add(framework_name) - - elif symbol_info.manager in ['cocoapods', 'carthage', 'third_party']: - package_name = symbol_info.package or self._extract_framework_from_descriptors(symbol_info.descriptors) - if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'third_party') - seen_modules.add(package_name) - - elif symbol_info.manager == 'local': - module_path = self._extract_local_module_path(symbol_info.descriptors) - if module_path and module_path not in seen_modules: - imports.add_import(module_path, 'local') - seen_modules.add(module_path) - - # Method 2: Extract from external symbols (if available in index) - # This handles frameworks detected during indexing but not in occurrences - self._extract_from_external_symbols_if_available(imports, seen_modules, symbol_parser) - - logger.debug(f"Extracted {len(seen_modules)} Objective-C imports/frameworks") - - except Exception as e: - logger.debug(f"Error extracting Objective-C imports: {e}") - - def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: - """ - Extract Objective-C specific symbol metadata. - - Args: - symbol_info: SCIP symbol information - document: SCIP document - - Returns: - Dictionary with Objective-C specific metadata - """ - metadata = { - 'language': 'objective-c', - 'source': 'objc_analyzer' - } - - try: - # Extract method signature patterns - if hasattr(symbol_info, 'signature') and symbol_info.signature: - signature = symbol_info.signature - metadata['signature'] = signature - - # Parse Objective-C method patterns - if signature.startswith('-') or signature.startswith('+'): - metadata['is_method'] = True - metadata['is_instance_method'] = signature.startswith('-') - metadata['is_class_method'] = signature.startswith('+') - - # Parse method parameters (Objective-C style) - if ':' in signature: - metadata['parameter_count'] = signature.count(':') - metadata['method_labels'] = self._extract_method_labels(signature) - - # Parse return type - if ')' in signature and '(' in signature: - return_type_match = signature.split(')') - if len(return_type_match) > 0: - return_type = return_type_match[0].strip('(+-') - if return_type: - metadata['return_type'] = return_type - - # Extract property characteristics - symbol = getattr(symbol_info, 'symbol', '') - if symbol: - metadata['is_property'] = self._is_objc_property(symbol) - metadata['is_protocol'] = self._is_objc_protocol(symbol) - metadata['is_category'] = self._is_objc_category(symbol) - metadata['framework'] = self._extract_framework_from_symbol(symbol) - - # Extract documentation - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - metadata['documentation'] = symbol_info.documentation - - except Exception as e: - logger.debug(f"Error extracting Objective-C metadata: {e}") - metadata['extraction_error'] = str(e) - - return metadata - - def _extract_framework_from_descriptors(self, descriptors: str) -> Optional[str]: - """ - Extract framework name from SCIP descriptors for Objective-C. - - Args: - descriptors: SCIP descriptors string - - Returns: - Framework name or None - """ - try: - # Handle descriptors like 'Foundation/' or 'UIKit/UIView' - if '/' in descriptors: - return descriptors.split('/')[0] - return descriptors.strip('/') - except Exception: - return None - - def _extract_local_module_path(self, descriptors: str) -> Optional[str]: - """ - Extract local module path from descriptors for Objective-C. - - Args: - descriptors: SCIP descriptors string - - Returns: - Module path or None - """ - try: - # Handle local Objective-C files - if '/' in descriptors: - parts = descriptors.split('/') - if len(parts) >= 2: - file_part = parts[0] - if file_part.endswith('.h') or file_part.endswith('.m'): - return file_part - return file_part - return None - except Exception: - return None - - def _extract_from_external_symbols_if_available(self, imports: ImportGroup, seen_modules: Set[str], symbol_parser) -> None: - """ - Extract additional imports from external symbols if available. - This method would be called with the full SCIP index if available. - """ - # This method would need to be integrated with the main analyzer - # to access external symbols from the SCIP index - pass - - def _extract_method_labels(self, signature: str) -> List[str]: - """ - Extract Objective-C method labels from signature. - - Args: - signature: Method signature string - - Returns: - List of method labels - """ - try: - # Parse Objective-C method signature like: "-(void)setName:(NSString*)name withAge:(int)age" - labels = [] - parts = signature.split(':') - for part in parts[:-1]: # Exclude last part after final : - # Extract the label (word before the colon) - words = part.strip().split() - if words: - label = words[-1] - if label and not label.startswith('(') and not label.startswith('-') and not label.startswith('+'): - labels.append(label) - return labels - except Exception: - return [] - - def _is_objc_property(self, symbol: str) -> bool: - """Check if symbol represents an Objective-C property.""" - try: - # Properties often have specific patterns in SCIP symbols - return '@property' in symbol or 'property' in symbol.lower() - except Exception: - return False - - def _is_objc_protocol(self, symbol: str) -> bool: - """Check if symbol represents an Objective-C protocol.""" - try: - return '@protocol' in symbol or 'protocol' in symbol.lower() - except Exception: - return False - - def _is_objc_category(self, symbol: str) -> bool: - """Check if symbol represents an Objective-C category.""" - try: - # Categories often have + in their symbol representation - return '(' in symbol and ')' in symbol - except Exception: - return False - - def _extract_framework_from_symbol(self, symbol: str) -> Optional[str]: - """ - Extract framework name from SCIP symbol string. - - Args: - symbol: SCIP symbol string - - Returns: - Framework name or None - """ - try: - # Handle various SCIP symbol formats for frameworks - if 'Foundation' in symbol: - return 'Foundation' - elif 'UIKit' in symbol: - return 'UIKit' - # Add more specific framework detection as needed - - # Generic extraction from symbol structure - if ' ' in symbol: - parts = symbol.split() - for part in parts: - if part in self.get_standard_library_modules(): - return part - - return None - except Exception: - return None diff --git a/src/code_index_mcp/tools/scip/analyzers/python_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/python_analyzer.py deleted file mode 100644 index 10aea2d..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/python_analyzer.py +++ /dev/null @@ -1,400 +0,0 @@ -""" -Python language-specific SCIP symbol analyzer. - -This module handles Python-specific logic extracted from the monolithic -SCIPSymbolAnalyzer, following the refactoring plan for modular architecture. -""" - -import logging -from typing import Dict, List, Optional, Any, Set -from .base import BaseLanguageAnalyzer -from ..symbol_definitions import ImportGroup - -logger = logging.getLogger(__name__) - - -class PythonAnalyzer(BaseLanguageAnalyzer): - """ - Python language-specific SCIP symbol analyzer. - - Handles Python-specific import parsing, dependency classification, - and symbol metadata extraction. - """ - - def _get_language_name(self) -> str: - return "python" - - def _build_standard_library_modules(self) -> Set[str]: - """Build comprehensive Python standard library module set.""" - return { - # Core modules - 'os', 'sys', 'json', 'time', 'datetime', 'logging', 'pathlib', - 'typing', 'dataclasses', 'functools', 'itertools', 'collections', - 're', 'math', 'random', 'threading', 'subprocess', 'shutil', - 'contextlib', 'traceback', 'warnings', 'weakref', 'copy', - 'pickle', 'base64', 'hashlib', 'hmac', 'uuid', 'urllib', - 'http', 'socketserver', 'email', 'mimetypes', 'csv', 'configparser', - 'argparse', 'getopt', 'tempfile', 'glob', 'fnmatch', 'linecache', - 'pprint', 'textwrap', 'string', 'struct', 'codecs', 'unicodedata', - 'io', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', - - # Network and web - 'socket', 'ssl', 'ftplib', 'poplib', 'imaplib', 'smtplib', - 'xmlrpc', 'webbrowser', - - # Data formats - 'xml', 'html', 'sqlite3', 'dbm', 'marshal', - - # Development tools - 'unittest', 'doctest', 'pdb', 'profile', 'cProfile', 'timeit', - 'trace', 'cgitb', 'py_compile', 'compileall', 'dis', 'pickletools', - - # System services - 'errno', 'ctypes', 'syslog', 'curses', 'platform', - - # Internationalization - 'locale', 'gettext', - - # Multimedia - 'audioop', 'wave', 'chunk', 'sunau', 'aifc', 'colorsys', - - # Cryptographic services - 'secrets', 'hashlib', 'hmac', - - # File and directory access - 'stat', 'fileinput', 'filecmp', 'shutil', 'macpath', - - # Data persistence - 'shelve', 'copyreg', - - # Data compression and archiving - 'zlib', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', - - # File formats - 'csv', 'netrc', 'xdrlib', 'plistlib', - - # Internet protocols and support - 'ipaddress', 'mailbox', 'mimetypes', - - # Structured markup processing tools - 'html', 'xml', - - # Internet data handling - 'json', 'base64', 'binascii', 'uu', 'quopri', - - # Numeric and mathematical modules - 'numbers', 'decimal', 'fractions', 'statistics', 'cmath', - - # Functional programming modules - 'operator', 'functools', 'itertools', - - # Python language services - 'ast', 'symtable', 'symbol', 'token', 'tokenize', 'keyword', - 'tabnanny', 'pyclbr', 'py_compile', 'compileall', 'dis', - 'pickletools', 'distutils', - - # Importing modules - 'importlib', 'pkgutil', 'modulefinder', 'runpy', - - # Python runtime services - 'atexit', 'gc', 'inspect', 'site', '__future__', '__main__', - - # Custom Python interpreters - 'code', 'codeop', - - # MS Windows specific services - 'msvcrt', 'winreg', 'winsound', - - # Unix specific services - 'posix', 'pwd', 'grp', 'crypt', 'termios', 'tty', 'pty', - 'fcntl', 'pipes', 'resource', 'nis', 'syslog', - - # Superseded modules - 'optparse', 'imp' - } - - def _classify_dependency_impl(self, module_name: str) -> str: - """ - Classify Python dependency based on module patterns. - - Args: - module_name: Python module name to classify - - Returns: - Classification: 'standard_library', 'third_party', or 'local' - """ - # Local imports (relative imports or project-specific patterns) - if module_name.startswith('.'): - return 'local' - - # Check for common project patterns - if any(pattern in module_name for pattern in ['src.', 'lib.', 'app.', 'project.']): - return 'local' - - # Standard library check - base_module = module_name.split('.')[0] - if base_module in self.get_standard_library_modules(): - return 'standard_library' - - # Everything else is third_party - return 'third_party' - - def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: - """ - Extract Python imports from SCIP document. - - Args: - document: SCIP document containing symbols and occurrences - imports: ImportGroup to populate with extracted imports - symbol_parser: Optional SCIPSymbolManager for enhanced parsing - """ - if not symbol_parser: - logger.debug("No symbol parser available for Python import extraction") - return - - try: - seen_modules = set() - - # Extract from occurrences with Import role - for occurrence in document.occurrences: - if not self.is_import_occurrence(occurrence): - continue - - symbol_info = symbol_parser.parse_symbol(occurrence.symbol) - if not symbol_info: - continue - - # Handle based on manager type - if symbol_info.manager == 'stdlib': - module_name = self._extract_module_from_descriptors(symbol_info.descriptors) - if module_name and module_name not in seen_modules: - imports.add_import(module_name, 'standard_library') - seen_modules.add(module_name) - - elif symbol_info.manager == 'pip': - # pip packages: package name is the module name - package_name = symbol_info.package - if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'third_party') - seen_modules.add(package_name) - - elif symbol_info.manager == 'local': - # Local imports: extract module path from descriptors - module_path = self._extract_local_module_path(symbol_info.descriptors) - if module_path and module_path not in seen_modules: - imports.add_import(module_path, 'local') - seen_modules.add(module_path) - - logger.debug(f"Extracted {len(seen_modules)} Python imports") - - except Exception as e: - logger.debug(f"Error extracting Python imports: {e}") - - def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: - """ - Extract Python-specific symbol metadata. - - Args: - symbol_info: SCIP symbol information - document: SCIP document - - Returns: - Dictionary with Python-specific metadata - """ - metadata = { - 'language': 'python', - 'source': 'python_analyzer' - } - - try: - # Extract documentation/docstring - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - metadata['documentation'] = symbol_info.documentation - - # Parse special documentation markers from Python AST analyzer - for doc_line in symbol_info.documentation: - if doc_line.startswith('Parameters: '): - param_str = doc_line[12:] - metadata['parameters'] = [p.strip() for p in param_str.split(',') if p.strip()] - elif doc_line == 'Async function': - metadata['is_async'] = True - elif doc_line.startswith('Decorators: '): - decorator_str = doc_line[12:] - metadata['decorators'] = [d.strip() for d in decorator_str.split(',') if d.strip()] - - # Extract type information from signature - if hasattr(symbol_info, 'signature') and symbol_info.signature: - signature = symbol_info.signature - metadata['signature'] = signature - - # Parse return type - if '->' in signature: - return_type = signature.split('->')[-1].strip() - metadata['return_type'] = return_type - - # Parse parameters from signature - if '(' in signature and ')' in signature and 'parameters' not in metadata: - metadata['parameters'] = self._parse_signature_parameters(signature) - - # Parse variable type annotation - if ':' in signature and '->' not in signature: - type_part = signature.split(':')[1].strip() - metadata['type'] = type_part - - # Parse constant value - if '=' in signature: - value_part = signature.split('=')[1].strip() - metadata['value'] = value_part - - # Classify symbol role - symbol = getattr(symbol_info, 'symbol', '') - if symbol: - metadata['scope'] = self._classify_symbol_scope(symbol) - metadata['is_private'] = self._is_private_symbol(symbol) - metadata['is_dunder'] = self._is_dunder_method(symbol) - - except Exception as e: - logger.debug(f"Error extracting Python metadata: {e}") - metadata['extraction_error'] = str(e) - - return metadata - - def _extract_module_from_descriptors(self, descriptors: str) -> Optional[str]: - """ - Extract module name from SCIP descriptors for Python. - - Args: - descriptors: SCIP descriptors string - - Returns: - Module name or None - """ - try: - # Handle descriptors like 'os/' or 'pathlib/Path' - if '/' in descriptors: - return descriptors.split('/')[0] - return descriptors.strip('/') - except Exception: - return None - - def _extract_local_module_path(self, descriptors: str) -> Optional[str]: - """ - Extract local module path from descriptors for Python. - - Args: - descriptors: SCIP descriptors string - - Returns: - Module path or None - """ - try: - # Handle descriptors like 'utils.py/helper_function' -> 'utils' - # or 'services/user_service.py/UserService' -> 'services.user_service' - if '/' in descriptors: - parts = descriptors.split('/') - if len(parts) >= 2: - file_part = parts[0] - if file_part.endswith('.py'): - return file_part[:-3].replace('/', '.') - return file_part.replace('/', '.') - return None - except Exception: - return None - - def _parse_signature_parameters(self, signature: str) -> List[str]: - """ - Parse parameter names from Python function signature. - - Args: - signature: Function signature string - - Returns: - List of parameter names - """ - try: - if '(' in signature and ')' in signature: - param_section = signature.split('(')[1].split(')')[0] - if not param_section.strip(): - return [] - - params = [] - for param in param_section.split(','): - param = param.strip() - if param: - # Extract parameter name (before type annotation) - param_name = param.split(':')[0].strip() - if param_name: - params.append(param_name) - - return params - except Exception as e: - logger.debug(f"Error parsing Python signature parameters: {e}") - - return [] - - def _classify_symbol_scope(self, symbol: str) -> str: - """ - Classify Python symbol scope (global, class, function). - - Args: - symbol: SCIP symbol string - - Returns: - Scope classification - """ - if '#' not in symbol: - return 'global' - elif symbol.count('#') == 1: - return 'class' - else: - return 'function' - - def _is_private_symbol(self, symbol: str) -> bool: - """ - Check if symbol is private (starts with underscore). - - Args: - symbol: SCIP symbol string - - Returns: - True if symbol appears to be private - """ - try: - # Extract symbol name from various SCIP formats - if '#' in symbol: - name = symbol.split('#')[-1] - elif '/' in symbol: - name = symbol.split('/')[-1] - else: - name = symbol.split('.')[-1] - - # Clean up name - name = name.rstrip('().#') - return name.startswith('_') and not name.startswith('__') - except Exception: - return False - - def _is_dunder_method(self, symbol: str) -> bool: - """ - Check if symbol is a dunder (double underscore) method. - - Args: - symbol: SCIP symbol string - - Returns: - True if symbol appears to be a dunder method - """ - try: - # Extract symbol name - if '#' in symbol: - name = symbol.split('#')[-1] - elif '/' in symbol: - name = symbol.split('/')[-1] - else: - name = symbol.split('.')[-1] - - # Clean up name - name = name.rstrip('().#') - return name.startswith('__') and name.endswith('__') - except Exception: - return False diff --git a/src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py b/src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py deleted file mode 100644 index 332950a..0000000 --- a/src/code_index_mcp/tools/scip/analyzers/zig_analyzer.py +++ /dev/null @@ -1,300 +0,0 @@ -""" -Zig language-specific SCIP symbol analyzer. - -This module handles Zig-specific logic extracted from the monolithic -SCIPSymbolAnalyzer, including Zig import classification and standard library detection. -""" - -import logging -from typing import Dict, List, Optional, Any, Set -from .base import BaseLanguageAnalyzer -from ..symbol_definitions import ImportGroup - -logger = logging.getLogger(__name__) - - -class ZigAnalyzer(BaseLanguageAnalyzer): - """ - Zig language-specific SCIP symbol analyzer. - - Handles Zig-specific import parsing, dependency classification, - and symbol metadata extraction. - """ - - def _get_language_name(self) -> str: - return "zig" - - def _build_standard_library_modules(self) -> Set[str]: - """Build comprehensive Zig standard library module set.""" - return { - # Core standard library - 'std', 'builtin', 'testing', - - # Data structures and algorithms - 'math', 'mem', 'sort', 'hash', 'crypto', - - # Text and formatting - 'fmt', 'ascii', 'unicode', 'json', - - # System interaction - 'os', 'fs', 'process', 'thread', 'atomic', - - # Networking and I/O - 'net', 'http', 'io', - - # Compression and encoding - 'compress', 'base64', - - # Development and debugging - 'debug', 'log', 'meta', 'comptime', - - # Utilities - 'rand', 'time', 'zig', - - # Platform-specific - 'c', 'wasm', - - # Build system - 'build', 'target' - } - - def _classify_dependency_impl(self, module_name: str) -> str: - """ - Classify Zig dependency based on module patterns. - - Args: - module_name: Zig module name to classify - - Returns: - Classification: 'standard_library', 'third_party', or 'local' - """ - # Local imports (relative paths or .zig files) - if (module_name.startswith('./') or - module_name.startswith('../') or - module_name.endswith('.zig')): - return 'local' - - # Standard library check - if module_name in self.get_standard_library_modules(): - return 'standard_library' - - # Check for common Zig package patterns - if any(pattern in module_name for pattern in ['zig-', 'pkg/', 'deps/']): - return 'third_party' - - # Everything else is third_party (Zig doesn't have as many stdlib modules as Python) - return 'third_party' - - def extract_imports(self, document, imports: ImportGroup, symbol_parser=None) -> None: - """ - Extract Zig imports from SCIP document. - - Args: - document: SCIP document containing symbols and occurrences - imports: ImportGroup to populate with extracted imports - symbol_parser: Optional SCIPSymbolManager for enhanced parsing - """ - if not symbol_parser: - logger.debug("No symbol parser available for Zig import extraction") - return - - try: - seen_modules = set() - - # Extract from occurrences with Import role - for occurrence in document.occurrences: - if not self.is_import_occurrence(occurrence): - continue - - symbol_info = symbol_parser.parse_symbol(occurrence.symbol) - if not symbol_info: - continue - - # Handle Zig-specific patterns - if symbol_info.manager == 'local': - # Local imports: extract from descriptors - module_path = self._extract_zig_local_module_path(symbol_info.descriptors) - if module_path and module_path not in seen_modules: - import_type = self.classify_dependency(module_path) - imports.add_import(module_path, import_type) - seen_modules.add(module_path) - - elif symbol_info.manager in ['system', 'stdlib']: - # Standard library imports - module_name = self._extract_module_from_descriptors(symbol_info.descriptors) - if module_name and module_name not in seen_modules: - imports.add_import(module_name, 'standard_library') - seen_modules.add(module_name) - - elif symbol_info.manager in ['third_party', 'pkg']: - # Third-party packages - package_name = symbol_info.package or self._extract_module_from_descriptors(symbol_info.descriptors) - if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'third_party') - seen_modules.add(package_name) - - logger.debug(f"Extracted {len(seen_modules)} Zig imports") - - except Exception as e: - logger.debug(f"Error extracting Zig imports: {e}") - - def _extract_symbol_metadata_impl(self, symbol_info, document) -> Dict[str, Any]: - """ - Extract Zig-specific symbol metadata. - - Args: - symbol_info: SCIP symbol information - document: SCIP document - - Returns: - Dictionary with Zig-specific metadata - """ - metadata = { - 'language': 'zig', - 'source': 'zig_analyzer' - } - - try: - # Extract type information from signature - if hasattr(symbol_info, 'signature') and symbol_info.signature: - signature = symbol_info.signature - metadata['signature'] = signature - - # Parse Zig-specific type patterns - if ':' in signature: - # Variable/field type: name: Type - type_part = signature.split(':', 1)[1].strip() - metadata['type'] = type_part - - # Parse function return type (Zig uses different syntax) - if '!' in signature: - # Error union type - metadata['can_error'] = True - - if 'comptime' in signature: - metadata['is_comptime'] = True - - if 'pub' in signature: - metadata['is_public'] = True - else: - metadata['is_private'] = True - - # Extract documentation if available - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - metadata['documentation'] = symbol_info.documentation - - # Classify symbol characteristics - symbol = getattr(symbol_info, 'symbol', '') - if symbol: - metadata['scope'] = self._classify_zig_symbol_scope(symbol) - metadata['is_test'] = self._is_zig_test_symbol(symbol) - metadata['is_generic'] = self._is_zig_generic_symbol(symbol) - - except Exception as e: - logger.debug(f"Error extracting Zig metadata: {e}") - metadata['extraction_error'] = str(e) - - return metadata - - def _extract_zig_local_module_path(self, descriptors: str) -> Optional[str]: - """ - Extract local module path from descriptors for Zig. - - Args: - descriptors: SCIP descriptors string - - Returns: - Module path or None - """ - try: - # Handle Zig descriptors like: - # 'test/sample-projects/zig/code-index-example/src/main.zig/std.' -> 'std' - # 'src/utils.zig/helper_function' -> 'utils' - if '/' in descriptors: - parts = descriptors.split('/') - if len(parts) >= 2: - # For Zig: if we have a .zig file, the symbol after it is the import - for i, part in enumerate(parts): - if part.endswith('.zig') and i + 1 < len(parts): - # Next part is the imported symbol/module - symbol_name = parts[i + 1].rstrip('.') - return symbol_name - - # Fallback: traditional file-based extraction - file_part = parts[0] - if file_part.endswith('.zig'): - return file_part[:-4] # Remove .zig extension - return file_part - return None - except Exception: - return None - - def _extract_module_from_descriptors(self, descriptors: str) -> Optional[str]: - """ - Extract module name from SCIP descriptors for Zig. - - Args: - descriptors: SCIP descriptors string - - Returns: - Module name or None - """ - try: - # Handle descriptors like 'std/' or 'std/mem' - if '/' in descriptors: - return descriptors.split('/')[0] - return descriptors.strip('/.') - except Exception: - return None - - def _classify_zig_symbol_scope(self, symbol: str) -> str: - """ - Classify Zig symbol scope. - - Args: - symbol: SCIP symbol string - - Returns: - Scope classification - """ - # Zig doesn't use # for scope like other languages - if '/' in symbol: - parts = symbol.count('/') - if parts == 1: - return 'module' - elif parts >= 2: - return 'nested' - return 'global' - - def _is_zig_test_symbol(self, symbol: str) -> bool: - """ - Check if symbol is a Zig test. - - Args: - symbol: SCIP symbol string - - Returns: - True if symbol appears to be a test - """ - try: - # Zig tests often contain 'test' in their symbol path - return 'test' in symbol.lower() - except Exception: - return False - - def _is_zig_generic_symbol(self, symbol: str) -> bool: - """ - Check if symbol is a generic (comptime) function/type. - - Args: - symbol: SCIP symbol string - - Returns: - True if symbol appears to be generic - """ - try: - # This would require more sophisticated analysis - # For now, just check for common generic patterns - return 'comptime' in symbol.lower() or 'generic' in symbol.lower() - except Exception: - return False \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/__init__.py b/src/code_index_mcp/tools/scip/dependencies/__init__.py deleted file mode 100644 index 3b31207..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Unified dependency classification and management system. - -This package provides the dependency management system that replaces scattered -dependency logic throughout the SCIPSymbolAnalyzer, following the refactoring -plan for centralized and configurable dependency classification. - -Key Components: -- DependencyClassifier: Main dependency classification engine -- DependencyConfig: Abstract base for language-specific configurations -- DependencyRegistry: Centralized registry and caching system -- ImportNormalizer: Import path normalization utilities - -The system supports: -- Configurable classification rules per language -- Caching for performance optimization -- Standard library detection -- Third-party package identification -- Local/project import detection -- Custom classification rules -""" - -from .classifier import DependencyClassifier -from .registry import DependencyRegistry -from .normalizer import ImportNormalizer -from .configs import get_dependency_config - -__all__ = [ - 'DependencyClassifier', - 'DependencyRegistry', - 'ImportNormalizer', - 'get_dependency_config' -] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/classifier.py b/src/code_index_mcp/tools/scip/dependencies/classifier.py deleted file mode 100644 index 6a539c9..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/classifier.py +++ /dev/null @@ -1,361 +0,0 @@ -""" -Main dependency classifier engine. - -This module provides the centralized DependencyClassifier that replaces scattered -dependency logic throughout the SCIPSymbolAnalyzer, supporting configurable -classification rules per language. -""" - -import logging -from typing import Dict, Set, List, Optional, Any -from .configs import get_dependency_config, BaseDependencyConfig -from .registry import DependencyRegistry -from .normalizer import ImportNormalizer - -logger = logging.getLogger(__name__) - - -class DependencyClassifier: - """ - Main dependency classification engine. - - This class provides centralized dependency classification with support for: - - Language-specific classification rules - - Caching for performance optimization - - Context-aware classification - - Custom rule registration - - Batch processing capabilities - """ - - def __init__(self): - """Initialize the dependency classifier.""" - self._configs: Dict[str, BaseDependencyConfig] = {} - self._registry = DependencyRegistry() - self._normalizer = ImportNormalizer() - self._context_cache: Dict[str, Dict[str, Any]] = {} - - def classify_import( - self, - import_path: str, - language: str, - context: Optional[Dict[str, Any]] = None - ) -> str: - """ - Classify an import path based on language-specific rules. - - Args: - import_path: Import path to classify - language: Programming language - context: Optional context information (project structure, etc.) - - Returns: - Classification: 'standard_library', 'third_party', or 'local' - """ - if not import_path: - return 'local' - - # Normalize the import path - normalized_path = self._normalizer.normalize_import_path(import_path, language) - - # Check cache first - cache_key = f"{language}:{normalized_path}" - cached_result = self._registry.get_cached_classification(cache_key) - if cached_result is not None: - return cached_result - - # Get language-specific configuration - config = self._get_config(language) - - # Perform classification - classification = config.classify_import(normalized_path, context) - - # Cache the result - self._registry.cache_classification(cache_key, classification) - - logger.debug(f"Classified {import_path} ({language}) as {classification}") - return classification - - def classify_batch( - self, - imports: List[str], - language: str, - context: Optional[Dict[str, Any]] = None - ) -> Dict[str, str]: - """ - Classify multiple imports efficiently. - - Args: - imports: List of import paths to classify - language: Programming language - context: Optional context information - - Returns: - Dictionary mapping import_path -> classification - """ - results = {} - config = self._get_config(language) - - for import_path in imports: - if not import_path: - results[import_path] = 'local' - continue - - # Normalize the import path - normalized_path = self._normalizer.normalize_import_path(import_path, language) - - # Check cache first - cache_key = f"{language}:{normalized_path}" - cached_result = self._registry.get_cached_classification(cache_key) - - if cached_result is not None: - results[import_path] = cached_result - else: - # Perform classification - classification = config.classify_import(normalized_path, context) - results[import_path] = classification - - # Cache the result - self._registry.cache_classification(cache_key, classification) - - logger.debug(f"Classified {len(imports)} imports for {language}") - return results - - def get_standard_library_modules(self, language: str) -> Set[str]: - """ - Get standard library modules for a language. - - Args: - language: Programming language - - Returns: - Set of standard library module names - """ - config = self._get_config(language) - return config.get_stdlib_modules() - - def is_standard_library(self, import_path: str, language: str) -> bool: - """ - Check if an import is from the standard library. - - Args: - import_path: Import path to check - language: Programming language - - Returns: - True if import is from standard library - """ - return self.classify_import(import_path, language) == 'standard_library' - - def is_third_party(self, import_path: str, language: str) -> bool: - """ - Check if an import is third-party. - - Args: - import_path: Import path to check - language: Programming language - - Returns: - True if import is third-party - """ - return self.classify_import(import_path, language) == 'third_party' - - def is_local(self, import_path: str, language: str) -> bool: - """ - Check if an import is local. - - Args: - import_path: Import path to check - language: Programming language - - Returns: - True if import is local - """ - return self.classify_import(import_path, language) == 'local' - - def register_custom_config(self, language: str, config: BaseDependencyConfig) -> None: - """ - Register a custom dependency configuration for a language. - - Args: - language: Language name - config: Custom configuration instance - """ - self._configs[language.lower()] = config - logger.debug(f"Registered custom dependency config for {language}") - - def update_context(self, project_path: str, context: Dict[str, Any]) -> None: - """ - Update context information for a project. - - Args: - project_path: Path to the project - context: Context information to cache - """ - self._context_cache[project_path] = context - logger.debug(f"Updated context for project: {project_path}") - - def get_context(self, project_path: str) -> Optional[Dict[str, Any]]: - """ - Get cached context information for a project. - - Args: - project_path: Path to the project - - Returns: - Cached context or None - """ - return self._context_cache.get(project_path) - - def extract_dependencies_from_file( - self, - file_path: str, - file_content: str, - language: str - ) -> List[str]: - """ - Extract dependencies from package manager files. - - Args: - file_path: Path to the package manager file - file_content: Content of the file - language: Programming language - - Returns: - List of dependency names - """ - config = self._get_config(language) - return config.extract_dependencies_from_file(file_path, file_content) - - def get_package_manager_files(self, language: str) -> Set[str]: - """ - Get package manager files for a language. - - Args: - language: Programming language - - Returns: - Set of package manager file names - """ - config = self._get_config(language) - return config.get_package_manager_files() - - def detect_package_version( - self, - package_name: str, - language: str, - context: Optional[Dict[str, Any]] = None - ) -> Optional[str]: - """ - Detect version of a package if possible. - - Args: - package_name: Name of the package - language: Programming language - context: Optional context information - - Returns: - Package version or None if not detectable - """ - config = self._get_config(language) - if config.supports_version_detection(): - return config.detect_package_version(package_name, context) - return None - - def get_supported_languages(self) -> Set[str]: - """ - Get set of supported languages. - - Returns: - Set of supported language names - """ - # Languages supported by default configs - supported = {'python', 'zig', 'javascript', 'typescript', 'objective-c'} - # Add custom registered languages - supported.update(self._configs.keys()) - return supported - - def clear_cache(self) -> None: - """Clear all cached data.""" - self._registry.clear_cache() - self._context_cache.clear() - logger.debug("Cleared dependency classifier cache") - - def get_classification_stats(self) -> Dict[str, Any]: - """ - Get statistics about classification operations. - - Returns: - Dictionary with classification statistics - """ - return self._registry.get_stats() - - def _get_config(self, language: str) -> BaseDependencyConfig: - """ - Get or create configuration for a language. - - Args: - language: Programming language - - Returns: - Language-specific dependency configuration - """ - language_lower = language.lower() - - # Check if we have a custom config - if language_lower in self._configs: - return self._configs[language_lower] - - # Get default config - config = get_dependency_config(language_lower) - self._configs[language_lower] = config - - return config - - -# Global classifier instance -_classifier_instance: Optional[DependencyClassifier] = None - - -def get_dependency_classifier() -> DependencyClassifier: - """ - Get the global dependency classifier instance. - - Returns: - Global DependencyClassifier instance - """ - global _classifier_instance - if _classifier_instance is None: - _classifier_instance = DependencyClassifier() - return _classifier_instance - - -def classify_import( - import_path: str, - language: str, - context: Optional[Dict[str, Any]] = None -) -> str: - """ - Convenience function to classify an import. - - Args: - import_path: Import path to classify - language: Programming language - context: Optional context information - - Returns: - Classification string - """ - return get_dependency_classifier().classify_import(import_path, language, context) - - -def get_standard_library_modules(language: str) -> Set[str]: - """ - Convenience function to get standard library modules. - - Args: - language: Programming language - - Returns: - Set of standard library module names - """ - return get_dependency_classifier().get_standard_library_modules(language) diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/__init__.py b/src/code_index_mcp/tools/scip/dependencies/configs/__init__.py deleted file mode 100644 index 5cb33d5..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/configs/__init__.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Language-specific dependency configuration system. - -This package provides language-specific dependency configurations that define -how imports and dependencies should be classified for each supported language. - -Key Components: -- BaseDependencyConfig: Abstract base class for all configurations -- PythonConfig: Python-specific dependency rules -- ZigConfig: Zig-specific dependency rules -- JavaScriptConfig: JavaScript/TypeScript dependency rules -- ObjectiveCConfig: Objective-C framework classification rules - -Each configuration defines: -- Standard library module sets -- Third-party package detection rules -- Local import patterns -- Package manager integration -- Custom classification logic -""" - -from .base import BaseDependencyConfig -from .python import PythonDependencyConfig -from .zig import ZigDependencyConfig -from .javascript import JavaScriptDependencyConfig -from .objc import ObjectiveCDependencyConfig - -# Configuration registry -_CONFIGS = { - 'python': PythonDependencyConfig, - 'zig': ZigDependencyConfig, - 'javascript': JavaScriptDependencyConfig, - 'typescript': JavaScriptDependencyConfig, # TypeScript uses JS config - 'objective-c': ObjectiveCDependencyConfig, -} - -def get_dependency_config(language: str) -> BaseDependencyConfig: - """ - Get dependency configuration for the specified language. - - Args: - language: Language name - - Returns: - Language-specific dependency configuration - """ - language_lower = language.lower() - config_class = _CONFIGS.get(language_lower) - - if config_class: - return config_class() - - # Return base config for unsupported languages - return BaseDependencyConfig() - -def register_dependency_config(language: str, config_class) -> None: - """ - Register a custom dependency configuration. - - Args: - language: Language name - config_class: Configuration class - """ - _CONFIGS[language.lower()] = config_class - -__all__ = [ - 'BaseDependencyConfig', - 'PythonDependencyConfig', - 'ZigDependencyConfig', - 'JavaScriptDependencyConfig', - 'ObjectiveCDependencyConfig', - 'get_dependency_config', - 'register_dependency_config' -] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/base.py b/src/code_index_mcp/tools/scip/dependencies/configs/base.py deleted file mode 100644 index 5972a37..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/configs/base.py +++ /dev/null @@ -1,236 +0,0 @@ -""" -Base dependency configuration class. - -This module provides the abstract base class for language-specific dependency -configurations, defining the interface and common functionality. -""" - -import logging -from abc import ABC, abstractmethod -from typing import Set, Dict, List, Optional, Pattern -import re - -logger = logging.getLogger(__name__) - - -class BaseDependencyConfig(ABC): - """ - Abstract base class for language-specific dependency configurations. - - Each language configuration defines how to classify imports and dependencies - as standard_library, third_party, or local based on language-specific patterns. - """ - - def __init__(self): - """Initialize the dependency configuration.""" - self._stdlib_modules: Optional[Set[str]] = None - self._third_party_patterns: List[Pattern] = [] - self._local_patterns: List[Pattern] = [] - self._package_manager_indicators: Set[str] = set() - - # Initialize patterns - self._compile_patterns() - - @abstractmethod - def get_language_name(self) -> str: - """Return the language name this configuration handles.""" - pass - - @abstractmethod - def get_stdlib_modules(self) -> Set[str]: - """Return set of standard library modules for this language.""" - pass - - def classify_import(self, import_path: str, context: Dict[str, any] = None) -> str: - """ - Classify import path based on language-specific rules. - - Args: - import_path: Import path to classify - context: Optional context information (file path, project structure, etc.) - - Returns: - Classification: 'standard_library', 'third_party', or 'local' - """ - if not import_path: - return 'local' # Default for empty imports - - # Step 1: Check for obvious local patterns first - if self._is_local_import(import_path, context): - return 'local' - - # Step 2: Check standard library - if self._is_stdlib_import(import_path): - return 'standard_library' - - # Step 3: Check third-party patterns - if self._is_third_party_import(import_path, context): - return 'third_party' - - # Step 4: Language-specific classification - return self._classify_import_impl(import_path, context) - - def normalize_import_path(self, raw_path: str) -> str: - """ - Normalize import path for consistent processing. - Default implementation just strips whitespace. - - Args: - raw_path: Raw import path - - Returns: - Normalized import path - """ - return raw_path.strip() - - def _compile_patterns(self) -> None: - """Compile regex patterns for efficient matching.""" - try: - # Default patterns - subclasses should override - self._third_party_patterns = [ - re.compile(r'^[a-zA-Z][a-zA-Z0-9_-]*$'), # Simple package names - ] - - self._local_patterns = [ - re.compile(r'^\.'), # Relative imports - re.compile(r'^/'), # Absolute local paths - ] - except Exception as e: - logger.warning(f"Error compiling patterns for {self.get_language_name()}: {e}") - - def _is_local_import(self, import_path: str, context: Dict[str, any] = None) -> bool: - """Check if import is local based on patterns.""" - # Relative imports are always local - if import_path.startswith('.'): - return True - - # Check compiled patterns - for pattern in self._local_patterns: - if pattern.match(import_path): - return True - - # Context-based checks - if context: - # Check against project-specific patterns - project_indicators = context.get('project_patterns', []) - for indicator in project_indicators: - if indicator in import_path: - return True - - return False - - def _is_stdlib_import(self, import_path: str) -> bool: - """Check if import is from standard library.""" - if self._stdlib_modules is None: - self._stdlib_modules = self.get_stdlib_modules() - - # Extract base module name - base_module = import_path.split('.')[0].split('/')[0] - return base_module in self._stdlib_modules - - def _is_third_party_import(self, import_path: str, context: Dict[str, any] = None) -> bool: - """Check if import is third-party based on patterns.""" - # Check compiled patterns - for pattern in self._third_party_patterns: - if pattern.match(import_path): - return True - - # Check package manager indicators - if context: - package_indicators = context.get('package_indicators', set()) - for indicator in package_indicators: - if indicator in import_path: - return True - - return False - - def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: - """ - Language-specific import classification implementation. - Default implementation returns 'third_party' for unknown imports. - - Args: - import_path: Import path to classify - context: Optional context information - - Returns: - Classification string - """ - return 'third_party' - - def get_package_manager_files(self) -> Set[str]: - """ - Return set of package manager files for this language. - Used to detect project structure and third-party dependencies. - - Returns: - Set of package manager file names - """ - return set() - - def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: - """ - Extract dependency list from package manager files. - - Args: - file_path: Path to the package manager file - file_content: Content of the file - - Returns: - List of dependency names - """ - # Default implementation returns empty list - # Subclasses should implement language-specific parsing - return [] - - def is_scoped_package(self, import_path: str) -> bool: - """ - Check if import represents a scoped package. - - Args: - import_path: Import path to check - - Returns: - True if import is a scoped package - """ - # Default implementation - no scoped packages - return False - - def get_package_name_from_import(self, import_path: str) -> str: - """ - Extract package name from import path. - - Args: - import_path: Full import path - - Returns: - Package name (first component typically) - """ - # Default implementation: return first component - if '/' in import_path: - return import_path.split('/')[0] - elif '.' in import_path: - return import_path.split('.')[0] - return import_path - - def supports_version_detection(self) -> bool: - """ - Check if this configuration supports version detection. - - Returns: - True if version detection is supported - """ - return False - - def detect_package_version(self, package_name: str, context: Dict[str, any] = None) -> Optional[str]: - """ - Detect version of a package if possible. - - Args: - package_name: Name of the package - context: Optional context (lock files, manifests, etc.) - - Returns: - Package version or None if not detectable - """ - return None diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/javascript.py b/src/code_index_mcp/tools/scip/dependencies/configs/javascript.py deleted file mode 100644 index a2099f5..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/configs/javascript.py +++ /dev/null @@ -1,283 +0,0 @@ -""" -JavaScript/TypeScript-specific dependency configuration. - -This module provides JavaScript and TypeScript specific dependency classification, -including npm/yarn package management and Node.js built-in modules. -""" - -import json -import re -import logging -from typing import Set, Dict, List, Optional -from .base import BaseDependencyConfig - -logger = logging.getLogger(__name__) - - -class JavaScriptDependencyConfig(BaseDependencyConfig): - """ - JavaScript/TypeScript-specific dependency configuration. - - Handles JavaScript and TypeScript import classification with support for: - - Node.js built-in modules - - npm/yarn package management - - ES6 modules and CommonJS - - Scoped packages (@scope/package) - - Relative and absolute imports - """ - - def get_language_name(self) -> str: - return "javascript" - - def get_stdlib_modules(self) -> Set[str]: - """Return Node.js built-in modules.""" - return { - # Node.js built-in modules - 'assert', 'async_hooks', 'buffer', 'child_process', 'cluster', - 'console', 'constants', 'crypto', 'dgram', 'dns', 'domain', - 'events', 'fs', 'http', 'http2', 'https', 'inspector', - 'module', 'net', 'os', 'path', 'perf_hooks', 'process', - 'punycode', 'querystring', 'readline', 'repl', 'stream', - 'string_decoder', 'timers', 'tls', 'trace_events', 'tty', - 'url', 'util', 'v8', 'vm', 'worker_threads', 'zlib' - } - - def _compile_patterns(self) -> None: - """Compile JavaScript-specific regex patterns.""" - try: - self._third_party_patterns = [ - # Standard npm package names - re.compile(r'^[a-z][a-z0-9-._]*$'), - # Scoped packages - re.compile(r'^@[a-z0-9-]+/[a-z0-9-._]+$'), - # Common frameworks and libraries - re.compile(r'^(react|vue|angular|express|lodash|jquery)'), - ] - - self._local_patterns = [ - # Relative imports - re.compile(r'^\.\.?/'), - # Absolute local paths - re.compile(r'^/[^/]'), - # Webpack aliases - re.compile(r'^@/'), - re.compile(r'^~/'), - # Common local patterns - re.compile(r'^(src|lib|components|utils|helpers)/'), - ] - except Exception as e: - logger.warning(f"Error compiling JavaScript patterns: {e}") - - def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: - """JavaScript-specific import classification.""" - # Handle scoped packages - if import_path.startswith('@'): - return 'third_party' - - # Check for common third-party packages - common_third_party = { - 'react', 'vue', 'angular', 'svelte', 'jquery', 'lodash', - 'express', 'koa', 'fastify', 'next', 'nuxt', 'gatsby', - 'webpack', 'vite', 'rollup', 'parcel', 'babel', 'typescript', - 'eslint', 'prettier', 'jest', 'mocha', 'cypress', 'playwright', - 'axios', 'fetch', 'node-fetch', 'superagent', 'got', - 'moment', 'dayjs', 'date-fns', 'luxon', - 'styled-components', 'emotion', '@emotion/react', - 'material-ui', '@mui/material', 'antd', 'bootstrap', - 'tailwindcss', 'bulma', 'semantic-ui-react', - 'redux', 'mobx', 'zustand', 'recoil', 'rxjs', - 'graphql', 'apollo-client', '@apollo/client', - 'socket.io', 'ws', 'uuid', 'bcrypt', 'jsonwebtoken', - 'mongoose', 'sequelize', 'prisma', 'typeorm' - } - - base_package = self.get_package_name_from_import(import_path) - if base_package in common_third_party: - return 'third_party' - - # Check context for npm/yarn info - if context: - # Check package.json dependencies - npm_deps = context.get('npm_dependencies', set()) - if base_package in npm_deps: - return 'third_party' - - # Check node_modules - node_modules = context.get('node_modules', set()) - if base_package in node_modules: - return 'third_party' - - # Default to third_party for JavaScript ecosystem - return 'third_party' - - def normalize_import_path(self, raw_path: str) -> str: - """Normalize JavaScript import path.""" - normalized = raw_path.strip() - - # Remove file extensions - for ext in ['.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs']: - if normalized.endswith(ext): - normalized = normalized[:-len(ext)] - break - - # Remove /index suffix - if normalized.endswith('/index'): - normalized = normalized[:-6] - - return normalized - - def get_package_manager_files(self) -> Set[str]: - """Return JavaScript package manager files.""" - return { - 'package.json', - 'package-lock.json', - 'yarn.lock', - 'pnpm-lock.yaml', - 'npm-shrinkwrap.json', - 'lerna.json', - 'rush.json' - } - - def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: - """Extract dependencies from JavaScript package manager files.""" - dependencies = [] - - try: - if file_path.endswith('package.json'): - dependencies = self._parse_package_json(file_content) - elif file_path.endswith('package-lock.json'): - dependencies = self._parse_package_lock(file_content) - elif file_path.endswith('yarn.lock'): - dependencies = self._parse_yarn_lock(file_content) - elif file_path.endswith('pnpm-lock.yaml'): - dependencies = self._parse_pnpm_lock(file_content) - except Exception as e: - logger.debug(f"Error parsing JavaScript dependency file {file_path}: {e}") - - return dependencies - - def _parse_package_json(self, content: str) -> List[str]: - """Parse package.json for dependencies.""" - dependencies = [] - try: - data = json.loads(content) - - # Extract from different dependency sections - for section in ['dependencies', 'devDependencies', 'peerDependencies', 'optionalDependencies']: - if section in data and isinstance(data[section], dict): - dependencies.extend(data[section].keys()) - - except Exception as e: - logger.debug(f"Error parsing package.json: {e}") - - return dependencies - - def _parse_package_lock(self, content: str) -> List[str]: - """Parse package-lock.json for dependencies.""" - dependencies = [] - try: - data = json.loads(content) - - # Extract from packages section (npm v7+) - if 'packages' in data: - for package_path in data['packages']: - if package_path.startswith('node_modules/'): - package_name = package_path[13:] # Remove 'node_modules/' prefix - if package_name and not package_name.startswith('@'): - dependencies.append(package_name) - elif package_name.startswith('@'): - # Handle scoped packages - dependencies.append(package_name) - - # Extract from dependencies section (npm v6) - elif 'dependencies' in data: - dependencies.extend(data['dependencies'].keys()) - - except Exception as e: - logger.debug(f"Error parsing package-lock.json: {e}") - - return dependencies - - def _parse_yarn_lock(self, content: str) -> List[str]: - """Parse yarn.lock for dependencies.""" - dependencies = [] - try: - # Parse yarn.lock format - for line in content.splitlines(): - line = line.strip() - if line and not line.startswith('#') and '@' in line and ':' in line: - # Extract package name from yarn.lock entry - package_spec = line.split(':')[0].strip() - if '"' in package_spec: - package_name = package_spec.split('"')[1] - if package_name and package_name not in dependencies: - # Remove version specifier - base_name = package_name.split('@')[0] if not package_name.startswith('@') else '@' + package_name.split('@')[1] - if base_name: - dependencies.append(base_name) - - except Exception as e: - logger.debug(f"Error parsing yarn.lock: {e}") - - return dependencies - - def _parse_pnpm_lock(self, content: str) -> List[str]: - """Parse pnpm-lock.yaml for dependencies.""" - dependencies = [] - try: - # Simple YAML parsing for dependencies - in_deps_section = False - for line in content.splitlines(): - line = line.strip() - if line in ['dependencies:', 'devDependencies:']: - in_deps_section = True - continue - elif line and not line.startswith(' ') and in_deps_section: - in_deps_section = False - elif in_deps_section and ':' in line: - dep_name = line.split(':')[0].strip() - if dep_name and not dep_name.startswith('#'): - dependencies.append(dep_name) - - except Exception as e: - logger.debug(f"Error parsing pnpm-lock.yaml: {e}") - - return dependencies - - def is_scoped_package(self, import_path: str) -> bool: - """Check if import is a scoped npm package.""" - return import_path.startswith('@') and '/' in import_path - - def get_package_name_from_import(self, import_path: str) -> str: - """Extract package name from JavaScript import path.""" - # Handle scoped packages - if import_path.startswith('@'): - parts = import_path.split('/') - if len(parts) >= 2: - return f"{parts[0]}/{parts[1]}" - return parts[0] - - # Regular packages - return import_path.split('/')[0] - - def supports_version_detection(self) -> bool: - """JavaScript supports version detection through package files.""" - return True - - def detect_package_version(self, package_name: str, context: Dict[str, any] = None) -> Optional[str]: - """Detect JavaScript package version from context.""" - if not context: - return None - - # Check package-lock.json or yarn.lock data - lock_data = context.get('lock_file_data', {}) - if package_name in lock_data: - return lock_data[package_name].get('version') - - # Check package.json dependencies - package_json = context.get('package_json', {}) - for dep_section in ['dependencies', 'devDependencies', 'peerDependencies']: - if dep_section in package_json and package_name in package_json[dep_section]: - return package_json[dep_section][package_name] - - return None diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/objc.py b/src/code_index_mcp/tools/scip/dependencies/configs/objc.py deleted file mode 100644 index 544d1b3..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/configs/objc.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -Objective-C-specific dependency configuration. - -This module provides Objective-C specific dependency classification, -including iOS/macOS framework detection and CocoaPods support. -""" - -import re -import logging -from typing import Set, Dict, List, Optional -from .base import BaseDependencyConfig - -logger = logging.getLogger(__name__) - - -class ObjectiveCDependencyConfig(BaseDependencyConfig): - """ - Objective-C-specific dependency configuration. - - Handles Objective-C framework and dependency classification with support for: - - iOS and macOS system frameworks - - CocoaPods package management - - Carthage dependency management - - Swift Package Manager integration - - Private framework detection - """ - - def get_language_name(self) -> str: - return "objective-c" - - def get_stdlib_modules(self) -> Set[str]: - """Return iOS/macOS system frameworks.""" - return { - # Core frameworks (iOS and macOS) - 'Foundation', 'CoreFoundation', 'CoreData', 'CoreGraphics', - 'QuartzCore', 'CoreAnimation', 'CoreImage', 'CoreText', - 'Security', 'SystemConfiguration', 'CFNetwork', - - # UI frameworks - 'UIKit', 'AppKit', 'Cocoa', 'SwiftUI', - - # Media frameworks - 'AVFoundation', 'AVKit', 'AudioToolbox', 'AudioUnit', - 'VideoToolbox', 'MediaPlayer', 'Photos', 'PhotosUI', - 'CoreAudio', 'CoreMIDI', 'CoreMedia', 'ImageIO', - - # Graphics and gaming - 'Metal', 'MetalKit', 'GameplayKit', 'SpriteKit', 'SceneKit', - 'GLKit', 'OpenGLES', 'CoreMotion', 'ARKit', 'RealityKit', - - # Location and maps - 'CoreLocation', 'MapKit', 'Contacts', 'ContactsUI', - - # Web and networking - 'WebKit', 'JavaScriptCore', 'NetworkExtension', - - # Data and storage - 'CloudKit', 'CoreSpotlight', 'EventKit', 'EventKitUI', - 'HealthKit', 'HealthKitUI', 'HomeKit', 'HomeKitUI', - - # Device and sensors - 'CoreBluetooth', 'ExternalAccessory', 'CoreNFC', - 'CoreTelephony', 'CallKit', 'PushKit', - - # Machine learning and AI - 'CoreML', 'Vision', 'NaturalLanguage', 'Speech', - 'SoundAnalysis', - - # Development tools - 'XCTest', 'os', 'Accelerate', 'simd', - - # Legacy frameworks - 'AddressBook', 'AddressBookUI', 'AssetsLibrary', - 'MobileCoreServices', 'Social', 'Accounts', - - # watchOS specific - 'WatchKit', 'ClockKit', 'WatchConnectivity', - - # tvOS specific - 'TVUIKit', 'TVMLKit', - - # macOS specific - 'Carbon', 'ApplicationServices', 'CoreServices', - 'IOKit', 'DiskArbitration', 'FSEvents', 'ServiceManagement', - 'LaunchServices', 'SearchKit', 'PreferencePanes', - 'InstantMessage', 'Automator', 'CalendarStore', - 'Collaboration', 'CoreWLAN', 'DiscRecording', - 'DiscRecordingUI', 'DVDPlayback', 'ExceptionHandling', - 'FWAUserLib', 'InstallerPlugins', 'IOBluetooth', - 'IOBluetoothUI', 'Kernel', 'LDAP', 'Message', - 'OpenDirectory', 'OSAKit', 'PubSub', 'QTKit', - 'Quartz', 'QuartzComposer', 'QuickLook', 'ScreenSaver', - 'ScriptingBridge', 'SyncServices', 'Tcl', 'Tk', - 'WebKit', 'XgridFoundation' - } - - def _compile_patterns(self) -> None: - """Compile Objective-C specific regex patterns.""" - try: - self._third_party_patterns = [ - # CocoaPods patterns - re.compile(r'^[A-Z][a-zA-Z0-9]*$'), # CamelCase frameworks - re.compile(r'^FB[A-Z][a-zA-Z0-9]*'), # Facebook frameworks - re.compile(r'^AF[A-Z][a-zA-Z0-9]*'), # AFNetworking family - re.compile(r'^SD[A-Z][a-zA-Z0-9]*'), # SDWebImage family - re.compile(r'^MB[A-Z][a-zA-Z0-9]*'), # MBProgressHUD family - re.compile(r'^Google[A-Z][a-zA-Z0-9]*'), # Google frameworks - re.compile(r'^Firebase[A-Z][a-zA-Z0-9]*'), # Firebase frameworks - ] - - self._local_patterns = [ - # Private frameworks - re.compile(r'Private'), - re.compile(r'Internal'), - # Local project patterns - re.compile(r'^[a-z]'), # lowercase frameworks are usually local - re.compile(r'\.framework'), - re.compile(r'/'), # Path-based imports - ] - except Exception as e: - logger.warning(f"Error compiling Objective-C patterns: {e}") - - def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: - """Objective-C specific import classification.""" - # Check for common third-party frameworks - common_third_party = { - 'AFNetworking', 'Alamofire', 'SDWebImage', 'MBProgressHUD', - 'JSONModel', 'RestKit', 'Firebase', 'ReactiveCocoa', - 'Masonry', 'SnapKit', 'Realm', 'FMDB', 'SQLite', - 'GoogleAnalytics', 'Fabric', 'Crashlytics', 'TestFlight', - 'Facebook', 'Twitter', 'Instagram', 'FBSDKCoreKit', - 'GoogleMaps', 'GooglePlaces', 'GoogleSignIn', - 'FirebaseCore', 'FirebaseAuth', 'FirebaseFirestore', - 'FirebaseDatabase', 'FirebaseStorage', 'FirebaseAnalytics', - 'Lottie', 'Charts', 'YYKit', 'Pop', 'IGListKit', - 'ComponentKit', 'Texture', 'AsyncDisplayKit' - } - - base_framework = self.get_package_name_from_import(import_path) - if base_framework in common_third_party: - return 'third_party' - - # Check for CocoaPods/Carthage patterns - if any(indicator in import_path for indicator in ['Pods/', 'Carthage/', 'Build/Products']): - return 'third_party' - - # Check context for dependency management info - if context: - # Check Podfile dependencies - pods = context.get('cocoapods_dependencies', set()) - if base_framework in pods: - return 'third_party' - - # Check Cartfile dependencies - carthage_deps = context.get('carthage_dependencies', set()) - if base_framework in carthage_deps: - return 'third_party' - - # Check SPM dependencies - spm_deps = context.get('spm_dependencies', set()) - if base_framework in spm_deps: - return 'third_party' - - # Private or internal frameworks are local - if 'Private' in import_path or 'Internal' in import_path: - return 'local' - - # Default to standard_library for unknown Apple frameworks - # (Objective-C ecosystem has many system frameworks) - return 'standard_library' - - def normalize_import_path(self, raw_path: str) -> str: - """Normalize Objective-C import path.""" - normalized = raw_path.strip() - - # Remove .framework suffix - if normalized.endswith('.framework'): - normalized = normalized[:-10] - - # Remove file extensions - for ext in ['.h', '.m', '.mm']: - if normalized.endswith(ext): - normalized = normalized[:-len(ext)] - break - - # Extract framework name from paths - if '/' in normalized: - # Extract the last component (framework name) - normalized = normalized.split('/')[-1] - - return normalized - - def get_package_manager_files(self) -> Set[str]: - """Return Objective-C package manager files.""" - return { - 'Podfile', - 'Podfile.lock', - 'Cartfile', - 'Cartfile.resolved', - 'Package.swift', - 'Package.resolved', - 'project.pbxproj' # Xcode project file - } - - def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: - """Extract dependencies from Objective-C package manager files.""" - dependencies = [] - - try: - if 'Podfile' in file_path and not file_path.endswith('.lock'): - dependencies = self._parse_podfile(file_content) - elif file_path.endswith('Podfile.lock'): - dependencies = self._parse_podfile_lock(file_content) - elif 'Cartfile' in file_path: - dependencies = self._parse_cartfile(file_content) - elif file_path.endswith('Package.swift'): - dependencies = self._parse_package_swift(file_content) - elif file_path.endswith('project.pbxproj'): - dependencies = self._parse_pbxproj(file_content) - except Exception as e: - logger.debug(f"Error parsing Objective-C dependency file {file_path}: {e}") - - return dependencies - - def _parse_podfile(self, content: str) -> List[str]: - """Parse Podfile for CocoaPods dependencies.""" - dependencies = [] - try: - for line in content.splitlines(): - line = line.strip() - if line.startswith('pod '): - # Extract pod name - match = re.search(r"pod\s+['\"]([^'\"]+)['\"]", line) - if match: - pod_name = match.group(1) - dependencies.append(pod_name) - except Exception as e: - logger.debug(f"Error parsing Podfile: {e}") - - return dependencies - - def _parse_podfile_lock(self, content: str) -> List[str]: - """Parse Podfile.lock for installed pods.""" - dependencies = [] - try: - in_pods_section = False - for line in content.splitlines(): - line = line.strip() - if line.startswith('PODS:'): - in_pods_section = True - continue - elif in_pods_section and line.startswith('DEPENDENCIES:'): - break - elif in_pods_section and line.startswith('- '): - # Extract pod name - pod_spec = line[2:].strip() - if '(' in pod_spec: - pod_name = pod_spec.split('(')[0].strip() - else: - pod_name = pod_spec.split(' ')[0].strip() - if pod_name: - dependencies.append(pod_name) - except Exception as e: - logger.debug(f"Error parsing Podfile.lock: {e}") - - return dependencies - - def _parse_cartfile(self, content: str) -> List[str]: - """Parse Cartfile for Carthage dependencies.""" - dependencies = [] - try: - for line in content.splitlines(): - line = line.strip() - if line and not line.startswith('#'): - # Extract dependency name from Carthage format - parts = line.split() - if len(parts) >= 2: - repo = parts[1] - if '/' in repo: - # Extract framework name from GitHub repo - framework_name = repo.split('/')[-1] - if framework_name: - dependencies.append(framework_name) - except Exception as e: - logger.debug(f"Error parsing Cartfile: {e}") - - return dependencies - - def _parse_package_swift(self, content: str) -> List[str]: - """Parse Package.swift for Swift Package Manager dependencies.""" - dependencies = [] - try: - # Look for .package declarations - for line in content.splitlines(): - line = line.strip() - if '.package(' in line: - # Extract package name or URL - match = re.search(r'url:\s*["\']([^"\']+)["\']', line) - if match: - url = match.group(1) - if '/' in url: - package_name = url.split('/')[-1] - if package_name.endswith('.git'): - package_name = package_name[:-4] - dependencies.append(package_name) - except Exception as e: - logger.debug(f"Error parsing Package.swift: {e}") - - return dependencies - - def _parse_pbxproj(self, content: str) -> List[str]: - """Parse Xcode project file for framework references.""" - dependencies = [] - try: - # Look for framework references in pbxproj - for line in content.splitlines(): - if '.framework' in line: - # Extract framework names - matches = re.findall(r'([A-Za-z0-9_]+)\.framework', line) - for framework in matches: - if framework not in dependencies: - dependencies.append(framework) - except Exception as e: - logger.debug(f"Error parsing project.pbxproj: {e}") - - return dependencies - - def get_package_name_from_import(self, import_path: str) -> str: - """Extract framework name from Objective-C import path.""" - # Remove common prefixes/suffixes - normalized = import_path - - if normalized.endswith('.framework'): - normalized = normalized[:-10] - - # Extract framework name from paths - if '/' in normalized: - normalized = normalized.split('/')[-1] - - # Remove file extensions - for ext in ['.h', '.m', '.mm']: - if normalized.endswith(ext): - normalized = normalized[:-len(ext)] - break - - return normalized diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/python.py b/src/code_index_mcp/tools/scip/dependencies/configs/python.py deleted file mode 100644 index 02f0f38..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/configs/python.py +++ /dev/null @@ -1,355 +0,0 @@ -""" -Python-specific dependency configuration. - -This module provides Python-specific dependency classification rules, -including comprehensive standard library detection and pip package management. -""" - -import json -import re -import logging -from typing import Set, Dict, List, Optional, Pattern -from .base import BaseDependencyConfig - -logger = logging.getLogger(__name__) - - -class PythonDependencyConfig(BaseDependencyConfig): - """ - Python-specific dependency configuration. - - Handles Python import classification with support for: - - Comprehensive standard library detection - - pip/conda package management - - Virtual environment detection - - Relative and absolute import patterns - - PEP 420 namespace packages - """ - - def get_language_name(self) -> str: - return "python" - - def get_stdlib_modules(self) -> Set[str]: - """Return comprehensive Python standard library modules.""" - return { - # Core modules - 'os', 'sys', 'json', 'time', 'datetime', 'logging', 'pathlib', - 'typing', 'dataclasses', 'functools', 'itertools', 'collections', - 're', 'math', 'random', 'threading', 'subprocess', 'shutil', - 'contextlib', 'traceback', 'warnings', 'weakref', 'copy', - 'pickle', 'base64', 'hashlib', 'hmac', 'uuid', 'urllib', - 'http', 'socketserver', 'email', 'mimetypes', 'csv', 'configparser', - 'argparse', 'getopt', 'tempfile', 'glob', 'fnmatch', 'linecache', - 'pprint', 'textwrap', 'string', 'struct', 'codecs', 'unicodedata', - 'io', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', - - # Network and web - 'socket', 'ssl', 'ftplib', 'poplib', 'imaplib', 'smtplib', - 'xmlrpc', 'webbrowser', - - # Data formats - 'xml', 'html', 'sqlite3', 'dbm', 'marshal', - - # Development tools - 'unittest', 'doctest', 'pdb', 'profile', 'cProfile', 'timeit', - 'trace', 'cgitb', 'py_compile', 'compileall', 'dis', 'pickletools', - - # System services - 'errno', 'ctypes', 'syslog', 'curses', 'platform', - - # Internationalization - 'locale', 'gettext', - - # Multimedia - 'audioop', 'wave', 'chunk', 'sunau', 'aifc', 'colorsys', - - # Cryptographic services - 'secrets', 'hashlib', 'hmac', - - # File and directory access - 'stat', 'fileinput', 'filecmp', 'shutil', 'macpath', - - # Data persistence - 'shelve', 'copyreg', - - # Data compression and archiving - 'zlib', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile', - - # File formats - 'csv', 'netrc', 'xdrlib', 'plistlib', - - # Internet protocols and support - 'ipaddress', 'mailbox', 'mimetypes', - - # Structured markup processing tools - 'html', 'xml', - - # Internet data handling - 'json', 'base64', 'binascii', 'uu', 'quopri', - - # Numeric and mathematical modules - 'numbers', 'decimal', 'fractions', 'statistics', 'cmath', - - # Functional programming modules - 'operator', 'functools', 'itertools', - - # Python language services - 'ast', 'symtable', 'symbol', 'token', 'tokenize', 'keyword', - 'tabnanny', 'pyclbr', 'py_compile', 'compileall', 'dis', - 'pickletools', 'distutils', - - # Importing modules - 'importlib', 'pkgutil', 'modulefinder', 'runpy', - - # Python runtime services - 'atexit', 'gc', 'inspect', 'site', '__future__', '__main__', - - # Custom Python interpreters - 'code', 'codeop', - - # MS Windows specific services - 'msvcrt', 'winreg', 'winsound', - - # Unix specific services - 'posix', 'pwd', 'grp', 'crypt', 'termios', 'tty', 'pty', - 'fcntl', 'pipes', 'resource', 'nis', 'syslog', - - # Superseded modules - 'optparse', 'imp' - } - - def _compile_patterns(self) -> None: - """Compile Python-specific regex patterns.""" - try: - self._third_party_patterns = [ - # Standard package names - re.compile(r'^[a-zA-Z][a-zA-Z0-9_-]*$'), - # Namespace packages (PEP 420) - re.compile(r'^[a-zA-Z][a-zA-Z0-9_]*(\.[a-zA-Z][a-zA-Z0-9_]*)+$'), - # Common third-party patterns - re.compile(r'^(django|flask|requests|numpy|pandas|matplotlib|scipy|tensorflow|pytorch|sklearn)'), - ] - - self._local_patterns = [ - # Relative imports - re.compile(r'^\.+'), - # Project-specific patterns - re.compile(r'^(src|lib|app|project)\.'), - re.compile(r'^(tests?|test_)'), - # Common local patterns - re.compile(r'^(utils|helpers|common|core|models|views|controllers)$'), - ] - except Exception as e: - logger.warning(f"Error compiling Python patterns: {e}") - - def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: - """Python-specific import classification.""" - # Handle special cases - if import_path.startswith('__'): - # Dunder modules are usually built-in or special - return 'standard_library' - - # Check for common third-party packages - common_third_party = { - 'numpy', 'pandas', 'matplotlib', 'scipy', 'sklearn', 'tensorflow', - 'torch', 'pytorch', 'requests', 'urllib3', 'beautifulsoup4', - 'django', 'flask', 'fastapi', 'sqlalchemy', 'alembic', - 'pytest', 'mock', 'coverage', 'tox', 'black', 'flake8', - 'mypy', 'isort', 'autopep8', 'yapf', 'pylint', 'bandit', - 'click', 'typer', 'pydantic', 'marshmallow', 'cerberus', - 'redis', 'celery', 'kombu', 'amqp', 'boto3', 'botocore', - 'psycopg2', 'pymongo', 'elasticsearch', 'kafka-python', - 'pillow', 'opencv-python', 'imageio', 'plotly', 'seaborn', - 'jupyter', 'ipython', 'notebook', 'jupyterlab' - } - - base_package = self.get_package_name_from_import(import_path) - if base_package in common_third_party: - return 'third_party' - - # Check context for pip indicators - if context: - pip_indicators = context.get('pip_packages', set()) - if base_package in pip_indicators: - return 'third_party' - - # Check for requirements.txt or setup.py dependencies - project_deps = context.get('project_dependencies', set()) - if base_package in project_deps: - return 'third_party' - - # Default to third_party for unknown packages - return 'third_party' - - def normalize_import_path(self, raw_path: str) -> str: - """Normalize Python import path.""" - # Remove common prefixes and suffixes - normalized = raw_path.strip() - - # Handle namespace packages - if normalized.endswith('.__init__'): - normalized = normalized[:-9] - - # Normalize path separators to dots - normalized = normalized.replace('/', '.') - - return normalized - - def get_package_manager_files(self) -> Set[str]: - """Return Python package manager files.""" - return { - 'requirements.txt', - 'requirements-dev.txt', - 'requirements-test.txt', - 'setup.py', - 'setup.cfg', - 'pyproject.toml', - 'Pipfile', - 'Pipfile.lock', - 'poetry.lock', - 'conda.yaml', - 'environment.yml', - 'environment.yaml' - } - - def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: - """Extract dependencies from Python package manager files.""" - dependencies = [] - - try: - if file_path.endswith('requirements.txt'): - dependencies = self._parse_requirements_txt(file_content) - elif file_path.endswith('setup.py'): - dependencies = self._parse_setup_py(file_content) - elif file_path.endswith('pyproject.toml'): - dependencies = self._parse_pyproject_toml(file_content) - elif file_path.endswith('Pipfile'): - dependencies = self._parse_pipfile(file_content) - elif file_path.endswith('.lock'): - dependencies = self._parse_lock_file(file_path, file_content) - except Exception as e: - logger.debug(f"Error parsing {file_path}: {e}") - - return dependencies - - def _parse_requirements_txt(self, content: str) -> List[str]: - """Parse requirements.txt file.""" - dependencies = [] - for line in content.splitlines(): - line = line.strip() - if line and not line.startswith('#'): - # Extract package name (before version specifiers) - package = re.split(r'[><=!]', line)[0].strip() - if package: - dependencies.append(package) - return dependencies - - def _parse_setup_py(self, content: str) -> List[str]: - """Parse setup.py file for dependencies.""" - dependencies = [] - try: - # Look for install_requires or setup() calls - install_requires_match = re.search( - r'install_requires\s*=\s*\[(.*?)\]', - content, - re.DOTALL - ) - if install_requires_match: - deps_str = install_requires_match.group(1) - # Extract quoted strings - for match in re.finditer(r'["\']([^"\']+)["\']', deps_str): - package = re.split(r'[><=!]', match.group(1))[0].strip() - if package: - dependencies.append(package) - except Exception as e: - logger.debug(f"Error parsing setup.py: {e}") - - return dependencies - - def _parse_pyproject_toml(self, content: str) -> List[str]: - """Parse pyproject.toml file.""" - dependencies = [] - try: - # This would require toml parsing library - # For now, use simple regex approach - deps_match = re.search(r'dependencies\s*=\s*\[(.*?)\]', content, re.DOTALL) - if deps_match: - deps_str = deps_match.group(1) - for match in re.finditer(r'["\']([^"\']+)["\']', deps_str): - package = re.split(r'[><=!]', match.group(1))[0].strip() - if package: - dependencies.append(package) - except Exception as e: - logger.debug(f"Error parsing pyproject.toml: {e}") - - return dependencies - - def _parse_pipfile(self, content: str) -> List[str]: - """Parse Pipfile for dependencies.""" - dependencies = [] - try: - # Look for [packages] section - in_packages_section = False - for line in content.splitlines(): - line = line.strip() - if line == '[packages]': - in_packages_section = True - continue - elif line.startswith('[') and in_packages_section: - break - elif in_packages_section and '=' in line: - package = line.split('=')[0].strip().strip('"\'') - if package: - dependencies.append(package) - except Exception as e: - logger.debug(f"Error parsing Pipfile: {e}") - - return dependencies - - def _parse_lock_file(self, file_path: str, content: str) -> List[str]: - """Parse lock files (Pipfile.lock, poetry.lock).""" - dependencies = [] - try: - if 'Pipfile.lock' in file_path: - # JSON format - data = json.loads(content) - if 'default' in data: - dependencies.extend(data['default'].keys()) - if 'develop' in data: - dependencies.extend(data['develop'].keys()) - elif 'poetry.lock' in file_path: - # TOML format - simplified parsing - for line in content.splitlines(): - if line.startswith('name = '): - name = line.split('=')[1].strip().strip('"\'') - if name: - dependencies.append(name) - except Exception as e: - logger.debug(f"Error parsing lock file {file_path}: {e}") - - return dependencies - - def is_scoped_package(self, import_path: str) -> bool: - """Check if import is a namespace package.""" - return '.' in import_path and not import_path.startswith('.') - - def supports_version_detection(self) -> bool: - """Python supports version detection through various methods.""" - return True - - def detect_package_version(self, package_name: str, context: Dict[str, any] = None) -> Optional[str]: - """Detect Python package version from context.""" - if not context: - return None - - # Check lock files first (most reliable) - lock_data = context.get('lock_file_data', {}) - if package_name in lock_data: - return lock_data[package_name].get('version') - - # Check installed packages (if available) - installed_packages = context.get('installed_packages', {}) - if package_name in installed_packages: - return installed_packages[package_name] - - return None diff --git a/src/code_index_mcp/tools/scip/dependencies/configs/zig.py b/src/code_index_mcp/tools/scip/dependencies/configs/zig.py deleted file mode 100644 index e22f553..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/configs/zig.py +++ /dev/null @@ -1,266 +0,0 @@ -""" -Zig-specific dependency configuration. - -This module provides Zig-specific dependency classification rules, -including standard library detection and package management. -""" - -import re -import logging -from typing import Set, Dict, List, Optional -from .base import BaseDependencyConfig - -logger = logging.getLogger(__name__) - - -class ZigDependencyConfig(BaseDependencyConfig): - """ - Zig-specific dependency configuration. - - Handles Zig import classification with support for: - - Zig standard library detection - - Package manager (zigmod, gyro) support - - Local .zig file imports - - System library detection - """ - - def get_language_name(self) -> str: - return "zig" - - def get_stdlib_modules(self) -> Set[str]: - """Return comprehensive Zig standard library modules.""" - return { - # Core standard library - 'std', 'builtin', 'testing', - - # Data structures and algorithms - 'math', 'mem', 'sort', 'hash', 'crypto', - - # Text and formatting - 'fmt', 'ascii', 'unicode', 'json', - - # System interaction - 'os', 'fs', 'process', 'thread', 'atomic', - - # Networking and I/O - 'net', 'http', 'io', - - # Compression and encoding - 'compress', 'base64', - - # Development and debugging - 'debug', 'log', 'meta', 'comptime', - - # Utilities - 'rand', 'time', 'zig', - - # Platform-specific - 'c', 'wasm', - - # Build system - 'build', 'target' - } - - def _compile_patterns(self) -> None: - """Compile Zig-specific regex patterns.""" - try: - self._third_party_patterns = [ - # Package names (typically lowercase with hyphens) - re.compile(r'^[a-z][a-z0-9-]*$'), - # Zig package patterns - re.compile(r'^zig-'), - # GitHub-style packages - re.compile(r'^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$'), - ] - - self._local_patterns = [ - # Relative paths - re.compile(r'^\.\.?/'), - # .zig files - re.compile(r'\.zig$'), - # Local project paths - re.compile(r'^src/'), - re.compile(r'^lib/'), - ] - except Exception as e: - logger.warning(f"Error compiling Zig patterns: {e}") - - def _classify_import_impl(self, import_path: str, context: Dict[str, any] = None) -> str: - """Zig-specific import classification.""" - # Handle .zig file extensions - if import_path.endswith('.zig'): - return 'local' - - # Check for common third-party Zig packages - common_third_party = { - 'zigmod', 'gyro', 'known-folders', 'zig-args', 'zig-clap', - 'zig-network', 'zig-sqlite', 'zig-json', 'zig-yaml', - 'raylib-zig', 'mach', 'zls', 'zig-gamedev' - } - - base_package = self.get_package_name_from_import(import_path) - if base_package in common_third_party: - return 'third_party' - - # Check context for package manager info - if context: - # Check build.zig dependencies - build_deps = context.get('build_dependencies', set()) - if base_package in build_deps: - return 'third_party' - - # Check package manager files - pkg_deps = context.get('package_dependencies', set()) - if base_package in pkg_deps: - return 'third_party' - - # If it's not stdlib or clearly local, assume third_party - return 'third_party' - - def normalize_import_path(self, raw_path: str) -> str: - """Normalize Zig import path.""" - normalized = raw_path.strip() - - # Remove .zig extension for consistency - if normalized.endswith('.zig'): - normalized = normalized[:-4] - - # Normalize path separators - normalized = normalized.replace('\\', '/') - - return normalized - - def get_package_manager_files(self) -> Set[str]: - """Return Zig package manager files.""" - return { - 'build.zig', - 'build.zig.zon', - 'zigmod.yml', - 'zigmod.lock', - 'gyro.zzz', - 'deps.zig' - } - - def extract_dependencies_from_file(self, file_path: str, file_content: str) -> List[str]: - """Extract dependencies from Zig package manager files.""" - dependencies = [] - - try: - if file_path.endswith('build.zig'): - dependencies = self._parse_build_zig(file_content) - elif file_path.endswith('build.zig.zon'): - dependencies = self._parse_build_zon(file_content) - elif file_path.endswith('zigmod.yml'): - dependencies = self._parse_zigmod_yml(file_content) - elif file_path.endswith('gyro.zzz'): - dependencies = self._parse_gyro_zzz(file_content) - except Exception as e: - logger.debug(f"Error parsing Zig dependency file {file_path}: {e}") - - return dependencies - - def _parse_build_zig(self, content: str) -> List[str]: - """Parse build.zig for dependencies.""" - dependencies = [] - try: - # Look for addPackage or dependency declarations - for line in content.splitlines(): - line = line.strip() - # Simple pattern matching for package declarations - if 'addPackage' in line or 'dependency' in line: - # Extract quoted strings that might be package names - matches = re.findall(r'["\']([a-zA-Z0-9_-]+)["\']', line) - dependencies.extend(matches) - except Exception as e: - logger.debug(f"Error parsing build.zig: {e}") - - return dependencies - - def _parse_build_zon(self, content: str) -> List[str]: - """Parse build.zig.zon file.""" - dependencies = [] - try: - # Look for .dependencies section - in_deps_section = False - for line in content.splitlines(): - line = line.strip() - if '.dependencies' in line: - in_deps_section = True - continue - elif in_deps_section and line.startswith('}'): - break - elif in_deps_section and '=' in line: - # Extract dependency name - dep_name = line.split('=')[0].strip().strip('.') - if dep_name: - dependencies.append(dep_name) - except Exception as e: - logger.debug(f"Error parsing build.zig.zon: {e}") - - return dependencies - - def _parse_zigmod_yml(self, content: str) -> List[str]: - """Parse zigmod.yml file.""" - dependencies = [] - try: - # Simple YAML parsing for dependencies section - in_deps_section = False - for line in content.splitlines(): - line = line.strip() - if line.startswith('dependencies:'): - in_deps_section = True - continue - elif in_deps_section and line.startswith('-'): - # Extract dependency info - if 'src:' in line: - # Extract from src: field - match = re.search(r'src:\s*([^\s]+)', line) - if match: - src = match.group(1) - # Extract package name from URL or path - if '/' in src: - dep_name = src.split('/')[-1] - if dep_name: - dependencies.append(dep_name) - elif in_deps_section and not line.startswith(' ') and not line.startswith('-'): - break - except Exception as e: - logger.debug(f"Error parsing zigmod.yml: {e}") - - return dependencies - - def _parse_gyro_zzz(self, content: str) -> List[str]: - """Parse gyro.zzz file.""" - dependencies = [] - try: - # Look for deps section in gyro format - for line in content.splitlines(): - line = line.strip() - if line.startswith('deps:'): - # Extract dependencies from gyro format - deps_part = line[5:].strip() - if deps_part: - # Simple parsing of dependency list - for dep in deps_part.split(): - if dep: - dependencies.append(dep) - except Exception as e: - logger.debug(f"Error parsing gyro.zzz: {e}") - - return dependencies - - def get_package_name_from_import(self, import_path: str) -> str: - """Extract package name from Zig import path.""" - # Handle different Zig import patterns - if '/' in import_path: - # GitHub-style: owner/repo - parts = import_path.split('/') - if len(parts) >= 2: - return f"{parts[0]}/{parts[1]}" - return parts[0] - - # Remove .zig extension if present - if import_path.endswith('.zig'): - import_path = import_path[:-4] - - return import_path diff --git a/src/code_index_mcp/tools/scip/dependencies/normalizer.py b/src/code_index_mcp/tools/scip/dependencies/normalizer.py deleted file mode 100644 index 08c2f01..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/normalizer.py +++ /dev/null @@ -1,354 +0,0 @@ -""" -Import path normalization utilities. - -This module provides utilities for normalizing import paths across different -languages and import styles for consistent dependency classification. -""" - -import re -import logging -from typing import Dict, List, Optional, Set, Callable -from urllib.parse import urlparse - -logger = logging.getLogger(__name__) - - -class ImportNormalizer: - """ - Import path normalization system. - - Provides language-specific import path normalization to ensure - consistent classification regardless of import style variations. - """ - - def __init__(self): - """Initialize the import normalizer.""" - self._normalizers: Dict[str, Callable[[str], str]] = {} - self._setup_default_normalizers() - - def _setup_default_normalizers(self) -> None: - """Setup default normalizers for supported languages.""" - self._normalizers.update({ - 'python': self._normalize_python_import, - 'javascript': self._normalize_javascript_import, - 'typescript': self._normalize_javascript_import, # Same as JS - 'zig': self._normalize_zig_import, - 'objective-c': self._normalize_objc_import, - 'java': self._normalize_java_import, - 'swift': self._normalize_swift_import, - 'go': self._normalize_go_import, - 'rust': self._normalize_rust_import, - }) - - def normalize_import_path(self, import_path: str, language: str) -> str: - """ - Normalize an import path based on language-specific rules. - - Args: - import_path: Raw import path to normalize - language: Programming language - - Returns: - Normalized import path - """ - if not import_path: - return import_path - - # Apply basic normalization first - normalized = self._basic_normalize(import_path) - - # Apply language-specific normalization - language_lower = language.lower() - if language_lower in self._normalizers: - normalized = self._normalizers[language_lower](normalized) - - logger.debug(f"Normalized {import_path} -> {normalized} ({language})") - return normalized - - def _basic_normalize(self, import_path: str) -> str: - """Apply basic normalization common to all languages.""" - # Strip whitespace - normalized = import_path.strip() - - # Remove quotes if present - if (normalized.startswith('"') and normalized.endswith('"')) or \ - (normalized.startswith("'") and normalized.endswith("'")): - normalized = normalized[1:-1] - - # Remove semicolons at the end - normalized = normalized.rstrip(';') - - return normalized - - def _normalize_python_import(self, import_path: str) -> str: - """Normalize Python import paths.""" - normalized = import_path - - # Handle namespace packages - if normalized.endswith('.__init__'): - normalized = normalized[:-9] - - # Convert file paths to module paths - normalized = normalized.replace('/', '.') - normalized = normalized.replace('\\', '.') - - # Remove .py extension if present - if normalized.endswith('.py'): - normalized = normalized[:-3] - - # Normalize multiple dots in relative imports - if normalized.startswith('.'): - # Count leading dots - dot_count = 0 - for char in normalized: - if char == '.': - dot_count += 1 - else: - break - - # Reconstruct with normalized dots - remaining = normalized[dot_count:] - if remaining: - normalized = '.' * dot_count + remaining - else: - normalized = '.' * dot_count - - return normalized - - def _normalize_javascript_import(self, import_path: str) -> str: - """Normalize JavaScript/TypeScript import paths.""" - normalized = import_path - - # Handle URL imports (for Deno or web) - if normalized.startswith(('http://', 'https://')): - parsed = urlparse(normalized) - # Extract package name from URL - path_parts = parsed.path.strip('/').split('/') - if path_parts: - normalized = path_parts[0] # Use first path component as package name - - # Remove common file extensions - extensions = ['.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs', '.json'] - for ext in extensions: - if normalized.endswith(ext): - normalized = normalized[:-len(ext)] - break - - # Remove /index suffix (common in Node.js) - if normalized.endswith('/index'): - normalized = normalized[:-6] - - # Handle scoped packages - ensure proper format - if normalized.startswith('@') and '/' in normalized: - parts = normalized.split('/') - if len(parts) >= 2: - # Keep only @scope/package part - normalized = f"{parts[0]}/{parts[1]}" - - # Convert Windows paths to forward slashes - normalized = normalized.replace('\\', '/') - - return normalized - - def _normalize_zig_import(self, import_path: str) -> str: - """Normalize Zig import paths.""" - normalized = import_path - - # Remove .zig extension - if normalized.endswith('.zig'): - normalized = normalized[:-4] - - # Convert Windows paths to forward slashes - normalized = normalized.replace('\\', '/') - - # Handle relative paths - if normalized.startswith('./'): - normalized = normalized[2:] - elif normalized.startswith('../'): - # Keep relative indicator but normalize - pass - - return normalized - - def _normalize_objc_import(self, import_path: str) -> str: - """Normalize Objective-C import paths.""" - normalized = import_path - - # Remove framework suffix - if normalized.endswith('.framework'): - normalized = normalized[:-10] - - # Remove common file extensions - extensions = ['.h', '.m', '.mm'] - for ext in extensions: - if normalized.endswith(ext): - normalized = normalized[:-len(ext)] - break - - # Extract framework name from paths - if '/' in normalized: - parts = normalized.split('/') - # For framework imports, usually want the framework name - # e.g., "UIKit/UIKit.h" -> "UIKit" - if len(parts) >= 2 and parts[0] == parts[-1]: - normalized = parts[0] - else: - # Use the last component - normalized = parts[-1] - - return normalized - - def _normalize_java_import(self, import_path: str) -> str: - """Normalize Java import paths.""" - normalized = import_path - - # Java imports are typically already normalized - # But handle any file extensions that might be present - if normalized.endswith('.java'): - normalized = normalized[:-5] - - # Convert file paths to package notation - normalized = normalized.replace('/', '.') - normalized = normalized.replace('\\', '.') - - return normalized - - def _normalize_swift_import(self, import_path: str) -> str: - """Normalize Swift import paths.""" - normalized = import_path - - # Remove .swift extension if present - if normalized.endswith('.swift'): - normalized = normalized[:-6] - - # Swift imports are typically module names, so minimal normalization needed - return normalized - - def _normalize_go_import(self, import_path: str) -> str: - """Normalize Go import paths.""" - normalized = import_path - - # Go imports are typically already well-formatted - # Remove any .go extension that might be present - if normalized.endswith('.go'): - normalized = normalized[:-3] - - # Convert Windows paths to forward slashes - normalized = normalized.replace('\\', '/') - - return normalized - - def _normalize_rust_import(self, import_path: str) -> str: - """Normalize Rust import paths.""" - normalized = import_path - - # Remove .rs extension if present - if normalized.endswith('.rs'): - normalized = normalized[:-3] - - # Convert :: to / for consistency (though :: is correct Rust syntax) - # This is for classification purposes only - normalized = normalized.replace('::', '/') - - return normalized - - def register_normalizer(self, language: str, normalizer: Callable[[str], str]) -> None: - """ - Register a custom normalizer for a language. - - Args: - language: Language name - normalizer: Function that takes import_path and returns normalized path - """ - self._normalizers[language.lower()] = normalizer - logger.debug(f"Registered custom normalizer for {language}") - - def get_supported_languages(self) -> Set[str]: - """ - Get set of languages with custom normalizers. - - Returns: - Set of supported language names - """ - return set(self._normalizers.keys()) - - def normalize_package_name(self, package_name: str, language: str) -> str: - """ - Normalize a package name for consistent lookup. - - Args: - package_name: Package name to normalize - language: Programming language - - Returns: - Normalized package name - """ - normalized = package_name.strip().lower() - - # Language-specific package name normalization - if language.lower() == 'python': - # Python package names use hyphens and underscores interchangeably - normalized = normalized.replace('_', '-') - elif language.lower() in ['javascript', 'typescript']: - # JavaScript packages typically use hyphens - # But handle scoped packages specially - if normalized.startswith('@'): - pass # Keep scoped packages as-is - else: - normalized = normalized.replace('_', '-') - elif language.lower() == 'zig': - # Zig packages typically use hyphens - normalized = normalized.replace('_', '-') - elif language.lower() == 'objective-c': - # Objective-C frameworks use CamelCase, preserve case - normalized = package_name.strip() - - return normalized - - def extract_base_package_name(self, import_path: str, language: str) -> str: - """ - Extract the base package name from an import path. - - Args: - import_path: Full import path - language: Programming language - - Returns: - Base package name - """ - normalized = self.normalize_import_path(import_path, language) - - if language.lower() in ['javascript', 'typescript']: - # Handle scoped packages - if normalized.startswith('@'): - parts = normalized.split('/') - if len(parts) >= 2: - return f"{parts[0]}/{parts[1]}" - return parts[0] - else: - return normalized.split('/')[0] - - elif language.lower() == 'python': - # Python: first component of dotted path - if normalized.startswith('.'): - # Relative import, return as-is - return normalized - return normalized.split('.')[0] - - elif language.lower() == 'zig': - # Zig: handle different import patterns - if '/' in normalized: - parts = normalized.split('/') - if len(parts) == 2: - # owner/repo pattern - return normalized - return parts[0] - return normalized - - elif language.lower() == 'objective-c': - # Objective-C: framework name - return normalized - - else: - # Default: first component - return normalized.split('/')[0].split('.')[0] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/dependencies/registry.py b/src/code_index_mcp/tools/scip/dependencies/registry.py deleted file mode 100644 index f74b9df..0000000 --- a/src/code_index_mcp/tools/scip/dependencies/registry.py +++ /dev/null @@ -1,371 +0,0 @@ -""" -Dependency registry and caching system. - -This module provides centralized caching and registry functionality for -dependency classification results and metadata. -""" - -import time -import logging -from typing import Dict, Optional, Any, Set, List, Tuple -from collections import defaultdict, Counter - -logger = logging.getLogger(__name__) - - -class DependencyRegistry: - """ - Centralized registry and caching system for dependency classification. - - Provides: - - Classification result caching - - Dependency metadata storage - - Performance statistics - - Cache management and cleanup - """ - - def __init__(self, max_cache_size: int = 10000, cache_ttl: int = 3600): - """ - Initialize the dependency registry. - - Args: - max_cache_size: Maximum number of entries to cache - cache_ttl: Cache time-to-live in seconds - """ - self.max_cache_size = max_cache_size - self.cache_ttl = cache_ttl - - # Classification cache: {cache_key: (classification, timestamp)} - self._classification_cache: Dict[str, Tuple[str, float]] = {} - - # Dependency metadata cache: {language: {package: metadata}} - self._metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict) - - # Standard library cache: {language: (modules_set, timestamp)} - self._stdlib_cache: Dict[str, Tuple[Set[str], float]] = {} - - # Package manager file cache: {language: (files_set, timestamp)} - self._package_files_cache: Dict[str, Tuple[Set[str], float]] = {} - - # Statistics - self._stats = { - 'cache_hits': 0, - 'cache_misses': 0, - 'classifications_performed': 0, - 'cache_evictions': 0, - 'last_cleanup': time.time() - } - - # Classification counters - self._classification_counts = Counter() - - def cache_classification(self, cache_key: str, classification: str) -> None: - """ - Cache a dependency classification result. - - Args: - cache_key: Unique cache key for the classification - classification: Classification result to cache - """ - current_time = time.time() - - # Check if cache is full and needs cleanup - if len(self._classification_cache) >= self.max_cache_size: - self._cleanup_cache() - - # Store the classification with timestamp - self._classification_cache[cache_key] = (classification, current_time) - self._classification_counts[classification] += 1 - self._stats['classifications_performed'] += 1 - - logger.debug(f"Cached classification: {cache_key} -> {classification}") - - def get_cached_classification(self, cache_key: str) -> Optional[str]: - """ - Retrieve a cached classification result. - - Args: - cache_key: Cache key to look up - - Returns: - Cached classification or None if not found/expired - """ - if cache_key not in self._classification_cache: - self._stats['cache_misses'] += 1 - return None - - classification, timestamp = self._classification_cache[cache_key] - current_time = time.time() - - # Check if the cache entry has expired - if current_time - timestamp > self.cache_ttl: - del self._classification_cache[cache_key] - self._stats['cache_misses'] += 1 - logger.debug(f"Cache entry expired: {cache_key}") - return None - - self._stats['cache_hits'] += 1 - return classification - - def cache_dependency_metadata( - self, - language: str, - package_name: str, - metadata: Dict[str, Any] - ) -> None: - """ - Cache dependency metadata. - - Args: - language: Programming language - package_name: Package/dependency name - metadata: Metadata to cache - """ - self._metadata_cache[language][package_name] = { - **metadata, - 'cached_at': time.time() - } - logger.debug(f"Cached metadata for {language}:{package_name}") - - def get_cached_metadata( - self, - language: str, - package_name: str - ) -> Optional[Dict[str, Any]]: - """ - Retrieve cached dependency metadata. - - Args: - language: Programming language - package_name: Package/dependency name - - Returns: - Cached metadata or None if not found/expired - """ - if language not in self._metadata_cache: - return None - - if package_name not in self._metadata_cache[language]: - return None - - metadata = self._metadata_cache[language][package_name] - current_time = time.time() - - # Check if metadata has expired - cached_at = metadata.get('cached_at', 0) - if current_time - cached_at > self.cache_ttl: - del self._metadata_cache[language][package_name] - return None - - return metadata - - def cache_standard_library_modules(self, language: str, modules: Set[str]) -> None: - """ - Cache standard library modules for a language. - - Args: - language: Programming language - modules: Set of standard library module names - """ - self._stdlib_cache[language] = (modules, time.time()) - logger.debug(f"Cached {len(modules)} stdlib modules for {language}") - - def get_cached_standard_library_modules(self, language: str) -> Optional[Set[str]]: - """ - Retrieve cached standard library modules. - - Args: - language: Programming language - - Returns: - Set of standard library modules or None if not cached/expired - """ - if language not in self._stdlib_cache: - return None - - modules, timestamp = self._stdlib_cache[language] - current_time = time.time() - - # Stdlib modules rarely change, use longer TTL - if current_time - timestamp > self.cache_ttl * 24: # 24x longer TTL - del self._stdlib_cache[language] - return None - - return modules - - def cache_package_manager_files(self, language: str, files: Set[str]) -> None: - """ - Cache package manager files for a language. - - Args: - language: Programming language - files: Set of package manager file names - """ - self._package_files_cache[language] = (files, time.time()) - logger.debug(f"Cached {len(files)} package manager files for {language}") - - def get_cached_package_manager_files(self, language: str) -> Optional[Set[str]]: - """ - Retrieve cached package manager files. - - Args: - language: Programming language - - Returns: - Set of package manager files or None if not cached/expired - """ - if language not in self._package_files_cache: - return None - - files, timestamp = self._package_files_cache[language] - current_time = time.time() - - # Package manager files rarely change, use longer TTL - if current_time - timestamp > self.cache_ttl * 12: # 12x longer TTL - del self._package_files_cache[language] - return None - - return files - - def get_dependency_list(self, language: str, classification: str) -> List[str]: - """ - Get list of dependencies of a specific classification for a language. - - Args: - language: Programming language - classification: Classification type to filter by - - Returns: - List of dependency names - """ - if language not in self._metadata_cache: - return [] - - dependencies = [] - for package_name, metadata in self._metadata_cache[language].items(): - if metadata.get('classification') == classification: - dependencies.append(package_name) - - return dependencies - - def get_classification_summary(self) -> Dict[str, int]: - """ - Get summary of classification counts. - - Returns: - Dictionary with classification counts - """ - return dict(self._classification_counts) - - def _cleanup_cache(self) -> None: - """Clean up expired cache entries.""" - current_time = time.time() - - # Clean classification cache - expired_keys = [] - for cache_key, (classification, timestamp) in self._classification_cache.items(): - if current_time - timestamp > self.cache_ttl: - expired_keys.append(cache_key) - - for key in expired_keys: - del self._classification_cache[key] - self._stats['cache_evictions'] += 1 - - # Clean metadata cache - for language in list(self._metadata_cache.keys()): - expired_packages = [] - for package, metadata in self._metadata_cache[language].items(): - cached_at = metadata.get('cached_at', 0) - if current_time - cached_at > self.cache_ttl: - expired_packages.append(package) - - for package in expired_packages: - del self._metadata_cache[language][package] - - # Remove empty language entries - if not self._metadata_cache[language]: - del self._metadata_cache[language] - - # Clean stdlib cache - expired_langs = [] - for language, (modules, timestamp) in self._stdlib_cache.items(): - if current_time - timestamp > self.cache_ttl * 24: - expired_langs.append(language) - - for lang in expired_langs: - del self._stdlib_cache[lang] - - # Clean package files cache - expired_langs = [] - for language, (files, timestamp) in self._package_files_cache.items(): - if current_time - timestamp > self.cache_ttl * 12: - expired_langs.append(language) - - for lang in expired_langs: - del self._package_files_cache[lang] - - self._stats['last_cleanup'] = current_time - logger.debug(f"Cache cleanup completed, evicted {len(expired_keys)} classification entries") - - def clear_cache(self) -> None: - """Clear all cached data.""" - self._classification_cache.clear() - self._metadata_cache.clear() - self._stdlib_cache.clear() - self._package_files_cache.clear() - - # Reset stats but keep historical counters - self._stats.update({ - 'cache_hits': 0, - 'cache_misses': 0, - 'cache_evictions': 0, - 'last_cleanup': time.time() - }) - - logger.debug("Cleared all dependency registry cache") - - def get_stats(self) -> Dict[str, Any]: - """ - Get registry statistics. - - Returns: - Dictionary with statistics - """ - current_time = time.time() - - stats = { - **self._stats, - 'cache_size': len(self._classification_cache), - 'metadata_entries': sum(len(packages) for packages in self._metadata_cache.values()), - 'stdlib_languages': len(self._stdlib_cache), - 'package_files_languages': len(self._package_files_cache), - 'classification_counts': dict(self._classification_counts), - 'cache_hit_rate': ( - self._stats['cache_hits'] / - max(1, self._stats['cache_hits'] + self._stats['cache_misses']) - ), - 'uptime': current_time - self._stats['last_cleanup'] - } - - return stats - - def optimize_cache(self) -> None: - """Optimize cache for better performance.""" - # Remove least recently used entries if cache is getting full - if len(self._classification_cache) > self.max_cache_size * 0.8: - current_time = time.time() - - # Sort by timestamp and remove oldest entries - sorted_entries = sorted( - self._classification_cache.items(), - key=lambda x: x[1][1] # Sort by timestamp - ) - - # Remove oldest 20% of entries - remove_count = int(len(sorted_entries) * 0.2) - for i in range(remove_count): - cache_key, (classification, timestamp) = sorted_entries[i] - del self._classification_cache[cache_key] - self._stats['cache_evictions'] += 1 - - logger.debug(f"Optimized cache, removed {remove_count} oldest entries") diff --git a/src/code_index_mcp/tools/scip/position/__init__.py b/src/code_index_mcp/tools/scip/position/__init__.py deleted file mode 100644 index c684147..0000000 --- a/src/code_index_mcp/tools/scip/position/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Position resolution system for SCIP symbols. - -This package provides the modular position resolution system that replaces -complex position detection logic in SCIPSymbolAnalyzer, following the -refactoring plan for better maintainability and accuracy. - -Key Components: -- PositionResolver: Main position resolution engine using strategy pattern -- PositionStrategy: Abstract base for position detection strategies -- SCIPOccurrenceStrategy: SCIP occurrence-based position detection (high confidence) -- TreeSitterStrategy: Tree-sitter AST-based position detection (medium confidence) -- HeuristicStrategy: Fallback heuristic position detection (low confidence) -- PositionCalculator: Utility for position calculations and conversions -- LocationInfo: Enhanced location information with confidence levels - -The system provides: -- Multi-layered position detection with confidence scoring -- Fallback mechanisms for robust symbol location -- Caching for performance optimization -- Integration with SCIPSymbolManager -- Support for different SCIP symbol formats -""" - -from .resolver import PositionResolver, get_position_resolver, resolve_position -from .calculator import PositionCalculator -from .confidence import ConfidenceLevel, LocationInfo -from .strategies import ( - PositionStrategy, - SCIPOccurrenceStrategy, - TreeSitterStrategy, - HeuristicStrategy -) - -__all__ = [ - 'PositionResolver', - 'get_position_resolver', - 'resolve_position', - 'PositionCalculator', - 'ConfidenceLevel', - 'LocationInfo', - 'PositionStrategy', - 'SCIPOccurrenceStrategy', - 'TreeSitterStrategy', - 'HeuristicStrategy' -] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/calculator.py b/src/code_index_mcp/tools/scip/position/calculator.py deleted file mode 100644 index 84694fd..0000000 --- a/src/code_index_mcp/tools/scip/position/calculator.py +++ /dev/null @@ -1,394 +0,0 @@ -""" -Position calculation utilities. - -This module provides utilities for position calculations, conversions, -and position-related operations for SCIP symbol analysis. -""" - -import logging -from typing import Optional, Dict, Any, List, Tuple -from .confidence import LocationInfo, ConfidenceLevel - -logger = logging.getLogger(__name__) - - -class PositionCalculator: - """ - Utility class for position calculations and conversions. - - Provides methods for: - - Converting between different position formats - - Calculating position offsets and distances - - Validating and normalizing positions - - Estimating positions based on context - """ - - def __init__(self): - """Initialize the position calculator.""" - self._line_cache: Dict[str, List[int]] = {} # Cache for line start byte positions - - def convert_byte_to_line_column( - self, - byte_offset: int, - file_content: str, - file_key: Optional[str] = None - ) -> Tuple[int, int]: - """ - Convert byte offset to line and column numbers. - - Args: - byte_offset: Byte offset in file - file_content: File content string - file_key: Optional cache key for the file - - Returns: - Tuple of (line, column) - both 1-based - """ - if byte_offset < 0: - return 1, 1 - - if byte_offset >= len(file_content): - # Return end of file position - lines = file_content.splitlines() - if lines: - return len(lines), len(lines[-1]) + 1 - return 1, 1 - - # Get line start positions (cached) - line_starts = self._get_line_starts(file_content, file_key) - - # Binary search to find line - line_number = self._binary_search_line(line_starts, byte_offset) - - # Calculate column within the line - line_start = line_starts[line_number - 1] # line_number is 1-based - column = byte_offset - line_start + 1 # Convert to 1-based - - return line_number, column - - def convert_line_column_to_byte( - self, - line: int, - column: int, - file_content: str, - file_key: Optional[str] = None - ) -> int: - """ - Convert line and column to byte offset. - - Args: - line: Line number (1-based) - column: Column number (1-based) - file_content: File content string - file_key: Optional cache key for the file - - Returns: - Byte offset in file - """ - if line < 1 or column < 1: - return 0 - - # Get line start positions (cached) - line_starts = self._get_line_starts(file_content, file_key) - - if line > len(line_starts): - # Beyond end of file - return len(file_content) - - line_start = line_starts[line - 1] # Convert to 0-based - byte_offset = line_start + column - 1 # Convert column to 0-based - - # Ensure we don't go beyond file end - return min(byte_offset, len(file_content)) - - def estimate_position_by_symbol_type( - self, - symbol_type: str, - document_info: Optional[Dict[str, Any]] = None - ) -> LocationInfo: - """ - Estimate position based on symbol type characteristics. - - Args: - symbol_type: Type of symbol (class, function, variable, etc.) - document_info: Optional document information for better estimation - - Returns: - LocationInfo with estimated position - """ - # Default positions based on common patterns - type_positions = { - 'class': (1, 1), # Classes usually at file start - 'interface': (1, 1), # Interfaces usually at file start - 'module': (1, 1), # Modules at file start - 'namespace': (1, 1), # Namespaces at file start - 'function': (5, 1), # Functions after imports - 'method': (10, 5), # Methods inside classes - 'variable': (3, 1), # Variables after imports - 'constant': (2, 1), # Constants near file start - 'field': (8, 5), # Fields inside classes/structs - 'property': (12, 5), # Properties inside classes - 'enum': (1, 1), # Enums at file start - 'enum_member': (15, 5), # Enum members inside enums - } - - default_line, default_column = type_positions.get(symbol_type, (1, 1)) - - # Adjust based on document info - if document_info: - # If we have information about document size, adjust positions - estimated_lines = document_info.get('estimated_lines', 100) - symbol_count = document_info.get('symbol_count', 10) - - if symbol_count > 0: - # Distribute symbols throughout the file - if symbol_type in ['method', 'field', 'property']: - # These are typically inside classes, estimate deeper in file - default_line = min(estimated_lines // 2, default_line + symbol_count) - elif symbol_type in ['function', 'variable']: - # These might be distributed throughout - default_line = min(estimated_lines // 3, default_line + (symbol_count // 2)) - - return LocationInfo.from_heuristic( - line=default_line, - column=default_column, - heuristic_type=f"symbol_type_{symbol_type}", - method="position_calculator_estimate" - ) - - def estimate_position_in_class( - self, - class_location: LocationInfo, - member_index: int = 0, - member_type: str = "method" - ) -> LocationInfo: - """ - Estimate position of a class member relative to class location. - - Args: - class_location: Location of the containing class - member_index: Index of the member within the class - member_type: Type of class member - - Returns: - LocationInfo with estimated member position - """ - if not class_location.is_reliable(): - # If class location is unreliable, use basic estimation - return self.estimate_position_by_symbol_type(member_type) - - # Estimate member position based on class location - base_line = class_location.line - base_column = class_location.column - - # Different member types have different typical offsets - member_offsets = { - 'field': (2, 4), - 'property': (3, 4), - 'method': (4, 4), - 'constructor': (1, 4), - 'destructor': (5, 4), - } - - line_offset, column_offset = member_offsets.get(member_type, (3, 4)) - - # Add index-based spacing - estimated_line = base_line + line_offset + (member_index * 2) - estimated_column = base_column + column_offset - - metadata = { - 'class_line': class_location.line, - 'class_column': class_location.column, - 'member_index': member_index, - 'member_type': member_type, - 'based_on_class_location': True - } - - return LocationInfo( - line=estimated_line, - column=estimated_column, - confidence=ConfidenceLevel.LOW, - method="class_member_estimation", - metadata=metadata - ) - - def calculate_distance(self, loc1: LocationInfo, loc2: LocationInfo) -> int: - """ - Calculate distance between two locations (in lines). - - Args: - loc1: First location - loc2: Second location - - Returns: - Distance in lines (absolute value) - """ - return abs(loc1.line - loc2.line) - - def is_within_range( - self, - location: LocationInfo, - start_line: int, - end_line: int - ) -> bool: - """ - Check if location is within a line range. - - Args: - location: Location to check - start_line: Start of range (inclusive) - end_line: End of range (inclusive) - - Returns: - True if location is within range - """ - return start_line <= location.line <= end_line - - def adjust_position_for_language( - self, - location: LocationInfo, - language: str, - symbol_type: str - ) -> LocationInfo: - """ - Adjust position based on language-specific conventions. - - Args: - location: Original location - language: Programming language - symbol_type: Type of symbol - - Returns: - Adjusted LocationInfo - """ - # Language-specific adjustments - adjustments = { - 'python': self._adjust_for_python, - 'javascript': self._adjust_for_javascript, - 'typescript': self._adjust_for_javascript, # Same as JS - 'zig': self._adjust_for_zig, - 'objective-c': self._adjust_for_objc, - } - - adjust_func = adjustments.get(language.lower()) - if adjust_func: - return adjust_func(location, symbol_type) - - return location - - def validate_position( - self, - location: LocationInfo, - max_line: Optional[int] = None, - max_column: Optional[int] = None - ) -> LocationInfo: - """ - Validate and correct position if necessary. - - Args: - location: Location to validate - max_line: Maximum valid line number - max_column: Maximum valid column number - - Returns: - Validated LocationInfo - """ - corrected_line = max(1, location.line) - corrected_column = max(1, location.column) - - if max_line and corrected_line > max_line: - corrected_line = max_line - - if max_column and corrected_column > max_column: - corrected_column = max_column - - if corrected_line != location.line or corrected_column != location.column: - # Position was corrected, update metadata - validated_location = LocationInfo( - line=corrected_line, - column=corrected_column, - confidence=location.confidence, - method=location.method, - metadata=location.metadata.copy() if location.metadata else {} - ) - - validated_location.add_metadata('position_corrected', True) - validated_location.add_metadata('original_line', location.line) - validated_location.add_metadata('original_column', location.column) - - return validated_location - - return location - - def _get_line_starts(self, file_content: str, file_key: Optional[str]) -> List[int]: - """Get cached line start positions.""" - if file_key and file_key in self._line_cache: - return self._line_cache[file_key] - - line_starts = [0] # First line starts at byte 0 - for i, char in enumerate(file_content): - if char == '\n': - line_starts.append(i + 1) - - if file_key: - self._line_cache[file_key] = line_starts - - return line_starts - - def _binary_search_line(self, line_starts: List[int], byte_offset: int) -> int: - """Binary search to find line number for byte offset.""" - left, right = 0, len(line_starts) - 1 - - while left <= right: - mid = (left + right) // 2 - - if mid == len(line_starts) - 1: - # Last line - return mid + 1 - elif line_starts[mid] <= byte_offset < line_starts[mid + 1]: - return mid + 1 # Convert to 1-based - elif byte_offset < line_starts[mid]: - right = mid - 1 - else: - left = mid + 1 - - return len(line_starts) # Fallback to last line - - def _adjust_for_python(self, location: LocationInfo, symbol_type: str) -> LocationInfo: - """Python-specific position adjustments.""" - # Python functions/classes typically have decorators above them - if symbol_type in ['function', 'method', 'class'] and location.line > 1: - # Assume decorators might be present, adjust upward slightly - adjusted_line = max(1, location.line - 1) - if adjusted_line != location.line: - location.add_metadata('python_decorator_adjustment', True) - location.line = adjusted_line - - return location - - def _adjust_for_javascript(self, location: LocationInfo, symbol_type: str) -> LocationInfo: - """JavaScript/TypeScript-specific position adjustments.""" - # No specific adjustments needed for now - return location - - def _adjust_for_zig(self, location: LocationInfo, symbol_type: str) -> LocationInfo: - """Zig-specific position adjustments.""" - # No specific adjustments needed for now - return location - - def _adjust_for_objc(self, location: LocationInfo, symbol_type: str) -> LocationInfo: - """Objective-C specific position adjustments.""" - # Objective-C methods often have + or - prefix - if symbol_type == 'method' and location.column > 1: - # Adjust column to account for method prefix - adjusted_column = max(1, location.column - 1) - if adjusted_column != location.column: - location.add_metadata('objc_method_prefix_adjustment', True) - location.column = adjusted_column - - return location - - def clear_cache(self) -> None: - """Clear the line position cache.""" - self._line_cache.clear() - logger.debug("Cleared position calculator cache") diff --git a/src/code_index_mcp/tools/scip/position/confidence.py b/src/code_index_mcp/tools/scip/position/confidence.py deleted file mode 100644 index f063f82..0000000 --- a/src/code_index_mcp/tools/scip/position/confidence.py +++ /dev/null @@ -1,317 +0,0 @@ -""" -Confidence level management and enhanced location information. - -This module provides enhanced location information with confidence levels -for position resolution results. -""" - -import logging -from enum import Enum -from typing import Optional, Dict, Any -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - - -class ConfidenceLevel(Enum): - """ - Position detection confidence levels. - - Indicates the reliability of position detection results based on - the method used and available data quality. - """ - HIGH = "high" # SCIP occurrence data with exact positions - MEDIUM = "medium" # Tree-sitter AST analysis or symbol structure inference - LOW = "low" # Heuristic fallback or partial data - UNKNOWN = "unknown" # Default/fallback position with minimal confidence - - def __lt__(self, other): - """Allow confidence level comparison.""" - if not isinstance(other, ConfidenceLevel): - return NotImplemented - order = [ConfidenceLevel.UNKNOWN, ConfidenceLevel.LOW, ConfidenceLevel.MEDIUM, ConfidenceLevel.HIGH] - return order.index(self) < order.index(other) - - def __le__(self, other): - return self < other or self == other - - def __gt__(self, other): - return not self <= other - - def __ge__(self, other): - return not self < other - - -@dataclass -class LocationInfo: - """ - Enhanced location information with confidence and metadata. - - Provides comprehensive location information including confidence levels, - detection method metadata, and optional context information. - """ - line: int - column: int - confidence: ConfidenceLevel = ConfidenceLevel.UNKNOWN - method: Optional[str] = None - metadata: Optional[Dict[str, Any]] = None - - def __post_init__(self): - """Validate location information after initialization.""" - if self.line < 1: - logger.warning(f"Invalid line number: {self.line}, setting to 1") - self.line = 1 - - if self.column < 1: - logger.warning(f"Invalid column number: {self.column}, setting to 1") - self.column = 1 - - if self.metadata is None: - self.metadata = {} - - @classmethod - def from_scip_occurrence(cls, occurrence, method: str = "scip_occurrence") -> 'LocationInfo': - """ - Create LocationInfo from SCIP occurrence data. - - Args: - occurrence: SCIP occurrence object - method: Detection method name - - Returns: - LocationInfo with high confidence - """ - try: - if not hasattr(occurrence, 'range') or not occurrence.range: - return cls.default_location(method="scip_occurrence_no_range") - - range_obj = occurrence.range - if not hasattr(range_obj, 'start') or not range_obj.start: - return cls.default_location(method="scip_occurrence_no_start") - - start = range_obj.start - if len(start) >= 2: - # SCIP uses 0-based indexing, convert to 1-based - line = start[0] + 1 - column = start[1] + 1 - - metadata = { - 'scip_range_available': True, - 'range_length': len(start), - 'raw_line': start[0], - 'raw_column': start[1] - } - - # Add end position if available - if hasattr(range_obj, 'end') and range_obj.end and len(range_obj.end) >= 2: - metadata.update({ - 'end_line': range_obj.end[0] + 1, - 'end_column': range_obj.end[1] + 1, - 'span_lines': range_obj.end[0] - start[0] + 1 - }) - - return cls( - line=line, - column=column, - confidence=ConfidenceLevel.HIGH, - method=method, - metadata=metadata - ) - - except (AttributeError, IndexError, TypeError) as e: - logger.debug(f"Error creating LocationInfo from SCIP occurrence: {e}") - - return cls.default_location(method="scip_occurrence_error") - - @classmethod - def from_tree_sitter( - cls, - line: int, - column: int, - node_info: Optional[Dict[str, Any]] = None, - method: str = "tree_sitter" - ) -> 'LocationInfo': - """ - Create LocationInfo from Tree-sitter analysis. - - Args: - line: Line number (1-based) - column: Column number (1-based) - node_info: Optional AST node information - method: Detection method name - - Returns: - LocationInfo with medium confidence - """ - metadata = { - 'tree_sitter_analysis': True - } - - if node_info: - metadata.update({ - 'node_type': node_info.get('type'), - 'node_text': node_info.get('text', '')[:50], # Truncate long text - 'node_start_byte': node_info.get('start_byte'), - 'node_end_byte': node_info.get('end_byte'), - 'node_children_count': node_info.get('children_count', 0) - }) - - return cls( - line=max(1, line), - column=max(1, column), - confidence=ConfidenceLevel.MEDIUM, - method=method, - metadata=metadata - ) - - @classmethod - def from_heuristic( - cls, - line: int, - column: int, - heuristic_type: str, - method: str = "heuristic" - ) -> 'LocationInfo': - """ - Create LocationInfo from heuristic analysis. - - Args: - line: Line number (1-based) - column: Column number (1-based) - heuristic_type: Type of heuristic used - method: Detection method name - - Returns: - LocationInfo with low confidence - """ - metadata = { - 'heuristic_type': heuristic_type, - 'estimated': True - } - - return cls( - line=max(1, line), - column=max(1, column), - confidence=ConfidenceLevel.LOW, - method=method, - metadata=metadata - ) - - @classmethod - def default_location(cls, method: str = "default") -> 'LocationInfo': - """ - Create default LocationInfo for fallback cases. - - Args: - method: Detection method name - - Returns: - LocationInfo with unknown confidence at (1,1) - """ - return cls( - line=1, - column=1, - confidence=ConfidenceLevel.UNKNOWN, - method=method, - metadata={'fallback': True} - ) - - def is_reliable(self) -> bool: - """ - Check if the location information is reliable. - - Returns: - True if confidence is medium or high - """ - return self.confidence in (ConfidenceLevel.HIGH, ConfidenceLevel.MEDIUM) - - def is_high_confidence(self) -> bool: - """ - Check if the location has high confidence. - - Returns: - True if confidence is high - """ - return self.confidence == ConfidenceLevel.HIGH - - def update_confidence(self, new_confidence: ConfidenceLevel, reason: str = "") -> None: - """ - Update confidence level with optional reason. - - Args: - new_confidence: New confidence level - reason: Optional reason for the update - """ - old_confidence = self.confidence - self.confidence = new_confidence - - if not self.metadata: - self.metadata = {} - - self.metadata.update({ - 'confidence_updated': True, - 'previous_confidence': old_confidence.value, - 'update_reason': reason - }) - - logger.debug(f"Updated confidence from {old_confidence.value} to {new_confidence.value}: {reason}") - - def add_metadata(self, key: str, value: Any) -> None: - """ - Add metadata information. - - Args: - key: Metadata key - value: Metadata value - """ - if not self.metadata: - self.metadata = {} - self.metadata[key] = value - - def to_dict(self) -> Dict[str, Any]: - """ - Convert LocationInfo to dictionary. - - Returns: - Dictionary representation - """ - return { - 'line': self.line, - 'column': self.column, - 'confidence': self.confidence.value, - 'method': self.method, - 'metadata': self.metadata or {} - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> 'LocationInfo': - """ - Create LocationInfo from dictionary. - - Args: - data: Dictionary with location data - - Returns: - LocationInfo instance - """ - confidence_str = data.get('confidence', 'unknown') - try: - confidence = ConfidenceLevel(confidence_str) - except ValueError: - confidence = ConfidenceLevel.UNKNOWN - - return cls( - line=data.get('line', 1), - column=data.get('column', 1), - confidence=confidence, - method=data.get('method'), - metadata=data.get('metadata', {}) - ) - - def __str__(self) -> str: - """String representation of LocationInfo.""" - return f"LocationInfo(line={self.line}, column={self.column}, confidence={self.confidence.value})" - - def __repr__(self) -> str: - """Detailed string representation.""" - return f"LocationInfo(line={self.line}, column={self.column}, confidence={self.confidence.value}, method={self.method})" \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/resolver.py b/src/code_index_mcp/tools/scip/position/resolver.py deleted file mode 100644 index fdc32b2..0000000 --- a/src/code_index_mcp/tools/scip/position/resolver.py +++ /dev/null @@ -1,436 +0,0 @@ -""" -Main position resolution system. - -This module provides the PositionResolver that coordinates different position -detection strategies to find symbol positions with appropriate confidence levels. -""" - -import logging -from typing import Optional, Dict, Any, List -from .confidence import LocationInfo, ConfidenceLevel -from .strategies.scip_occurrence import SCIPOccurrenceStrategy -from .strategies.tree_sitter_strategy import TreeSitterStrategy -from .strategies.heuristic import HeuristicStrategy -from .strategies.base import PositionStrategy - -logger = logging.getLogger(__name__) - - -class PositionResolver: - """ - Main position resolution coordinator. - - This class manages multiple position detection strategies and applies them - in order of confidence level to find the best possible position for SCIP symbols. - - Strategy Order (by confidence): - 1. SCIPOccurrenceStrategy (HIGH) - Uses SCIP occurrence data - 2. TreeSitterStrategy (MEDIUM) - Uses AST analysis - 3. HeuristicStrategy (LOW) - Uses pattern matching and estimation - """ - - def __init__(self): - """Initialize the position resolver with default strategies.""" - self._strategies: List[PositionStrategy] = [] - self._strategy_cache: Dict[str, PositionStrategy] = {} - self._resolution_cache: Dict[str, LocationInfo] = {} - self._setup_default_strategies() - - def _setup_default_strategies(self) -> None: - """Setup default position detection strategies in order of confidence.""" - self._strategies = [ - SCIPOccurrenceStrategy(), # Highest confidence - TreeSitterStrategy(), # Medium confidence - HeuristicStrategy() # Lowest confidence (fallback) - ] - - # Build strategy cache for quick lookup - for strategy in self._strategies: - self._strategy_cache[strategy.name] = strategy - - logger.debug(f"Initialized position resolver with {len(self._strategies)} strategies") - - def resolve_position( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None, - preferred_confidence: Optional[ConfidenceLevel] = None - ) -> Optional[LocationInfo]: - """ - Resolve position for a SCIP symbol using the best available strategy. - - Args: - scip_symbol: SCIP symbol identifier - document: Document containing source text or SCIP data - context: Optional context information (file path, project info, etc.) - preferred_confidence: Minimum confidence level required - - Returns: - LocationInfo with the best confidence available, or None if not found - """ - if not scip_symbol: - return None - - # Check cache first - cache_key = self._create_cache_key(scip_symbol, context) - if cache_key in self._resolution_cache: - cached_result = self._resolution_cache[cache_key] - if self._meets_confidence_requirement(cached_result, preferred_confidence): - return cached_result - - # Try strategies in order of confidence - best_location = None - - for strategy in self._strategies: - try: - # Check if strategy can handle this symbol - if not strategy.can_handle_symbol(scip_symbol, document): - continue - - # Try to resolve position - location = strategy.try_resolve(scip_symbol, document, context) - - if location: - # Add strategy information to metadata - location.add_metadata('strategy_used', strategy.name) - location.add_metadata('strategy_confidence', strategy.get_confidence_level().value) - - # Check if this meets our confidence requirements - if self._meets_confidence_requirement(location, preferred_confidence): - # Cache and return immediately if confidence requirement is met - self._resolution_cache[cache_key] = location - logger.debug(f"Resolved {scip_symbol} using {strategy.name} with {location.confidence.value} confidence") - return location - - # Keep track of best location found so far - if not best_location or location.confidence > best_location.confidence: - best_location = location - - except Exception as e: - logger.debug(f"Strategy {strategy.name} failed for {scip_symbol}: {e}") - continue - - # Cache the best result found (even if it doesn't meet preferred confidence) - if best_location: - self._resolution_cache[cache_key] = best_location - logger.debug(f"Resolved {scip_symbol} using fallback with {best_location.confidence.value} confidence") - - return best_location - - def resolve_multiple_positions( - self, - symbols: List[str], - document, - context: Optional[Dict[str, Any]] = None - ) -> Dict[str, Optional[LocationInfo]]: - """ - Resolve positions for multiple SCIP symbols efficiently. - - Args: - symbols: List of SCIP symbol identifiers - document: Document containing source text or SCIP data - context: Optional context information - - Returns: - Dictionary mapping symbol -> LocationInfo (or None if not found) - """ - results = {} - - for symbol in symbols: - results[symbol] = self.resolve_position(symbol, document, context) - - return results - - def try_strategy( - self, - strategy_name: str, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Optional[LocationInfo]: - """ - Try a specific strategy to resolve a position. - - Args: - strategy_name: Name of the strategy to use - scip_symbol: SCIP symbol identifier - document: Document containing source text or SCIP data - context: Optional context information - - Returns: - LocationInfo if the strategy succeeds, None otherwise - """ - if strategy_name not in self._strategy_cache: - logger.warning(f"Unknown strategy: {strategy_name}") - return None - - strategy = self._strategy_cache[strategy_name] - - if not strategy.can_handle_symbol(scip_symbol, document): - return None - - try: - location = strategy.try_resolve(scip_symbol, document, context) - if location: - location.add_metadata('strategy_used', strategy.name) - location.add_metadata('strategy_confidence', strategy.get_confidence_level().value) - return location - except Exception as e: - logger.debug(f"Strategy {strategy_name} failed for {scip_symbol}: {e}") - return None - - def get_available_strategies(self) -> List[str]: - """ - Get list of available strategy names. - - Returns: - List of strategy names - """ - return [strategy.name for strategy in self._strategies] - - def get_strategy_info(self) -> List[Dict[str, Any]]: - """ - Get information about all available strategies. - - Returns: - List of dictionaries with strategy information - """ - return [ - { - 'name': strategy.name, - 'confidence_level': strategy.get_confidence_level().value, - 'description': strategy.__class__.__doc__.strip().split('\n')[0] if strategy.__class__.__doc__ else '' - } - for strategy in self._strategies - ] - - def add_strategy(self, strategy: PositionStrategy, priority: Optional[int] = None) -> None: - """ - Add a custom position detection strategy. - - Args: - strategy: PositionStrategy instance to add - priority: Optional priority (lower number = higher priority) - If None, adds at appropriate position based on confidence - """ - if priority is not None: - self._strategies.insert(priority, strategy) - else: - # Insert based on confidence level - inserted = False - for i, existing_strategy in enumerate(self._strategies): - if strategy.get_confidence_level() > existing_strategy.get_confidence_level(): - self._strategies.insert(i, strategy) - inserted = True - break - - if not inserted: - self._strategies.append(strategy) - - # Update cache - self._strategy_cache[strategy.name] = strategy - - logger.debug(f"Added strategy {strategy.name} with {strategy.get_confidence_level().value} confidence") - - def remove_strategy(self, strategy_name: str) -> bool: - """ - Remove a strategy by name. - - Args: - strategy_name: Name of the strategy to remove - - Returns: - True if strategy was removed, False if not found - """ - if strategy_name not in self._strategy_cache: - return False - - strategy = self._strategy_cache[strategy_name] - self._strategies.remove(strategy) - del self._strategy_cache[strategy_name] - - logger.debug(f"Removed strategy {strategy_name}") - return True - - def clear_cache(self) -> None: - """Clear all cached resolution results.""" - self._resolution_cache.clear() - logger.debug("Cleared position resolution cache") - - def get_cache_stats(self) -> Dict[str, Any]: - """ - Get cache statistics. - - Returns: - Dictionary with cache statistics - """ - return { - 'cache_size': len(self._resolution_cache), - 'strategies_count': len(self._strategies), - 'strategy_names': self.get_available_strategies() - } - - def find_best_positions( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None, - max_results: int = 3 - ) -> List[LocationInfo]: - """ - Find multiple possible positions for a symbol using different strategies. - - Args: - scip_symbol: SCIP symbol identifier - document: Document containing source text or SCIP data - context: Optional context information - max_results: Maximum number of results to return - - Returns: - List of LocationInfo objects sorted by confidence - """ - positions = [] - - for strategy in self._strategies[:max_results]: - try: - if strategy.can_handle_symbol(scip_symbol, document): - location = strategy.try_resolve(scip_symbol, document, context) - if location: - location.add_metadata('strategy_used', strategy.name) - location.add_metadata('strategy_confidence', strategy.get_confidence_level().value) - positions.append(location) - except Exception as e: - logger.debug(f"Strategy {strategy.name} failed for {scip_symbol}: {e}") - - # Sort by confidence level (highest first) - positions.sort(key=lambda x: x.confidence, reverse=True) - - return positions[:max_results] - - def _create_cache_key(self, scip_symbol: str, context: Optional[Dict[str, Any]]) -> str: - """Create a cache key for resolution results.""" - if not context: - return scip_symbol - - # Include relevant context in cache key - relevant_keys = ['file_path', 'language', 'project_path'] - context_parts = [] - - for key in relevant_keys: - if key in context: - context_parts.append(f"{key}:{context[key]}") - - if context_parts: - return f"{scip_symbol}#{':'.join(context_parts)}" - return scip_symbol - - def _meets_confidence_requirement( - self, - location: LocationInfo, - preferred_confidence: Optional[ConfidenceLevel] - ) -> bool: - """Check if location meets the preferred confidence requirement.""" - if preferred_confidence is None: - return True - return location.confidence >= preferred_confidence - - def diagnose_resolution( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Dict[str, Any]: - """ - Diagnose position resolution for debugging purposes. - - Args: - scip_symbol: SCIP symbol identifier - document: Document containing source text or SCIP data - context: Optional context information - - Returns: - Dictionary with diagnostic information - """ - diagnosis = { - 'symbol': scip_symbol, - 'strategies_tested': [], - 'successful_strategies': [], - 'failed_strategies': [], - 'best_result': None, - 'context_available': context is not None, - 'document_type': type(document).__name__ - } - - for strategy in self._strategies: - strategy_info = { - 'name': strategy.name, - 'confidence_level': strategy.get_confidence_level().value, - 'can_handle': False, - 'result': None, - 'error': None - } - - try: - strategy_info['can_handle'] = strategy.can_handle_symbol(scip_symbol, document) - - if strategy_info['can_handle']: - location = strategy.try_resolve(scip_symbol, document, context) - if location: - strategy_info['result'] = location.to_dict() - diagnosis['successful_strategies'].append(strategy.name) - - if not diagnosis['best_result'] or location.confidence > ConfidenceLevel(diagnosis['best_result']['confidence']): - diagnosis['best_result'] = location.to_dict() - else: - diagnosis['failed_strategies'].append(strategy.name) - else: - diagnosis['failed_strategies'].append(strategy.name) - - except Exception as e: - strategy_info['error'] = str(e) - diagnosis['failed_strategies'].append(strategy.name) - - diagnosis['strategies_tested'].append(strategy_info) - - return diagnosis - - -# Global resolver instance for convenience -_resolver_instance: Optional[PositionResolver] = None - - -def get_position_resolver() -> PositionResolver: - """ - Get the global position resolver instance. - - Returns: - Global PositionResolver instance - """ - global _resolver_instance - if _resolver_instance is None: - _resolver_instance = PositionResolver() - return _resolver_instance - - -def resolve_position( - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None, - preferred_confidence: Optional[ConfidenceLevel] = None -) -> Optional[LocationInfo]: - """ - Convenience function to resolve a position using the global resolver. - - Args: - scip_symbol: SCIP symbol identifier - document: Document containing source text or SCIP data - context: Optional context information - preferred_confidence: Minimum confidence level required - - Returns: - LocationInfo with the best confidence available, or None if not found - """ - return get_position_resolver().resolve_position( - scip_symbol, document, context, preferred_confidence - ) \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/__init__.py b/src/code_index_mcp/tools/scip/position/strategies/__init__.py deleted file mode 100644 index 9d63180..0000000 --- a/src/code_index_mcp/tools/scip/position/strategies/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -Position detection strategies. - -This package provides different strategies for detecting symbol positions -with varying levels of confidence and accuracy. -""" - -from .base import PositionStrategy -from .scip_occurrence import SCIPOccurrenceStrategy -from .tree_sitter_strategy import TreeSitterStrategy -from .heuristic import HeuristicStrategy - -__all__ = [ - 'PositionStrategy', - 'SCIPOccurrenceStrategy', - 'TreeSitterStrategy', - 'HeuristicStrategy' -] \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/base.py b/src/code_index_mcp/tools/scip/position/strategies/base.py deleted file mode 100644 index c8959c1..0000000 --- a/src/code_index_mcp/tools/scip/position/strategies/base.py +++ /dev/null @@ -1,185 +0,0 @@ -""" -Base position detection strategy. - -This module provides the abstract base class for all position detection strategies. -""" - -import logging -from abc import ABC, abstractmethod -from typing import Optional, Dict, Any -from ..confidence import LocationInfo, ConfidenceLevel - -logger = logging.getLogger(__name__) - - -class PositionStrategy(ABC): - """ - Abstract base class for position detection strategies. - - Each strategy implements a different approach to detecting symbol positions - with varying levels of accuracy and confidence. - """ - - def __init__(self, name: str): - """ - Initialize the position strategy. - - Args: - name: Human-readable name for this strategy - """ - self.name = name - self._stats = { - 'attempts': 0, - 'successes': 0, - 'failures': 0 - } - - @abstractmethod - def try_resolve( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Optional[LocationInfo]: - """ - Attempt to resolve symbol position using this strategy. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document containing symbols and occurrences - context: Optional context information (symbol parser, etc.) - - Returns: - LocationInfo if position found, None otherwise - """ - pass - - @abstractmethod - def get_confidence_level(self) -> ConfidenceLevel: - """ - Return the confidence level this strategy typically provides. - - Returns: - ConfidenceLevel for this strategy's results - """ - pass - - def get_priority(self) -> int: - """ - Get priority for this strategy (higher = tried first). - - Returns: - Priority value (0-100, where 100 is highest priority) - """ - # Map confidence levels to priorities - confidence_priorities = { - ConfidenceLevel.HIGH: 90, - ConfidenceLevel.MEDIUM: 60, - ConfidenceLevel.LOW: 30, - ConfidenceLevel.UNKNOWN: 10 - } - return confidence_priorities.get(self.get_confidence_level(), 50) - - def can_handle_symbol(self, scip_symbol: str, document) -> bool: - """ - Check if this strategy can handle the given symbol. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document - - Returns: - True if strategy can attempt to resolve this symbol - """ - # Default implementation: can handle any symbol - return True - - def resolve( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Optional[LocationInfo]: - """ - Public method to resolve position with statistics tracking. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document - context: Optional context information - - Returns: - LocationInfo if position found, None otherwise - """ - self._stats['attempts'] += 1 - - try: - if not self.can_handle_symbol(scip_symbol, document): - self._stats['failures'] += 1 - return None - - result = self.try_resolve(scip_symbol, document, context) - - if result is not None: - self._stats['successes'] += 1 - # Ensure the result has proper metadata - if not result.metadata: - result.metadata = {} - result.metadata['strategy'] = self.name - result.metadata['strategy_confidence'] = self.get_confidence_level().value - - logger.debug(f"Strategy '{self.name}' resolved {scip_symbol} at {result.line}:{result.column}") - return result - else: - self._stats['failures'] += 1 - return None - - except Exception as e: - self._stats['failures'] += 1 - logger.debug(f"Strategy '{self.name}' failed for {scip_symbol}: {e}") - return None - - def get_success_rate(self) -> float: - """ - Get success rate for this strategy. - - Returns: - Success rate as a float between 0.0 and 1.0 - """ - if self._stats['attempts'] == 0: - return 0.0 - return self._stats['successes'] / self._stats['attempts'] - - def get_stats(self) -> Dict[str, Any]: - """ - Get statistics for this strategy. - - Returns: - Dictionary with strategy statistics - """ - return { - 'name': self.name, - 'confidence_level': self.get_confidence_level().value, - 'priority': self.get_priority(), - 'success_rate': self.get_success_rate(), - **self._stats - } - - def reset_stats(self) -> None: - """Reset strategy statistics.""" - self._stats = { - 'attempts': 0, - 'successes': 0, - 'failures': 0 - } - logger.debug(f"Reset statistics for strategy '{self.name}'") - - def __str__(self) -> str: - """String representation of the strategy.""" - return f"{self.__class__.__name__}(name='{self.name}', confidence={self.get_confidence_level().value})" - - def __repr__(self) -> str: - """Detailed string representation.""" - return (f"{self.__class__.__name__}(name='{self.name}', " - f"confidence={self.get_confidence_level().value}, " - f"success_rate={self.get_success_rate():.2f})") \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/heuristic.py b/src/code_index_mcp/tools/scip/position/strategies/heuristic.py deleted file mode 100644 index 2449e21..0000000 --- a/src/code_index_mcp/tools/scip/position/strategies/heuristic.py +++ /dev/null @@ -1,568 +0,0 @@ -""" -Heuristic-based position detection strategy. - -This strategy uses heuristic analysis and pattern matching to find symbol -positions with low confidence as a fallback when other methods fail. -""" - -import logging -import re -from typing import Optional, Dict, Any, List, Tuple -from .base import PositionStrategy -from ..confidence import LocationInfo, ConfidenceLevel - -logger = logging.getLogger(__name__) - - -class HeuristicStrategy(PositionStrategy): - """ - Heuristic-based position detection strategy. - - This strategy provides low confidence position detection using - pattern matching, text search, and educated guesses when more - reliable methods are not available. - """ - - def __init__(self): - """Initialize the heuristic strategy.""" - super().__init__("heuristic") - self._common_patterns = self._build_common_patterns() - - def _build_common_patterns(self) -> Dict[str, List[Dict[str, Any]]]: - """Build common symbol detection patterns across languages.""" - return { - 'function_patterns': [ - { - 'pattern': r'\bdef\s+{name}\s*\(', - 'language': 'python', - 'confidence_boost': 0.8, - 'description': 'Python function definition' - }, - { - 'pattern': r'\bfunction\s+{name}\s*\(', - 'language': 'javascript', - 'confidence_boost': 0.8, - 'description': 'JavaScript function declaration' - }, - { - 'pattern': r'\bfn\s+{name}\s*\(', - 'language': 'zig', - 'confidence_boost': 0.8, - 'description': 'Zig function definition' - }, - { - 'pattern': r'\b{name}\s*=\s*function', - 'language': 'javascript', - 'confidence_boost': 0.7, - 'description': 'JavaScript function expression' - }, - { - 'pattern': r'\b{name}\s*=\s*\([^)]*\)\s*=>', - 'language': 'javascript', - 'confidence_boost': 0.7, - 'description': 'JavaScript arrow function' - } - ], - 'class_patterns': [ - { - 'pattern': r'\bclass\s+{name}\s*[:({{]', - 'language': 'python', - 'confidence_boost': 0.9, - 'description': 'Python class definition' - }, - { - 'pattern': r'\bclass\s+{name}\s*\{{', - 'language': 'javascript', - 'confidence_boost': 0.9, - 'description': 'JavaScript class declaration' - }, - { - 'pattern': r'\b@interface\s+{name}\s*[:(]', - 'language': 'objective-c', - 'confidence_boost': 0.9, - 'description': 'Objective-C interface declaration' - } - ], - 'variable_patterns': [ - { - 'pattern': r'\b{name}\s*=', - 'language': 'general', - 'confidence_boost': 0.5, - 'description': 'Variable assignment' - }, - { - 'pattern': r'\bconst\s+{name}\s*=', - 'language': 'javascript', - 'confidence_boost': 0.7, - 'description': 'JavaScript const declaration' - }, - { - 'pattern': r'\blet\s+{name}\s*=', - 'language': 'javascript', - 'confidence_boost': 0.7, - 'description': 'JavaScript let declaration' - }, - { - 'pattern': r'\bvar\s+{name}\s*=', - 'language': 'javascript', - 'confidence_boost': 0.6, - 'description': 'JavaScript var declaration' - } - ], - 'import_patterns': [ - { - 'pattern': r'\bfrom\s+\S+\s+import\s+.*{name}', - 'language': 'python', - 'confidence_boost': 0.6, - 'description': 'Python import statement' - }, - { - 'pattern': r'\bimport\s+.*{name}', - 'language': 'python', - 'confidence_boost': 0.6, - 'description': 'Python import statement' - }, - { - 'pattern': r'\bimport\s+\{{.*{name}.*\}}', - 'language': 'javascript', - 'confidence_boost': 0.6, - 'description': 'JavaScript named import' - } - ] - } - - def get_confidence_level(self) -> ConfidenceLevel: - """Heuristic analysis provides low confidence positions.""" - return ConfidenceLevel.LOW - - def can_handle_symbol(self, scip_symbol: str, document) -> bool: - """ - Check if we can attempt heuristic analysis for this symbol. - - Args: - scip_symbol: SCIP symbol identifier - document: Document context - - Returns: - Always True as this is the fallback strategy - """ - # Heuristic strategy can always attempt to find a symbol - return True - - def try_resolve( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Optional[LocationInfo]: - """ - Try to resolve position using heuristic analysis. - - Args: - scip_symbol: SCIP symbol identifier - document: Document containing source text or metadata - context: Optional context information - - Returns: - LocationInfo with low confidence if found, None otherwise - """ - # Get source text - source_text = self._get_source_text(document, context) - if not source_text: - return None - - # Parse symbol information - symbol_info = self._parse_symbol(scip_symbol) - if not symbol_info: - return None - - # Try different heuristic approaches in order of confidence - strategies = [ - self._find_by_definition_patterns, - self._find_by_usage_patterns, - self._find_by_text_search, - self._find_by_line_estimation - ] - - best_location = None - best_confidence_score = 0.0 - - for strategy_func in strategies: - try: - location = strategy_func(source_text, symbol_info, context) - if location: - confidence_score = location.metadata.get('confidence_score', 0.0) - if confidence_score > best_confidence_score: - best_location = location - best_confidence_score = confidence_score - except Exception as e: - logger.debug(f"Heuristic strategy failed: {strategy_func.__name__}: {e}") - - return best_location - - def _get_source_text(self, document, context: Optional[Dict[str, Any]]) -> Optional[str]: - """Extract source text from document or context.""" - # Try context first - if context: - if 'source_text' in context: - return context['source_text'] - if 'file_content' in context: - return context['file_content'] - - # Try document - if hasattr(document, 'text') and document.text: - return document.text - if hasattr(document, 'content') and document.content: - return document.content - - # Try reading from file path - if context and 'file_path' in context: - try: - with open(context['file_path'], 'r', encoding='utf-8') as f: - return f.read() - except (OSError, UnicodeDecodeError) as e: - logger.debug(f"Failed to read source file: {e}") - - return None - - def _parse_symbol(self, scip_symbol: str) -> Optional[Dict[str, Any]]: - """Parse SCIP symbol to extract useful information.""" - try: - info = { - 'original': scip_symbol, - 'name': None, - 'type': 'unknown', - 'scope': [], - 'language': None - } - - # Extract from SCIP symbol format - if scip_symbol.startswith('local '): - local_part = scip_symbol[6:] - - # Remove descriptor suffix - if local_part.endswith('.'): - local_part = local_part[:-1] - - # Parse different symbol types - if '(' in local_part: - # Function-like symbol - base_name = local_part.split('(')[0] - info['type'] = 'function' - elif local_part.count('.') > 0: - # Nested symbol (method, attribute, etc.) - parts = local_part.split('.') - base_name = parts[-1] - info['scope'] = parts[:-1] - info['type'] = 'method' if len(parts) > 1 else 'attribute' - else: - # Simple identifier - base_name = local_part - info['type'] = 'identifier' - - # Clean up name - if '/' in base_name: - info['name'] = base_name.split('/')[-1] - else: - info['name'] = base_name - - # Try to infer language - info['language'] = self._infer_language(scip_symbol) - - return info - - except Exception as e: - logger.debug(f"Failed to parse symbol {scip_symbol}: {e}") - - return None - - def _infer_language(self, scip_symbol: str) -> Optional[str]: - """Infer programming language from SCIP symbol.""" - symbol_lower = scip_symbol.lower() - - if '.py' in symbol_lower or 'python' in symbol_lower: - return 'python' - elif '.js' in symbol_lower or '.ts' in symbol_lower or 'javascript' in symbol_lower: - return 'javascript' - elif '.zig' in symbol_lower: - return 'zig' - elif '.java' in symbol_lower: - return 'java' - elif '.m' in symbol_lower or '.mm' in symbol_lower or 'objc' in symbol_lower: - return 'objective-c' - elif '.go' in symbol_lower: - return 'go' - elif '.rs' in symbol_lower: - return 'rust' - - return None - - def _find_by_definition_patterns( - self, - source_text: str, - symbol_info: Dict[str, Any], - context: Optional[Dict[str, Any]] - ) -> Optional[LocationInfo]: - """Find symbol using definition patterns.""" - symbol_name = symbol_info['name'] - symbol_type = symbol_info['type'] - language = symbol_info['language'] - - if not symbol_name: - return None - - # Get relevant patterns based on symbol type - pattern_groups = [] - if symbol_type == 'function': - pattern_groups.append(self._common_patterns['function_patterns']) - elif symbol_type in ['class', 'identifier']: - pattern_groups.append(self._common_patterns['class_patterns']) - pattern_groups.append(self._common_patterns['variable_patterns']) - else: - pattern_groups.append(self._common_patterns['variable_patterns']) - - best_match = None - best_confidence = 0.0 - - for patterns in pattern_groups: - for pattern_info in patterns: - # Filter by language if known - if language and pattern_info['language'] != 'general' and pattern_info['language'] != language: - continue - - # Format pattern with symbol name - pattern = pattern_info['pattern'].format(name=re.escape(symbol_name)) - - match = re.search(pattern, source_text, re.MULTILINE | re.IGNORECASE) - if match: - confidence = pattern_info['confidence_boost'] - if confidence > best_confidence: - best_confidence = confidence - best_match = (match, pattern_info) - - if best_match: - match, pattern_info = best_match - line_num = source_text[:match.start()].count('\n') + 1 - line_start = source_text.rfind('\n', 0, match.start()) + 1 - column_num = match.start() - line_start + 1 - - return LocationInfo.from_heuristic( - line=line_num, - column=column_num, - heuristic_type="definition_pattern", - method=f"heuristic_pattern_{pattern_info['language']}" - ) - - return None - - def _find_by_usage_patterns( - self, - source_text: str, - symbol_info: Dict[str, Any], - context: Optional[Dict[str, Any]] - ) -> Optional[LocationInfo]: - """Find symbol by looking for usage patterns.""" - symbol_name = symbol_info['name'] - - if not symbol_name: - return None - - # Look for the symbol in import statements first - import_patterns = self._common_patterns['import_patterns'] - - for pattern_info in import_patterns: - pattern = pattern_info['pattern'].format(name=re.escape(symbol_name)) - match = re.search(pattern, source_text, re.MULTILINE) - - if match: - line_num = source_text[:match.start()].count('\n') + 1 - line_start = source_text.rfind('\n', 0, match.start()) + 1 - column_num = match.start() - line_start + 1 - - metadata = { - 'confidence_score': 0.6, - 'usage_type': 'import', - 'pattern_description': pattern_info['description'] - } - - location = LocationInfo.from_heuristic( - line=line_num, - column=column_num, - heuristic_type="usage_pattern", - method="heuristic_import" - ) - location.metadata.update(metadata) - return location - - return None - - def _find_by_text_search( - self, - source_text: str, - symbol_info: Dict[str, Any], - context: Optional[Dict[str, Any]] - ) -> Optional[LocationInfo]: - """Find symbol using simple text search.""" - symbol_name = symbol_info['name'] - - if not symbol_name or len(symbol_name) < 2: - return None - - # Look for word boundary matches - pattern = rf'\b{re.escape(symbol_name)}\b' - matches = list(re.finditer(pattern, source_text)) - - if matches: - # Use the first match (usually the definition) - match = matches[0] - line_num = source_text[:match.start()].count('\n') + 1 - line_start = source_text.rfind('\n', 0, match.start()) + 1 - column_num = match.start() - line_start + 1 - - metadata = { - 'confidence_score': 0.3, - 'total_matches': len(matches), - 'search_method': 'text_search' - } - - location = LocationInfo.from_heuristic( - line=line_num, - column=column_num, - heuristic_type="text_search", - method="heuristic_text_search" - ) - location.metadata.update(metadata) - return location - - return None - - def _find_by_line_estimation( - self, - source_text: str, - symbol_info: Dict[str, Any], - context: Optional[Dict[str, Any]] - ) -> Optional[LocationInfo]: - """Estimate position based on file structure and symbol type.""" - total_lines = source_text.count('\n') + 1 - - # Make educated guesses based on symbol type and common patterns - estimated_line = 1 - confidence_score = 0.1 - - symbol_type = symbol_info['type'] - - if symbol_type == 'function': - # Functions often appear in the middle of files - estimated_line = max(1, total_lines // 3) - confidence_score = 0.2 - elif symbol_type == 'class': - # Classes often appear early in files - estimated_line = max(1, total_lines // 4) - confidence_score = 0.15 - elif symbol_type == 'import': - # Imports usually at the top - estimated_line = min(10, total_lines // 10) - confidence_score = 0.25 - else: - # Default to somewhere in the first half - estimated_line = max(1, total_lines // 2) - - metadata = { - 'confidence_score': confidence_score, - 'estimation_method': 'line_estimation', - 'total_lines': total_lines, - 'symbol_type': symbol_type - } - - location = LocationInfo.from_heuristic( - line=estimated_line, - column=1, - heuristic_type="line_estimation", - method="heuristic_estimation" - ) - location.metadata.update(metadata) - return location - - def find_all_occurrences( - self, - symbol_name: str, - source_text: str, - context: Optional[Dict[str, Any]] = None - ) -> List[LocationInfo]: - """ - Find all occurrences of a symbol in source text. - - Args: - symbol_name: Name of the symbol to find - source_text: Source code text - context: Optional context information - - Returns: - List of LocationInfo objects for all occurrences - """ - occurrences = [] - - if not symbol_name or len(symbol_name) < 2: - return occurrences - - # Find all word boundary matches - pattern = rf'\b{re.escape(symbol_name)}\b' - matches = re.finditer(pattern, source_text) - - for i, match in enumerate(matches): - line_num = source_text[:match.start()].count('\n') + 1 - line_start = source_text.rfind('\n', 0, match.start()) + 1 - column_num = match.start() - line_start + 1 - - metadata = { - 'occurrence_index': i, - 'confidence_score': 0.3, - 'search_method': 'all_occurrences' - } - - location = LocationInfo.from_heuristic( - line=line_num, - column=column_num, - heuristic_type="occurrence", - method="heuristic_all_occurrences" - ) - location.metadata.update(metadata) - occurrences.append(location) - - return occurrences - - def get_heuristic_confidence( - self, - symbol_info: Dict[str, Any], - context: Optional[Dict[str, Any]] = None - ) -> float: - """ - Calculate heuristic confidence score for a symbol. - - Args: - symbol_info: Parsed symbol information - context: Optional context information - - Returns: - Confidence score between 0.0 and 1.0 - """ - base_confidence = 0.3 # Base confidence for heuristic methods - - # Boost confidence based on symbol characteristics - if symbol_info.get('type') == 'function': - base_confidence += 0.2 - elif symbol_info.get('type') == 'class': - base_confidence += 0.15 - - # Boost if we have language information - if symbol_info.get('language'): - base_confidence += 0.1 - - # Boost if symbol name is longer (less likely to be false positive) - name_length = len(symbol_info.get('name', '')) - if name_length > 5: - base_confidence += 0.1 - elif name_length > 10: - base_confidence += 0.15 - - return min(1.0, base_confidence) \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py b/src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py deleted file mode 100644 index 1d1c257..0000000 --- a/src/code_index_mcp/tools/scip/position/strategies/scip_occurrence.py +++ /dev/null @@ -1,236 +0,0 @@ -""" -SCIP occurrence-based position detection strategy. - -This strategy uses SCIP occurrence data to find exact symbol positions -with high confidence. -""" - -import logging -from typing import Optional, Dict, Any -from .base import PositionStrategy -from ..confidence import LocationInfo, ConfidenceLevel - -logger = logging.getLogger(__name__) - -# Try to import SCIP protobuf definitions -try: - from ....scip.proto import scip_pb2 - SCIP_PROTO_AVAILABLE = True -except ImportError: - scip_pb2 = None - SCIP_PROTO_AVAILABLE = False - - -class SCIPOccurrenceStrategy(PositionStrategy): - """ - SCIP occurrence-based position detection strategy. - - This strategy provides the highest confidence position detection by - using SCIP occurrence data which contains exact position information - from the original indexing process. - """ - - def __init__(self): - """Initialize the SCIP occurrence strategy.""" - super().__init__("scip_occurrence") - - def get_confidence_level(self) -> ConfidenceLevel: - """SCIP occurrences provide high confidence positions.""" - return ConfidenceLevel.HIGH - - def can_handle_symbol(self, scip_symbol: str, document) -> bool: - """ - Check if document has occurrences for the symbol. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document - - Returns: - True if document has occurrences we can search - """ - return hasattr(document, 'occurrences') and document.occurrences - - def try_resolve( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Optional[LocationInfo]: - """ - Try to resolve position using SCIP occurrence data. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document containing occurrences - context: Optional context information - - Returns: - LocationInfo with high confidence if found, None otherwise - """ - # Strategy 1: Look for definition occurrence first (most reliable) - location = self._find_definition_occurrence(scip_symbol, document) - if location: - location.add_metadata('occurrence_type', 'definition') - return location - - # Strategy 2: Look for any occurrence with position data - location = self._find_any_occurrence(scip_symbol, document) - if location: - location.add_metadata('occurrence_type', 'reference') - return location - - # No occurrences found for this symbol - return None - - def _find_definition_occurrence(self, scip_symbol: str, document) -> Optional[LocationInfo]: - """ - Find the definition occurrence for a symbol. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document - - Returns: - LocationInfo if definition found, None otherwise - """ - for occurrence in document.occurrences: - if occurrence.symbol == scip_symbol and self._is_definition(occurrence): - location = self._parse_occurrence_location(occurrence) - if location: - location.add_metadata('is_definition', True) - return location - return None - - def _find_any_occurrence(self, scip_symbol: str, document) -> Optional[LocationInfo]: - """ - Find any occurrence with location data for a symbol. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document - - Returns: - LocationInfo if any occurrence found, None otherwise - """ - for occurrence in document.occurrences: - if occurrence.symbol == scip_symbol: - location = self._parse_occurrence_location(occurrence) - if location: - location.add_metadata('is_definition', self._is_definition(occurrence)) - location.add_metadata('symbol_roles', getattr(occurrence, 'symbol_roles', 0)) - return location - return None - - def _is_definition(self, occurrence) -> bool: - """ - Check if an occurrence represents a definition. - - Args: - occurrence: SCIP occurrence object - - Returns: - True if this occurrence is a definition - """ - if not hasattr(occurrence, 'symbol_roles'): - return False - - try: - if SCIP_PROTO_AVAILABLE: - return bool(occurrence.symbol_roles & scip_pb2.SymbolRole.Definition) - else: - # Fallback: Definition role = 1 - return bool(occurrence.symbol_roles & 1) - except (AttributeError, TypeError): - return False - - def _parse_occurrence_location(self, occurrence) -> Optional[LocationInfo]: - """ - Parse location information from SCIP occurrence. - - Args: - occurrence: SCIP occurrence object - - Returns: - LocationInfo if parsing successful, None otherwise - """ - try: - if not hasattr(occurrence, 'range') or not occurrence.range: - return None - - range_obj = occurrence.range - if not hasattr(range_obj, 'start') or not range_obj.start: - return None - - start = range_obj.start - if len(start) >= 2: - # SCIP uses 0-based indexing, convert to 1-based - line = start[0] + 1 - column = start[1] + 1 - - # Create LocationInfo with metadata - metadata = { - 'scip_range_available': True, - 'range_length': len(start), - 'raw_line': start[0], - 'raw_column': start[1] - } - - # Add end position if available - if hasattr(range_obj, 'end') and range_obj.end and len(range_obj.end) >= 2: - metadata.update({ - 'end_line': range_obj.end[0] + 1, - 'end_column': range_obj.end[1] + 1, - 'span_lines': range_obj.end[0] - start[0] + 1 - }) - - return LocationInfo( - line=line, - column=column, - confidence=ConfidenceLevel.HIGH, - method="scip_occurrence", - metadata=metadata - ) - - except (AttributeError, IndexError, TypeError) as e: - logger.debug(f"Error parsing occurrence location: {e}") - - return None - - def get_occurrence_info(self, scip_symbol: str, document) -> Dict[str, Any]: - """ - Get detailed information about occurrences for a symbol. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document - - Returns: - Dictionary with occurrence statistics and information - """ - info = { - 'total_occurrences': 0, - 'definition_occurrences': 0, - 'reference_occurrences': 0, - 'occurrences_with_position': 0, - 'role_distribution': {} - } - - for occurrence in document.occurrences: - if occurrence.symbol == scip_symbol: - info['total_occurrences'] += 1 - - if self._is_definition(occurrence): - info['definition_occurrences'] += 1 - else: - info['reference_occurrences'] += 1 - - if self._parse_occurrence_location(occurrence): - info['occurrences_with_position'] += 1 - - # Track role distribution - roles = getattr(occurrence, 'symbol_roles', 0) - role_key = str(roles) - info['role_distribution'][role_key] = info['role_distribution'].get(role_key, 0) + 1 - - return info \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py b/src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py deleted file mode 100644 index 8db6fd8..0000000 --- a/src/code_index_mcp/tools/scip/position/strategies/tree_sitter_strategy.py +++ /dev/null @@ -1,523 +0,0 @@ -""" -Tree-sitter AST-based position detection strategy. - -This strategy uses Tree-sitter AST analysis to find symbol positions -with medium confidence by parsing source code. -""" - -import logging -import re -from typing import Optional, Dict, Any, List, Tuple -from .base import PositionStrategy -from ..confidence import LocationInfo, ConfidenceLevel - -logger = logging.getLogger(__name__) - -# Try to import tree-sitter -try: - import tree_sitter - from tree_sitter import Language, Parser - TREE_SITTER_AVAILABLE = True -except ImportError: - tree_sitter = None - Language = None - Parser = None - TREE_SITTER_AVAILABLE = False - - -class TreeSitterStrategy(PositionStrategy): - """ - Tree-sitter AST-based position detection strategy. - - This strategy provides medium confidence position detection by - parsing source code with Tree-sitter and analyzing the AST structure - to find symbol definitions and references. - """ - - def __init__(self): - """Initialize the Tree-sitter strategy.""" - super().__init__("tree_sitter") - self._parsers: Dict[str, Parser] = {} - self._languages: Dict[str, Language] = {} - self._setup_parsers() - - def _setup_parsers(self) -> None: - """Setup Tree-sitter parsers for supported languages.""" - if not TREE_SITTER_AVAILABLE: - logger.debug("Tree-sitter not available, TreeSitterStrategy will have limited functionality") - return - - # Language configurations with their Tree-sitter names - language_configs = { - 'python': 'python', - 'javascript': 'javascript', - 'typescript': 'typescript', - 'zig': 'zig', - 'java': 'java', - 'objective-c': 'objc', - 'c': 'c', - 'cpp': 'cpp', - 'go': 'go', - 'rust': 'rust', - } - - for lang_name, ts_name in language_configs.items(): - try: - # This would typically load pre-compiled language libraries - # For now, we'll just track which languages we support - self._languages[lang_name] = ts_name - logger.debug(f"Configured Tree-sitter support for {lang_name}") - except Exception as e: - logger.debug(f"Failed to setup Tree-sitter for {lang_name}: {e}") - - def get_confidence_level(self) -> ConfidenceLevel: - """Tree-sitter AST analysis provides medium confidence positions.""" - return ConfidenceLevel.MEDIUM - - def can_handle_symbol(self, scip_symbol: str, document) -> bool: - """ - Check if we can handle this symbol with Tree-sitter analysis. - - Args: - scip_symbol: SCIP symbol identifier - document: Document context (may contain language info) - - Returns: - True if Tree-sitter is available and language is supported - """ - if not TREE_SITTER_AVAILABLE: - return False - - # Try to detect language from symbol or document - language = self._detect_language(scip_symbol, document) - return language is not None and language in self._languages - - def try_resolve( - self, - scip_symbol: str, - document, - context: Optional[Dict[str, Any]] = None - ) -> Optional[LocationInfo]: - """ - Try to resolve position using Tree-sitter AST analysis. - - Args: - scip_symbol: SCIP symbol identifier - document: Document containing source text - context: Optional context information - - Returns: - LocationInfo with medium confidence if found, None otherwise - """ - if not TREE_SITTER_AVAILABLE: - return None - - # Get source text from document or context - source_text = self._get_source_text(document, context) - if not source_text: - return None - - # Detect language - language = self._detect_language(scip_symbol, document) - if not language or language not in self._languages: - return None - - # Parse symbol to extract name and type - symbol_info = self._parse_scip_symbol(scip_symbol) - if not symbol_info: - return None - - # Try different AST-based search strategies - location = self._find_by_ast_analysis(source_text, language, symbol_info) - if location: - location.add_metadata('ast_analysis', True) - location.add_metadata('language', language) - return location - - # Fallback to pattern matching with AST guidance - location = self._find_by_pattern_with_ast(source_text, language, symbol_info) - if location: - location.add_metadata('pattern_with_ast', True) - location.add_metadata('language', language) - return location - - return None - - def _get_source_text(self, document, context: Optional[Dict[str, Any]]) -> Optional[str]: - """ - Extract source text from document or context. - - Args: - document: Document object - context: Optional context information - - Returns: - Source text or None if not available - """ - # Try to get from context first - if context and 'source_text' in context: - return context['source_text'] - - # Try to get from document - if hasattr(document, 'text') and document.text: - return document.text - - if hasattr(document, 'content') and document.content: - return document.content - - # Try file path in context - if context and 'file_path' in context: - try: - with open(context['file_path'], 'r', encoding='utf-8') as f: - return f.read() - except (OSError, UnicodeDecodeError) as e: - logger.debug(f"Failed to read source file: {e}") - - return None - - def _detect_language(self, scip_symbol: str, document) -> Optional[str]: - """ - Detect programming language from symbol or document. - - Args: - scip_symbol: SCIP symbol identifier - document: Document context - - Returns: - Language name or None if not detected - """ - # Try to get from document first - if hasattr(document, 'language') and document.language: - return document.language.lower() - - # Infer from SCIP symbol patterns - if 'python' in scip_symbol or '.py' in scip_symbol: - return 'python' - elif 'javascript' in scip_symbol or '.js' in scip_symbol or 'npm' in scip_symbol: - return 'javascript' - elif 'typescript' in scip_symbol or '.ts' in scip_symbol: - return 'typescript' - elif '.zig' in scip_symbol or 'zig' in scip_symbol: - return 'zig' - elif '.java' in scip_symbol or 'java' in scip_symbol: - return 'java' - elif '.m' in scip_symbol or '.mm' in scip_symbol or 'objc' in scip_symbol: - return 'objective-c' - elif '.go' in scip_symbol: - return 'go' - elif '.rs' in scip_symbol or 'rust' in scip_symbol: - return 'rust' - - return None - - def _parse_scip_symbol(self, scip_symbol: str) -> Optional[Dict[str, Any]]: - """ - Parse SCIP symbol to extract meaningful information. - - Args: - scip_symbol: SCIP symbol identifier - - Returns: - Dictionary with symbol information or None if parsing failed - """ - try: - # Basic SCIP symbol format: "local ." - if scip_symbol.startswith('local '): - local_part = scip_symbol[6:] # Remove "local " - - # Split into local-id and descriptor - if '(' in local_part: - # Function-like symbol - name_part = local_part.split('(')[0] - symbol_type = 'function' - elif '.' in local_part: - # Method or attribute - parts = local_part.split('.') - name_part = parts[-2] if len(parts) > 1 else parts[0] - symbol_type = 'method' if len(parts) > 2 else 'attribute' - else: - # Simple identifier - name_part = local_part.rstrip('.') - symbol_type = 'identifier' - - # Extract base name - if '/' in name_part: - base_name = name_part.split('/')[-1] - else: - base_name = name_part - - return { - 'name': base_name, - 'full_name': name_part, - 'type': symbol_type, - 'scip_symbol': scip_symbol - } - - except (IndexError, AttributeError) as e: - logger.debug(f"Failed to parse SCIP symbol {scip_symbol}: {e}") - - return None - - def _find_by_ast_analysis( - self, - source_text: str, - language: str, - symbol_info: Dict[str, Any] - ) -> Optional[LocationInfo]: - """ - Find symbol position using full AST analysis. - - Args: - source_text: Source code text - language: Programming language - symbol_info: Parsed symbol information - - Returns: - LocationInfo if found, None otherwise - """ - # This would typically involve: - # 1. Parse source code with Tree-sitter - # 2. Traverse AST to find matching symbol definitions - # 3. Extract precise position information - - # For now, we'll simulate this with pattern matching - # In a real implementation, this would use tree-sitter parsing - - symbol_name = symbol_info['name'] - symbol_type = symbol_info['type'] - - # Language-specific AST-guided patterns - patterns = self._get_ast_patterns(language, symbol_type, symbol_name) - - for pattern_info in patterns: - match = re.search(pattern_info['pattern'], source_text, re.MULTILINE) - if match: - line_num = source_text[:match.start()].count('\n') + 1 - line_start = source_text.rfind('\n', 0, match.start()) + 1 - column_num = match.start() - line_start + 1 - - metadata = { - 'pattern_type': pattern_info['type'], - 'confidence_reason': pattern_info['reason'], - 'match_text': match.group()[:50], # Truncate long matches - 'ast_guided': True - } - - return LocationInfo.from_tree_sitter( - line=line_num, - column=column_num, - node_info={ - 'type': pattern_info['type'], - 'text': match.group(), - 'start_byte': match.start(), - 'end_byte': match.end() - }, - method="tree_sitter_ast" - ) - - return None - - def _find_by_pattern_with_ast( - self, - source_text: str, - language: str, - symbol_info: Dict[str, Any] - ) -> Optional[LocationInfo]: - """ - Find symbol position using pattern matching with AST guidance. - - Args: - source_text: Source code text - language: Programming language - symbol_info: Parsed symbol information - - Returns: - LocationInfo if found, None otherwise - """ - symbol_name = symbol_info['name'] - - # Simple pattern matching as fallback - # This would be enhanced with AST context in a full implementation - - # Look for function definitions, class definitions, etc. - basic_patterns = [ - rf'\bdef\s+{re.escape(symbol_name)}\s*\(', # Python function - rf'\bclass\s+{re.escape(symbol_name)}\s*[:(]', # Python class - rf'\bfunction\s+{re.escape(symbol_name)}\s*\(', # JavaScript function - rf'\b{re.escape(symbol_name)}\s*=\s*function', # JS function assignment - rf'\bconst\s+{re.escape(symbol_name)}\s*=', # JS/TS const - rf'\blet\s+{re.escape(symbol_name)}\s*=', # JS/TS let - rf'\bvar\s+{re.escape(symbol_name)}\s*=', # JS var - ] - - for pattern in basic_patterns: - match = re.search(pattern, source_text, re.MULTILINE | re.IGNORECASE) - if match: - line_num = source_text[:match.start()].count('\n') + 1 - line_start = source_text.rfind('\n', 0, match.start()) + 1 - column_num = match.start() - line_start + 1 - - metadata = { - 'pattern_match': True, - 'match_text': match.group()[:50], - 'fallback_pattern': True - } - - return LocationInfo.from_tree_sitter( - line=line_num, - column=column_num, - node_info={ - 'text': match.group(), - 'start_byte': match.start(), - 'end_byte': match.end() - }, - method="tree_sitter_pattern" - ) - - return None - - def _get_ast_patterns(self, language: str, symbol_type: str, symbol_name: str) -> List[Dict[str, Any]]: - """ - Get AST-guided patterns for symbol detection. - - Args: - language: Programming language - symbol_type: Type of symbol (function, class, etc.) - symbol_name: Name of the symbol - - Returns: - List of pattern information dictionaries - """ - escaped_name = re.escape(symbol_name) - patterns = [] - - if language == 'python': - if symbol_type == 'function': - patterns.extend([ - { - 'pattern': rf'^\s*def\s+{escaped_name}\s*\(', - 'type': 'function_definition', - 'reason': 'Python function definition pattern' - }, - { - 'pattern': rf'^\s*async\s+def\s+{escaped_name}\s*\(', - 'type': 'async_function_definition', - 'reason': 'Python async function definition pattern' - } - ]) - elif symbol_type in ['class', 'identifier']: - patterns.append({ - 'pattern': rf'^\s*class\s+{escaped_name}\s*[:(]', - 'type': 'class_definition', - 'reason': 'Python class definition pattern' - }) - - elif language in ['javascript', 'typescript']: - if symbol_type == 'function': - patterns.extend([ - { - 'pattern': rf'\bfunction\s+{escaped_name}\s*\(', - 'type': 'function_declaration', - 'reason': 'JavaScript function declaration' - }, - { - 'pattern': rf'\b{escaped_name}\s*=\s*function', - 'type': 'function_expression', - 'reason': 'JavaScript function expression' - }, - { - 'pattern': rf'\b{escaped_name}\s*=\s*\([^)]*\)\s*=>', - 'type': 'arrow_function', - 'reason': 'JavaScript arrow function' - } - ]) - elif symbol_type in ['class', 'identifier']: - patterns.append({ - 'pattern': rf'\bclass\s+{escaped_name}\s*\{{', - 'type': 'class_declaration', - 'reason': 'JavaScript class declaration' - }) - - elif language == 'zig': - patterns.extend([ - { - 'pattern': rf'\bfn\s+{escaped_name}\s*\(', - 'type': 'function_definition', - 'reason': 'Zig function definition' - }, - { - 'pattern': rf'\bconst\s+{escaped_name}\s*=', - 'type': 'const_declaration', - 'reason': 'Zig constant declaration' - } - ]) - - elif language == 'java': - patterns.extend([ - { - 'pattern': rf'\b(public|private|protected)?\s*(static)?\s*\w+\s+{escaped_name}\s*\(', - 'type': 'method_definition', - 'reason': 'Java method definition' - }, - { - 'pattern': rf'\b(public|private|protected)?\s*class\s+{escaped_name}\s*\{{', - 'type': 'class_definition', - 'reason': 'Java class definition' - } - ]) - - return patterns - - def get_supported_languages(self) -> List[str]: - """ - Get list of languages supported by this strategy. - - Returns: - List of supported language names - """ - return list(self._languages.keys()) - - def get_ast_info( - self, - source_text: str, - language: str, - symbol_name: str - ) -> Dict[str, Any]: - """ - Get detailed AST information for a symbol. - - Args: - source_text: Source code text - language: Programming language - symbol_name: Name of the symbol to analyze - - Returns: - Dictionary with AST analysis information - """ - info = { - 'language': language, - 'symbol_name': symbol_name, - 'tree_sitter_available': TREE_SITTER_AVAILABLE, - 'language_supported': language in self._languages, - 'patterns_found': [], - 'potential_matches': 0 - } - - if language in self._languages: - # Get all potential patterns for this symbol - symbol_info = {'name': symbol_name, 'type': 'identifier'} - patterns = self._get_ast_patterns(language, 'identifier', symbol_name) - - for pattern_info in patterns: - matches = re.finditer(pattern_info['pattern'], source_text, re.MULTILINE) - for match in matches: - line_num = source_text[:match.start()].count('\n') + 1 - info['patterns_found'].append({ - 'type': pattern_info['type'], - 'line': line_num, - 'text': match.group()[:50], - 'reason': pattern_info['reason'] - }) - info['potential_matches'] += 1 - - return info \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/relationship_info.py b/src/code_index_mcp/tools/scip/relationship_info.py deleted file mode 100644 index ed640b0..0000000 --- a/src/code_index_mcp/tools/scip/relationship_info.py +++ /dev/null @@ -1,611 +0,0 @@ -""" -Relationship Information - New unified relationship data structures - -This module defines the new relationship data structures for enhanced -symbol relationship analysis with complete SCIP standard support. -""" - -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Any -from enum import Enum - - -class RelationshipType(Enum): - """Unified relationship types for all programming languages""" - - # Function relationships - FUNCTION_CALL = "function_call" - METHOD_CALL = "method_call" - - # Type relationships - INHERITANCE = "inheritance" - INTERFACE_IMPLEMENTATION = "interface_implementation" - TYPE_REFERENCE = "type_reference" - - # Variable relationships - VARIABLE_REFERENCE = "variable_reference" - VARIABLE_ASSIGNMENT = "variable_assignment" - - # Module relationships - MODULE_IMPORT = "module_import" - MODULE_EXPORT = "module_export" - - # Generic relationships (fallback) - REFERENCE = "reference" - DEFINITION = "definition" - - -@dataclass -class RelationshipInfo: - """Complete information about a single relationship""" - - target: str # Target symbol name - target_symbol_id: str # Complete SCIP symbol ID - relationship_type: RelationshipType # Type of relationship - source: Optional[str] = None # Source symbol name (for reverse relationships) - source_symbol_id: Optional[str] = None # Source symbol ID (for reverse relationships) - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary format for JSON output""" - result = { - "target": self.target, - "target_symbol_id": self.target_symbol_id, - "relationship_type": self.relationship_type.value - } - - if self.source: - result["source"] = self.source - if self.source_symbol_id: - result["source_symbol_id"] = self.source_symbol_id - - return result - - -@dataclass -class SymbolRelationships: - """Container for all relationships of a symbol""" - - # Active relationships (this symbol to others) - calls: List[RelationshipInfo] = field(default_factory=list) - inherits_from: List[RelationshipInfo] = field(default_factory=list) - implements: List[RelationshipInfo] = field(default_factory=list) - references: List[RelationshipInfo] = field(default_factory=list) - - # Passive relationships (others to this symbol) - called_by: List[RelationshipInfo] = field(default_factory=list) - inherited_by: List[RelationshipInfo] = field(default_factory=list) - implemented_by: List[RelationshipInfo] = field(default_factory=list) - referenced_by: List[RelationshipInfo] = field(default_factory=list) - - def add_relationship(self, relationship: RelationshipInfo, is_reverse: bool = False): - """Add a relationship to the appropriate category with deduplication""" - rel_type = relationship.relationship_type - - if is_reverse: - # This is a reverse relationship (others -> this symbol) - if rel_type in [RelationshipType.FUNCTION_CALL, RelationshipType.METHOD_CALL]: - self._add_unique_relationship(self.called_by, relationship) - elif rel_type == RelationshipType.INHERITANCE: - self._add_unique_relationship(self.inherited_by, relationship) - elif rel_type == RelationshipType.INTERFACE_IMPLEMENTATION: - self._add_unique_relationship(self.implemented_by, relationship) - else: - self._add_unique_relationship(self.referenced_by, relationship) - else: - # This is a forward relationship (this symbol -> others) - if rel_type in [RelationshipType.FUNCTION_CALL, RelationshipType.METHOD_CALL]: - self._add_unique_relationship(self.calls, relationship) - elif rel_type == RelationshipType.INHERITANCE: - self._add_unique_relationship(self.inherits_from, relationship) - elif rel_type == RelationshipType.INTERFACE_IMPLEMENTATION: - self._add_unique_relationship(self.implements, relationship) - else: - self._add_unique_relationship(self.references, relationship) - - def _add_unique_relationship(self, relationship_list: List[RelationshipInfo], new_relationship: RelationshipInfo): - """Add relationship only if it doesn't already exist""" - for existing in relationship_list: - if (existing.target_symbol_id == new_relationship.target_symbol_id and - existing.relationship_type == new_relationship.relationship_type): - return # Skip duplicate - relationship_list.append(new_relationship) - - def get_total_count(self) -> int: - """Get total number of relationships""" - return (len(self.calls) + len(self.called_by) + - len(self.inherits_from) + len(self.inherited_by) + - len(self.implements) + len(self.implemented_by) + - len(self.references) + len(self.referenced_by)) - - def to_dict(self) -> Dict[str, List[Dict[str, Any]]]: - """Convert to dictionary format for JSON output - simplified for token efficiency""" - result = {} - - # Only include called_by relationships - if self.called_by: - result["called_by"] = [rel.to_dict() for rel in self.called_by] - - return result - - -@dataclass -class RelationshipsSummary: - """Summary statistics for all relationships in a file""" - - total_relationships: int - by_type: Dict[str, int] - cross_file_relationships: int - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary format for JSON output""" - return { - "total_relationships": self.total_relationships, - "by_type": self.by_type, - "cross_file_relationships": self.cross_file_relationships - } - - -class SCIPRelationshipReader: - """Reads and parses relationships from SCIP index""" - - def __init__(self): - """Initialize the relationship reader""" - self._symbol_kinds = {} # symbol_id -> SymbolKind mapping - - def extract_relationships_from_document(self, document, scip_index=None) -> Dict[str, SymbolRelationships]: - """ - Enhanced relationship extraction from both symbol.relationships and occurrences. - - This dual-source approach dramatically improves relationship coverage: - - symbol.relationships: Explicit relationships (inheritance, implements) - - occurrences: Implicit relationships (function calls, references) - - Cross-document analysis: Enables called_by relationships across files - - Args: - document: SCIP document containing symbols and relationships - scip_index: Optional full SCIP index for cross-document analysis - - Returns: - Dictionary mapping symbol_id -> SymbolRelationships - """ - all_relationships = {} - - # Step 0: Build global symbol registry for cross-document analysis - self._build_global_symbol_registry(document, scip_index) - - # Step 1: Extract from explicit symbol relationships (existing logic) - self._extract_from_symbol_relationships(document, all_relationships) - - # Step 2: Extract from occurrences with cross-document support - self._extract_from_occurrences(document, all_relationships, scip_index) - - # Step 3: Build reverse relationships with cross-document support - self._build_reverse_relationships(all_relationships, document, scip_index) - - return all_relationships - - def _build_global_symbol_registry(self, document, scip_index=None): - """Build comprehensive symbol registry supporting cross-document analysis.""" - # Clear previous state - self._symbol_kinds.clear() - - # Build registry from current document - self._add_document_to_registry(document) - - # If full index provided, build global registry for cross-document analysis - if scip_index: - for doc in scip_index.documents: - if doc != document: # Avoid duplicate processing - self._add_document_to_registry(doc) - - def _add_document_to_registry(self, document): - """Add document symbols to the global registry.""" - for symbol_info in document.symbols: - symbol_id = symbol_info.symbol - self._symbol_kinds[symbol_id] = symbol_info.kind - - # For function symbols, also map the occurrence format (without ().suffix) - if symbol_info.kind == 11: # SymbolKind.Function - if symbol_id.endswith('().'): - base_id = symbol_id[:-3] # Remove '().' - self._symbol_kinds[base_id] = symbol_info.kind - - def _extract_from_symbol_relationships(self, document, all_relationships: Dict[str, SymbolRelationships]): - """ - Extract relationships from explicit symbol.relationships (original logic). - - Args: - document: SCIP document - all_relationships: Dictionary to populate with relationships - """ - for symbol_info in document.symbols: - symbol_id = symbol_info.symbol - symbol_name = symbol_info.display_name - - if not symbol_info.relationships: - continue - - # Create or get existing relationships container - if symbol_id not in all_relationships: - all_relationships[symbol_id] = SymbolRelationships() - - symbol_rels = all_relationships[symbol_id] - - # Process each explicit relationship - for scip_relationship in symbol_info.relationships: - rel_info = self._parse_scip_relationship( - scip_relationship, symbol_name, symbol_id, document - ) - if rel_info: - symbol_rels.add_relationship(rel_info) - - def _extract_from_occurrences(self, document, all_relationships: Dict[str, SymbolRelationships], scip_index=None): - """ - Extract relationships from document occurrences (major new functionality). - - This extracts the majority of missing relationships, especially function calls. - - Args: - document: SCIP document containing occurrences - all_relationships: Dictionary to populate with relationships - """ - # Process each occurrence to find relationships - for occurrence in document.occurrences: - try: - # Skip if no symbol or range information - if not occurrence.symbol or not hasattr(occurrence, 'range'): - continue - - target_symbol_id = occurrence.symbol - roles = getattr(occurrence, 'symbol_roles', 0) - - # Skip definitions and imports - these aren't "uses" of other symbols - if roles & 1: # Definition role - skip - continue - if roles & 2: # Import role - skip - continue - - # Find which symbol contains this occurrence (context analysis) - source_symbol_id = self._find_containing_symbol(occurrence, document) - if not source_symbol_id or source_symbol_id == target_symbol_id: - continue # Self-reference or no container found - - # Determine relationship type based on roles and symbol characteristics - rel_type = self._determine_occurrence_relationship_type(roles, target_symbol_id, source_symbol_id) - if not rel_type: - continue - - - # Create relationship info - rel_info = RelationshipInfo( - target=self._extract_symbol_name(target_symbol_id), - target_symbol_id=target_symbol_id, - relationship_type=rel_type - ) - - # Add to source symbol's relationships - if source_symbol_id not in all_relationships: - all_relationships[source_symbol_id] = SymbolRelationships() - - all_relationships[source_symbol_id].add_relationship(rel_info) - - # For function calls, also create reverse "called_by" relationship - # This is the key to cross-document relationship building - if (rel_type == RelationshipType.FUNCTION_CALL or rel_type == RelationshipType.METHOD_CALL): - self._add_cross_document_called_by( - all_relationships, target_symbol_id, source_symbol_id, scip_index - ) - - except Exception as e: - # Log but continue processing other occurrences - continue - - def _find_containing_symbol(self, occurrence, document) -> Optional[str]: - """ - Find which symbol definition contains this occurrence. - - This is crucial for establishing "X calls Y" relationships. - """ - if not hasattr(occurrence, 'range') or not occurrence.range: - return None - - try: - occ_line = occurrence.range.start[0] if occurrence.range.start else 0 - except (AttributeError, IndexError): - return None - - # Find symbol definitions that could contain this occurrence - containing_symbols = [] - - for other_occurrence in document.occurrences: - try: - # Only consider definitions - roles = getattr(other_occurrence, 'symbol_roles', 0) - if not (roles & 1): # Must be definition - continue - - if not hasattr(other_occurrence, 'range') or not other_occurrence.range: - continue - - def_line = other_occurrence.range.start[0] if other_occurrence.range.start else 0 - - # Simple heuristic: find the closest preceding definition - if def_line <= occ_line: - containing_symbols.append((other_occurrence.symbol, def_line)) - - except Exception: - continue - - # Return the symbol with the closest line number to the occurrence - if containing_symbols: - containing_symbols.sort(key=lambda x: x[1], reverse=True) # Closest first - return containing_symbols[0][0] - - # If no containing symbol found, use file-level context for cross-file relationships - # This handles cases like run.py calling server.py functions - if hasattr(document, 'relative_path') and document.relative_path: - file_name = document.relative_path.replace('\\', '/').split('/')[-1] - return f"local file:{file_name}" - - return None - - def _determine_occurrence_relationship_type(self, roles: int, target_symbol_id: str, - source_symbol_id: str) -> Optional[RelationshipType]: - """ - Determine relationship type from occurrence roles and symbol characteristics. - - Args: - roles: SCIP symbol roles (bit flags) - target_symbol_id: Symbol being referenced - source_symbol_id: Symbol doing the referencing - - Returns: - RelationshipType or None if not a relevant relationship - """ - # Write access (assignment/modification) - if roles & 4: # Write role - return RelationshipType.VARIABLE_ASSIGNMENT - - # Read access - determine specific type - if roles == 0 or roles & 8: # Read role or unspecified - if self._is_function_symbol(target_symbol_id): - return RelationshipType.FUNCTION_CALL if not self._is_method_symbol(target_symbol_id) else RelationshipType.METHOD_CALL - elif self._is_class_symbol(target_symbol_id): - return RelationshipType.TYPE_REFERENCE - else: - return RelationshipType.VARIABLE_REFERENCE - - # Type role - if roles & 64: # Type role - return RelationshipType.TYPE_REFERENCE - - # Default to generic reference - return RelationshipType.REFERENCE - - def _is_function_symbol(self, symbol_id: str) -> bool: - """Check if symbol represents a function using SymbolKind.""" - # Check our symbol kinds cache - symbol_kind = self._symbol_kinds.get(symbol_id) - return symbol_kind == 11 # SymbolKind.Function - - def _is_method_symbol(self, symbol_id: str) -> bool: - """Check if symbol represents a method (function within a class).""" - return '#' in symbol_id and self._is_function_symbol(symbol_id) - - def _is_class_symbol(self, symbol_id: str) -> bool: - """Check if symbol represents a class using SymbolKind.""" - # Check our symbol kinds cache - symbol_kind = self._symbol_kinds.get(symbol_id) - return symbol_kind == 3 # SymbolKind.Class - - - def _parse_scip_relationship(self, scip_relationship, source_name: str, - source_symbol_id: str, document) -> Optional[RelationshipInfo]: - """ - Parse a single SCIP relationship into RelationshipInfo - - Args: - scip_relationship: SCIP Relationship object - source_name: Name of the source symbol - source_symbol_id: SCIP ID of the source symbol - document: SCIP document for context - - Returns: - RelationshipInfo object or None if parsing fails - """ - target_symbol_id = scip_relationship.symbol - - # Extract target symbol name from symbol ID - target_name = self._extract_symbol_name(target_symbol_id) - - # Determine relationship type from SCIP flags - rel_type = self._determine_relationship_type(scip_relationship, target_symbol_id) - - - return RelationshipInfo( - target=target_name, - target_symbol_id=target_symbol_id, - relationship_type=rel_type - ) - - def _determine_relationship_type(self, scip_relationship, target_symbol_id: str) -> RelationshipType: - """Determine the relationship type from SCIP flags and symbol ID""" - - # Check SCIP relationship flags - if scip_relationship.is_implementation: - return RelationshipType.INTERFACE_IMPLEMENTATION - elif scip_relationship.is_type_definition: - return RelationshipType.TYPE_REFERENCE - elif scip_relationship.is_definition: - return RelationshipType.DEFINITION - elif scip_relationship.is_reference: - # Need to determine if it's inheritance, call, or reference - if target_symbol_id.endswith("#"): - # Class symbol - could be inheritance or type reference - return RelationshipType.INHERITANCE # Assume inheritance for now - elif target_symbol_id.endswith("()."): - # Function symbol - function call - return RelationshipType.FUNCTION_CALL - else: - # Generic reference - return RelationshipType.REFERENCE - else: - # Fallback - return RelationshipType.REFERENCE - - def _extract_symbol_name(self, symbol_id: str) -> str: - """Extract the symbol name from SCIP symbol ID""" - try: - # Handle file-level symbols - if symbol_id.startswith("local file:"): - return symbol_id[11:] # Remove "local file:" prefix - - # SCIP symbol format: scip- / - if "/" in symbol_id: - symbol_part = symbol_id.split("/")[-1] - # Remove descriptor suffix (like #, ()., etc.) - if symbol_part.endswith("#"): - return symbol_part[:-1] - elif symbol_part.endswith("()."): - return symbol_part[:-3] - else: - return symbol_part - return symbol_id - except: - return symbol_id - - - def _add_cross_document_called_by(self, all_relationships: Dict[str, SymbolRelationships], - target_symbol_id: str, source_symbol_id: str, - scip_index=None): - """ - Add cross-document called_by relationship. - - This creates the reverse relationship that enables cross-file function call tracking. - For example, when run.py calls server.main(), we add main as called_by run. - - Args: - all_relationships: Current document's relationships - target_symbol_id: Function being called (e.g., 'local main') - source_symbol_id: Function making the call (e.g., 'local ') - scip_index: Full SCIP index for cross-document lookup - """ - # Find the definition format symbol ID for the target function - definition_symbol_id = self._find_definition_symbol_id(target_symbol_id, scip_index) - if not definition_symbol_id: - return - - # Create called_by relationship - source_name = self._extract_symbol_name(source_symbol_id) - called_by_rel = RelationshipInfo( - target=source_name, - target_symbol_id=source_symbol_id, - relationship_type=RelationshipType.FUNCTION_CALL - ) - - # Add to target function's called_by relationships (with deduplication) - if definition_symbol_id not in all_relationships: - all_relationships[definition_symbol_id] = SymbolRelationships() - - # Check if this called_by relationship already exists to avoid duplicates - existing_called_by = all_relationships[definition_symbol_id].called_by - for existing_rel in existing_called_by: - if (existing_rel.target_symbol_id == called_by_rel.target_symbol_id and - existing_rel.relationship_type == called_by_rel.relationship_type): - return # Skip duplicate - - all_relationships[definition_symbol_id].called_by.append(called_by_rel) - - def _find_definition_symbol_id(self, occurrence_symbol_id: str, scip_index=None) -> Optional[str]: - """ - Find the definition format symbol ID from occurrence format. - - SCIP uses different formats: - - Occurrences: 'local main' - - Definitions: 'local main().' - - This method maps from occurrence to definition format using SymbolKind. - """ - if not scip_index: - return None - - # If already in definition format, return as-is - if occurrence_symbol_id.endswith('().'): - return occurrence_symbol_id - - # Search all documents for function symbol with this base name - for doc in scip_index.documents: - for symbol_info in doc.symbols: - if symbol_info.kind == 11: # SymbolKind.Function - symbol_id = symbol_info.symbol - if symbol_id.endswith('().'): - # Extract base name from definition format - base_name = symbol_id[:-3] # Remove '().' - if base_name == occurrence_symbol_id: - return symbol_id - - return None - - def _build_reverse_relationships(self, all_relationships: Dict[str, SymbolRelationships], - document, scip_index=None): - """Build reverse relationships (called_by, inherited_by, etc.) with cross-document support""" - - # Create a comprehensive mapping of all symbols for reverse lookup - symbol_names = {} - - # Add symbols from current document - for symbol_info in document.symbols: - symbol_names[symbol_info.symbol] = symbol_info.display_name - - # Add symbols from all other documents if full index provided - if scip_index: - for doc in scip_index.documents: - if doc != document: # Avoid duplicate processing - for symbol_info in doc.symbols: - if symbol_info.symbol not in symbol_names: # Avoid overriding - symbol_names[symbol_info.symbol] = symbol_info.display_name - - # Build reverse relationships (iterate over a copy to avoid modification during iteration) - for source_symbol_id, source_rels in list(all_relationships.items()): - source_name = symbol_names.get(source_symbol_id, "unknown") - - # Process each forward relationship to create reverse relationships - for rel in source_rels.calls: - self._add_reverse_relationship( - all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id - ) - - for rel in source_rels.inherits_from: - self._add_reverse_relationship( - all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id - ) - - for rel in source_rels.implements: - self._add_reverse_relationship( - all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id - ) - - for rel in source_rels.references: - self._add_reverse_relationship( - all_relationships, rel.target_symbol_id, rel, source_name, source_symbol_id - ) - - def _add_reverse_relationship(self, all_relationships: Dict[str, SymbolRelationships], - target_symbol_id: str, original_rel: RelationshipInfo, - source_name: str, source_symbol_id: str): - """Add a reverse relationship to the target symbol""" - - if target_symbol_id not in all_relationships: - all_relationships[target_symbol_id] = SymbolRelationships() - - # Create reverse relationship - reverse_rel = RelationshipInfo( - target=source_name, - target_symbol_id=source_symbol_id, - relationship_type=original_rel.relationship_type, - source=original_rel.target, - source_symbol_id=original_rel.target_symbol_id - ) - - # Add as reverse relationship - all_relationships[target_symbol_id].add_relationship(reverse_rel, is_reverse=True) \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/scip_index_tool.py b/src/code_index_mcp/tools/scip/scip_index_tool.py deleted file mode 100644 index f9e0c9c..0000000 --- a/src/code_index_mcp/tools/scip/scip_index_tool.py +++ /dev/null @@ -1,230 +0,0 @@ -""" -SCIP Index Tool - Pure technical component for SCIP index operations. - -This tool handles low-level SCIP index operations without any business logic. -It provides technical capabilities that can be composed by business services. -""" - -from typing import Optional, List -from dataclasses import dataclass -from pathlib import Path -import logging -from ...scip.proto.scip_pb2 import Index as SCIPIndex -from ...indexing.scip_builder import SCIPIndexBuilder - -logger = logging.getLogger(__name__) - -# Import FileInfo from the central location to avoid duplication -from ...indexing.index_provider import FileInfo - - -class SCIPIndexTool: - """ - Pure technical component for SCIP index operations. - - This tool provides low-level SCIP index capabilities without any - business logic or decision making. It's designed to be composed - by business services to achieve business goals. - """ - - def __init__(self): - self._scip_index: Optional[SCIPIndex] = None - self._builder = SCIPIndexBuilder() - self._project_path: Optional[str] = None - self._settings = None # Will be set when needed - - def is_index_available(self) -> bool: - """ - Check if SCIP index is available and ready for use. - - Returns: - True if index is available, False otherwise - """ - return self._scip_index is not None and len(self._scip_index.documents) > 0 - - def build_index(self, project_path: str) -> int: - """ - Build SCIP index for the specified project path. - - This is a pure technical operation that unconditionally rebuilds the index. - Business logic for deciding when to rebuild should be handled by the caller. - - Args: - project_path: Absolute path to the project directory - - Returns: - Number of files indexed - - Raises: - ValueError: If project path is invalid - RuntimeError: If index building fails - """ - if not Path(project_path).exists(): - logger.error(f"SCIP INDEX: Project path does not exist: {project_path}") - raise ValueError(f"Project path does not exist: {project_path}") - - # Build new index (pure technical operation) - try: - logger.info(f"Building index for {project_path}") - self._project_path = project_path - - # Initialize settings for this project - from ...project_settings import ProjectSettings - self._settings = ProjectSettings(project_path, skip_load=False) - - self._scip_index = self._builder.build_scip_index(project_path) - logger.info(f"Built index with {len(self._scip_index.documents)} files") - - return len(self._scip_index.documents) - except Exception as e: - logger.error(f"Failed to build index: {e}") - raise RuntimeError(f"Failed to build SCIP index: {e}") from e - - def save_index(self) -> bool: - """ - Save the current SCIP index to disk. - - This is a pure technical operation that saves the current in-memory index. - - Returns: - True if saved successfully, False otherwise - """ - try: - if self._settings is None: - logger.error("No settings available, cannot save index") - return False - - if self._scip_index is None: - logger.error("No index available to save") - return False - - self.save_current_index() - logger.info("Index saved successfully") - return True - except Exception as e: - logger.error(f"Failed to save index: {e}") - return False - - def get_file_list(self) -> List[FileInfo]: - """ - Get list of all indexed files. - - Returns: - List of FileInfo objects for all indexed files - - Raises: - RuntimeError: If index is not available - """ - if not self.is_index_available(): - raise RuntimeError("SCIP index is not available. Call build_index() first.") - - files = [] - for document in self._scip_index.documents: - file_info = FileInfo( - relative_path=document.relative_path, - language=document.language, - absolute_path=str(Path(self._project_path) / document.relative_path) if self._project_path else "" - ) - files.append(file_info) - - return files - - def get_file_count(self) -> int: - """ - Get the number of indexed files. - - Returns: - Number of files in the index - - Raises: - RuntimeError: If index is not available - """ - if not self.is_index_available(): - raise RuntimeError("SCIP index is not available") - - return len(self._scip_index.documents) - - def get_project_metadata(self) -> dict: - """ - Get project metadata from SCIP index. - - Returns: - Dictionary containing project metadata - - Raises: - RuntimeError: If index is not available - """ - if not self.is_index_available(): - raise RuntimeError("SCIP index is not available") - - return { - 'project_root': self._scip_index.metadata.project_root, - 'total_files': len(self._scip_index.documents), - 'tool_version': self._scip_index.metadata.tool_info.version, - 'languages': list(set(doc.language for doc in self._scip_index.documents)) - } - - def load_existing_index(self, project_path: str) -> bool: - """ - Try to load existing SCIP index from disk. - - Args: - project_path: Absolute path to the project directory - - Returns: - True if loaded successfully, False if no index exists or load failed - """ - try: - from ...project_settings import ProjectSettings - - self._project_path = project_path - settings = ProjectSettings(project_path, skip_load=False) - self._settings = settings - - # Try to load existing SCIP index - scip_index = settings.load_scip_index() - if scip_index is not None: - self._scip_index = scip_index - return True - else: - return False - - except Exception as e: - return False - - def save_current_index(self) -> bool: - """ - Save the current SCIP index to disk. - - Returns: - True if saved successfully, False otherwise - """ - if self._scip_index is None: - return False - - if self._settings is None: - return False - - try: - self._settings.save_scip_index(self._scip_index) - return True - except Exception: - return False - - def clear_index(self) -> None: - """Clear the current SCIP index.""" - self._scip_index = None - self._project_path = None - # Keep settings for potential reload - - def get_raw_index(self) -> Optional[SCIPIndex]: - """ - Get the raw SCIP index for advanced operations. - - Note: This should only be used by other technical tools, - not by business services. - - Returns: - Raw SCIP index or None if not available - """ - return self._scip_index diff --git a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py b/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py deleted file mode 100644 index d357dc3..0000000 --- a/src/code_index_mcp/tools/scip/scip_symbol_analyzer.py +++ /dev/null @@ -1,1565 +0,0 @@ -""" -SCIP Symbol Analyzer - Enhanced symbol analysis for accurate code intelligence - -This module provides the main SCIPSymbolAnalyzer class that replaces the legacy -SCIPQueryTool with accurate symbol location detection, proper type classification, -and comprehensive call relationship analysis. -""" - -import os -import logging -from typing import Dict, List, Optional, Any, Set -from functools import lru_cache - -from .symbol_definitions import ( - SymbolDefinition, FileAnalysis, ImportGroup, LocationInfo, - SymbolLocationError, SymbolResolutionError -) -# Removed SCIPRelationshipReader - relationships now read directly from SCIP index -from ...scip.core.symbol_manager import SCIPSymbolManager -from .relationship_info import SymbolRelationships, RelationshipInfo, RelationshipType - -logger = logging.getLogger(__name__) - -# Try to import SCIP protobuf definitions -try: - from ...scip.proto import scip_pb2 - SCIP_PROTO_AVAILABLE = True -except ImportError: - scip_pb2 = None - SCIP_PROTO_AVAILABLE = False - logger.warning("SCIP protobuf definitions not available") - - -class SCIPSymbolAnalyzer: - """ - Enhanced SCIP symbol analyzer with accurate position detection and call relationships. - - This class replaces the legacy SCIPQueryTool and provides: - - Accurate symbol location extraction from SCIP Range data - - Proper symbol type classification using SCIP SymbolKind enum - - Comprehensive call relationship analysis - - Cross-file symbol resolution - - LLM-optimized output formatting - """ - - def __init__(self): - """Initialize the symbol analyzer.""" - self._symbol_kind_cache: Dict[int, str] = {} - self._scip_symbol_cache: Dict[str, Dict[str, Any]] = {} - self._symbol_parser: Optional[SCIPSymbolManager] = None - # Removed relationship reader - relationships now read directly from SCIP index - - # Initialize SCIP symbol kind mapping - self._init_symbol_kind_mapping() - - def _init_symbol_kind_mapping(self): - """Initialize SCIP SymbolKind enum mapping.""" - if not SCIP_PROTO_AVAILABLE: - # Fallback numeric mapping when protobuf not available - self._symbol_kind_map = { - 3: 'class', # CLASS - 11: 'function', # FUNCTION - 14: 'method', # METHOD - 29: 'variable', # VARIABLE - 4: 'constant', # CONSTANT - 6: 'enum', # ENUM - 7: 'enum_member', # ENUM_MEMBER - 9: 'field', # FIELD - 23: 'property', # PROPERTY - 5: 'constructor', # CONSTRUCTOR - 15: 'module', # MODULE - 16: 'namespace', # NAMESPACE - 12: 'interface', # INTERFACE - 25: 'struct', # STRUCT - 33: 'trait', # TRAIT - 35: 'macro', # MACRO - } - else: - # Use actual protobuf enum when available - self._symbol_kind_map = {} - # Will be populated dynamically using scip_pb2.SymbolKind.Name() - - def analyze_file(self, file_path: str, scip_index) -> FileAnalysis: - """ - Main entry point for file analysis. - - Args: - file_path: Relative path to the file to analyze - scip_index: SCIP index containing all project data - - Returns: - FileAnalysis object with complete symbol information - - Raises: - ValueError: If file not found or analysis fails - """ - try: - logger.debug(f"Starting analysis for file: {file_path}") - - # Initialize symbol parser from index metadata (for scip-* symbol parsing) - try: - project_root = getattr(getattr(scip_index, 'metadata', None), 'project_root', '') or '' - if project_root: - self._symbol_parser = SCIPSymbolManager(project_root) - except Exception: - self._symbol_parser = None - - # Step 1: Find the document in SCIP index - document = self._find_document(file_path, scip_index) - if not document: - logger.warning(f"Document not found in SCIP index: {file_path}") - return self._create_empty_analysis(file_path) - - logger.debug(f"Found document with {len(document.symbols)} symbols") - - # Step 2: Extract all symbols with accurate metadata - symbols = self._extract_all_symbols(document) - logger.debug(f"Extracted {len(symbols)} symbols") - - # Step 3: Extract call relationships - self._extract_call_relationships(document, symbols, scip_index) - logger.debug("Completed call relationship extraction") - - # Step 4: Organize results into final structure - result = self._organize_results(document, symbols, scip_index) - logger.debug(f"Analysis complete: {len(result.functions)} functions, {len(result.classes)} classes") - - return result - - except Exception as e: - logger.error(f"Failed to analyze file {file_path}: {e}") - # Return partial analysis rather than failing completely - return self._create_error_analysis(file_path, str(e)) - - def _find_document(self, file_path: str, scip_index) -> Optional[Any]: - """ - Find the SCIP document for the given file path. - - Args: - file_path: File path to search for - scip_index: SCIP index object - - Returns: - SCIP document or None if not found - """ - if not hasattr(scip_index, 'documents'): - logger.error("Invalid SCIP index: missing documents attribute") - return None - - # Normalize path for comparison - normalized_target = self._normalize_path(file_path) - - # Try exact match first - for document in scip_index.documents: - if self._normalize_path(document.relative_path) == normalized_target: - return document - - # Try case-insensitive match - normalized_lower = normalized_target.lower() - for document in scip_index.documents: - if self._normalize_path(document.relative_path).lower() == normalized_lower: - logger.debug(f"Found case-insensitive match for {file_path}") - return document - - return None - - def _normalize_path(self, path: str) -> str: - """Normalize file path for consistent comparison.""" - return path.replace('\\', '/').lstrip('./') - - def _extract_all_symbols(self, document) -> Dict[str, SymbolDefinition]: - """ - Extract all symbols from the document in a single pass. - - Args: - document: SCIP document object - - Returns: - Dictionary mapping SCIP symbols to SymbolDefinition objects - """ - symbols = {} - - for symbol_info in document.symbols: - try: - # Extract basic symbol information - scip_symbol = symbol_info.symbol - display_name = getattr(symbol_info, 'display_name', '') - symbol_kind = getattr(symbol_info, 'kind', 0) - - # Parse symbol name and classification - parsed_name, class_name = self._parse_symbol_identity(scip_symbol, display_name) - if not parsed_name: - continue - - # Get symbol type from SCIP kind - symbol_type = self._classify_symbol_type(symbol_kind, scip_symbol) - - # Extract precise location - # Extract location (never fails now) - location = self._extract_precise_location(scip_symbol, document) - - # Debug: Check location type - if not isinstance(location, LocationInfo): - logger.error(f"Location extraction returned wrong type: {type(location)} for symbol {scip_symbol}") - location = LocationInfo(line=1, column=1) # Fallback - - # Create symbol definition - symbol_def = SymbolDefinition( - name=parsed_name, - line=location.line, - column=location.column, - symbol_type=symbol_type, - class_name=class_name, - scip_symbol=scip_symbol - ) - - # Extract additional metadata - self._enrich_symbol_metadata(symbol_def, symbol_info, document) - - symbols[scip_symbol] = symbol_def - logger.debug(f"Processed symbol: {parsed_name} ({symbol_type}) at {location.line}:{location.column}") - - except Exception as e: - logger.warning(f"Failed to process symbol {getattr(symbol_info, 'symbol', 'unknown')}: {e}") - continue - - return symbols - - def _parse_symbol_identity(self, scip_symbol: str, display_name: str = '') -> tuple[str, Optional[str]]: - """ - Parse symbol name and class ownership from SCIP symbol string. - - Args: - scip_symbol: SCIP symbol identifier - display_name: Display name from symbol info - - Returns: - Tuple of (symbol_name, class_name) - """ - # Use display name if available and meaningful - if display_name and not display_name.startswith('__'): - name = display_name - else: - # Extract from SCIP symbol - name = self._extract_name_from_scip_symbol(scip_symbol) - - # Extract class name if this is a class member - class_name = self._extract_class_name(scip_symbol) - - return name, class_name - - @lru_cache(maxsize=500) - def _extract_name_from_scip_symbol(self, scip_symbol: str) -> str: - """Extract clean, human-readable symbol name from SCIP symbol identifier.""" - try: - if scip_symbol.startswith('local:'): - # local:src.module.Class#method_name(). - symbol_path = scip_symbol[6:] # Remove 'local:' prefix - - if '#' in symbol_path: - # Method or field: extract after '#' - method_part = symbol_path.split('#')[-1] - return self._clean_symbol_name(method_part) - else: - # Class or top-level function: extract last part - class_part = symbol_path.split('.')[-1] - return self._clean_symbol_name(class_part) - - elif scip_symbol.startswith('external:'): - # external:module.path/ClassName#method_name(). - if '/' in scip_symbol: - after_slash = scip_symbol.split('/')[-1] - if '#' in after_slash: - method_part = after_slash.split('#')[-1] - return self._clean_symbol_name(method_part) - else: - return self._clean_symbol_name(after_slash) - else: - # Just module reference - module_part = scip_symbol[9:] # Remove 'external:' - return self._clean_symbol_name(module_part.split('.')[-1]) - - # Fallback: clean up whatever we have - return self._clean_symbol_name(scip_symbol.split('/')[-1].split('#')[-1]) - - except Exception as e: - logger.debug(f"Error extracting name from {scip_symbol}: {e}") - return "unknown" - - def _clean_symbol_name(self, raw_name: str) -> str: - """Clean symbol name for human readability.""" - # Remove common suffixes and prefixes - cleaned = raw_name.rstrip('().#') - - # Remove module path prefixes if present - if '.' in cleaned: - cleaned = cleaned.split('.')[-1] - - # Handle special cases - if not cleaned or cleaned.isdigit(): - return "unknown" - - return cleaned - - @lru_cache(maxsize=500) - def _extract_class_name(self, scip_symbol: str) -> Optional[str]: - """Extract clean class name if this symbol belongs to a class. - - Supports: - - Legacy local/external formats with '#': local:...Class#method / external:.../Class#method - - Current scip-* local format where descriptors encode path as - //(). - """ - try: - # Newer scip-* local symbols: parse descriptors path - if scip_symbol.startswith('scip-'): - parts = scip_symbol.split(' ', 4) - descriptors = parts[4] if len(parts) == 5 else (parts[3] if len(parts) >= 4 else '') - if descriptors: - components = [p for p in descriptors.split('/') if p] - if len(components) >= 2: - candidate = components[-2] - return candidate if candidate and not candidate.isdigit() else None - - if '#' not in scip_symbol: - return None - - if scip_symbol.startswith('local:'): - # local:src.module.ClassName#method - symbol_path = scip_symbol[6:] # Remove 'local:' - class_part = symbol_path.split('#')[0] - - # Extract just the class name (last part of module path) - if '.' in class_part: - class_name = class_part.split('.')[-1] - else: - class_name = class_part - - return class_name if class_name and not class_name.isdigit() else None - - elif scip_symbol.startswith('external:'): - # external:module/ClassName#method - if '/' in scip_symbol: - path_part = scip_symbol.split('/')[-1] - if '#' in path_part: - class_name = path_part.split('#')[0] - return class_name if class_name and not class_name.isdigit() else None - - except Exception as e: - logger.debug(f"Error extracting class name from {scip_symbol}: {e}") - - return None - - def _classify_symbol_type(self, scip_kind: int, scip_symbol: str) -> str: - """ - Classify symbol type using SCIP SymbolKind enum. - - Args: - scip_kind: SCIP SymbolKind enum value - scip_symbol: SCIP symbol string for additional context - - Returns: - Standardized symbol type string - """ - # Try to get cached result - if scip_kind in self._symbol_kind_cache: - base_type = self._symbol_kind_cache[scip_kind] - else: - base_type = self._get_scip_kind_name(scip_kind) - self._symbol_kind_cache[scip_kind] = base_type - - # Refine classification based on index symbol structure - if base_type == 'function': - # Legacy/colon formats use '#' - if '#' in scip_symbol: - return 'method' - # Current scip-* local descriptors path: //(). - if scip_symbol.startswith('scip-'): - parts = scip_symbol.split(' ', 4) - descriptors = parts[4] if len(parts) == 5 else (parts[3] if len(parts) >= 4 else '') - if descriptors: - components = [p for p in descriptors.split('/') if p] - if len(components) >= 2: - last_comp = components[-1] - if last_comp.endswith('().') or last_comp.endswith('()'): - return 'method' - - return base_type - - def _get_scip_kind_name(self, kind: int) -> str: - """Get symbol type name from SCIP SymbolKind.""" - if SCIP_PROTO_AVAILABLE: - try: - # Use protobuf enum name - enum_name = scip_pb2.SymbolKind.Name(kind) - return self._normalize_kind_name(enum_name) - except (ValueError, AttributeError): - pass - - # Fallback to numeric mapping - return self._symbol_kind_map.get(kind, 'unknown') - - def _normalize_kind_name(self, enum_name: str) -> str: - """Normalize SCIP enum name to standard type.""" - enum_name = enum_name.lower() - - # Map SCIP names to our standard names - if enum_name == 'class': - return 'class' - elif enum_name in ['function', 'func']: - return 'function' - elif enum_name == 'method': - return 'method' - elif enum_name in ['variable', 'var']: - return 'variable' - elif enum_name in ['constant', 'const']: - return 'constant' - elif enum_name == 'field': - return 'field' - elif enum_name == 'property': - return 'property' - else: - return enum_name - - def _extract_precise_location(self, scip_symbol: str, document) -> LocationInfo: - """ - Never-fail location extraction with intelligent fallbacks using SCIPSymbolManager. - - Args: - scip_symbol: SCIP symbol identifier - document: SCIP document containing occurrences - - Returns: - LocationInfo with best available location and confidence level - """ - # Layer 1: Standard SCIP occurrence location - location = self._find_definition_location(scip_symbol, document) - if location: - location.confidence = 'definition' - return location - - location = self._find_any_location(scip_symbol, document) - if location: - location.confidence = 'occurrence' - return location - - # Layer 2: SCIPSymbolManager-based symbol structure inference - if self._symbol_parser: - location = self._infer_location_from_symbol_structure(scip_symbol, document) - if location: - location.confidence = 'inferred' - return location - - # Layer 3: Symbol type-based default location - location = self._get_default_location_by_symbol_type(scip_symbol) - location.confidence = 'default' - return location - - def _find_definition_location(self, scip_symbol: str, document) -> Optional[LocationInfo]: - """Find the definition occurrence for a symbol.""" - for occurrence in document.occurrences: - if occurrence.symbol == scip_symbol and self._is_definition(occurrence): - location = self._parse_occurrence_location(occurrence) - if location: - return location - return None - - def _find_any_location(self, scip_symbol: str, document) -> Optional[LocationInfo]: - """Find any occurrence with location data for a symbol.""" - for occurrence in document.occurrences: - if occurrence.symbol == scip_symbol: - location = self._parse_occurrence_location(occurrence) - if location: - return location - return None - - def _is_definition(self, occurrence) -> bool: - """Check if an occurrence represents a definition.""" - if not hasattr(occurrence, 'symbol_roles'): - return False - - try: - if SCIP_PROTO_AVAILABLE: - return bool(occurrence.symbol_roles & scip_pb2.SymbolRole.Definition) - else: - # Fallback: Definition role = 1 - return bool(occurrence.symbol_roles & 1) - except (AttributeError, TypeError): - return False - - def _parse_occurrence_location(self, occurrence) -> Optional[LocationInfo]: - """Parse location information from SCIP occurrence.""" - try: - if not hasattr(occurrence, 'range') or not occurrence.range: - return None - - range_obj = occurrence.range - if not hasattr(range_obj, 'start') or not range_obj.start: - return None - - start = range_obj.start - if len(start) >= 2: - # SCIP uses 0-based indexing, convert to 1-based - line = start[0] + 1 - column = start[1] + 1 - return LocationInfo(line=line, column=column) - - except (AttributeError, IndexError, TypeError) as e: - logger.debug(f"Failed to parse occurrence location: {e}") - - return None - - def _enrich_symbol_metadata(self, symbol: SymbolDefinition, symbol_info, document): - """Enrich symbol with additional metadata from SCIP data.""" - # Extract documentation if available - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - # Could extract docstrings here if needed - pass - - # For functions/methods, extract parameter information - if symbol.is_callable(): - symbol.parameters = self._extract_function_parameters(symbol.scip_symbol, symbol_info, document) - symbol.return_type = self._extract_return_type(symbol.scip_symbol, symbol_info) - symbol.is_async = self._is_async_function(symbol.scip_symbol, symbol_info) - - # For classes, extract methods and attributes - elif symbol.symbol_type == 'class': - symbol.methods, symbol.attributes = self._extract_class_members(symbol.scip_symbol, document) - symbol.inherits_from = self._extract_inheritance(symbol.scip_symbol, symbol_info) - - # For variables, extract type and scope information - elif symbol.symbol_type == 'variable': - symbol.type = self._extract_variable_type(symbol.scip_symbol, symbol_info) - symbol.is_global = self._is_global_variable(symbol.scip_symbol, document) - - # For constants, extract value if available - elif symbol.symbol_type == 'constant': - symbol.value = self._extract_constant_value(symbol.scip_symbol, symbol_info) - - def _extract_call_relationships(self, document, symbols: Dict[str, SymbolDefinition], scip_index): - """ - Extract relationships from SCIP index and build correct called_by relationships. - - Args: - document: SCIP document containing symbols and relationships - symbols: Dictionary of extracted symbols - scip_index: Full SCIP index - """ - logger.debug("Building called_by relationships from SCIP index") - - # Step 1: Collect all call relationships from the document - call_relationships = [] # List of (caller_id, target_id) tuples - - for symbol_info in document.symbols: - caller_id = symbol_info.symbol - - # Process each relationship of this symbol - for scip_rel in symbol_info.relationships: - if scip_rel.is_reference: # This indicates a call/reference relationship - target_id = scip_rel.symbol - call_relationships.append((caller_id, target_id)) - - # Step 2: Build called_by relationships by reversing the direction - for caller_id, target_id in call_relationships: - # Find the target symbol and add the caller to its called_by list - if target_id in symbols: - target_symbol = symbols[target_id] - caller_name = self._extract_symbol_name(caller_id) - - # Create RelationshipInfo for called_by - rel_info = RelationshipInfo( - target=caller_name, - target_symbol_id=caller_id, - relationship_type=RelationshipType.FUNCTION_CALL - ) - - # Add to target symbol's called_by relationships with deduplication - target_symbol.relationships.add_relationship(rel_info, is_reverse=True) - - logger.debug(f"Relationship extraction completed for {len(symbols)} symbols") - - def _convert_scip_relationships(self, scip_relationships, document): - """ - Convert SCIP Relationship objects to our SymbolRelationships format. - - Args: - scip_relationships: List of SCIP Relationship objects - document: SCIP document for context - - Returns: - SymbolRelationships object or None - """ - if not scip_relationships: - return None - - symbol_rels = SymbolRelationships() - - for scip_rel in scip_relationships: - # Extract symbol name from the relationship - target_name = self._extract_symbol_name(scip_rel.symbol) - - - # Create RelationshipInfo - rel_info = RelationshipInfo( - target=target_name, - target_symbol_id=scip_rel.symbol, - relationship_type=RelationshipType.FUNCTION_CALL if scip_rel.is_reference else RelationshipType.REFERENCE - ) - - # Add to appropriate category based on relationship type with deduplication - if scip_rel.is_reference: - # This is a "called_by" relationship (the symbol calls us) - symbol_rels.add_relationship(rel_info, is_reverse=True) - elif scip_rel.is_implementation: - symbol_rels.add_relationship(rel_info, is_reverse=True) # implements - elif scip_rel.is_type_definition: - symbol_rels.add_relationship(rel_info, is_reverse=False) # references - else: - symbol_rels.add_relationship(rel_info, is_reverse=False) # references - - return symbol_rels - - def _find_call_occurrence_position(self, caller_id: str, target_id: str, document) -> tuple[int, int]: - """ - Find the position where caller calls the target by looking up call occurrences. - - Args: - caller_id: The symbol ID of the calling function - target_id: The symbol ID of the called function - document: SCIP document containing occurrences - - Returns: - Tuple of (line, column) of the call or (0, 0) if not found - """ - try: - # Look through document occurrences to find where target_id is referenced - call_positions = [] - - for occurrence in document.occurrences: - if occurrence.symbol == target_id: - # Debug log the occurrence details - logger.debug(f"Found occurrence for {target_id}: roles={occurrence.symbol_roles}, range={occurrence.range}") - - # Only include reference/call occurrences, not definitions - # SCIP SymbolRole: 1=Definition, 8=Read/Reference - if occurrence.symbol_roles != 1: # Not a definition - # Extract line and column from the occurrence range - if occurrence.range and occurrence.range.start: - # SCIP uses 0-based indexing, convert to 1-based for display - line = occurrence.range.start[0] + 1 if len(occurrence.range.start) > 0 else 1 - column = occurrence.range.start[1] + 1 if len(occurrence.range.start) > 1 else 1 - call_positions.append((line, column)) - logger.debug(f"Added call position: line={line}, column={column}") - - # Return the first call position found (we can improve this later to be more specific) - if call_positions: - return call_positions[0] - - # Fallback: if not found in occurrences, return default - return 0, 0 - - except (AttributeError, IndexError, TypeError) as e: - # Handle any issues with accessing the occurrence data - logger.debug(f"Error in _find_call_occurrence_position: {e}") - return 0, 0 - - def _extract_symbol_name(self, symbol_id: str) -> str: - """Extract readable name from symbol ID.""" - if symbol_id.startswith('local '): - # Remove 'local ' prefix and any suffix - name = symbol_id[6:] - # Remove common suffixes - for suffix in ['().', '#', '.', '()']: - if name.endswith(suffix): - name = name[:-len(suffix)] - break - return name - return symbol_id - - def _organize_results(self, document, symbols: Dict[str, SymbolDefinition], scip_index=None) -> FileAnalysis: - """ - Organize extracted symbols into final FileAnalysis structure. - - Args: - document: SCIP document - symbols: Extracted symbol definitions - scip_index: Full SCIP index for external symbol extraction - - Returns: - FileAnalysis with organized results - """ - # Create file analysis result - result = FileAnalysis( - file_path=document.relative_path, - language=document.language, - line_count=self._estimate_line_count(document), - size_bytes=0 # TODO: Could get from filesystem if needed - ) - - # Add symbols to appropriate collections - for symbol in symbols.values(): - result.add_symbol(symbol) - - # Extract import information from occurrences - self._extract_imports(document, result.imports) - - # Also extract imports from external symbols (for strategies like Objective-C) - if scip_index: - self._extract_imports_from_external_symbols(scip_index, result.imports) - - return result - - - - def _estimate_line_count(self, document) -> int: - """Estimate line count from document data.""" - # Try to get from document text if available - if hasattr(document, 'text') and document.text: - return len(document.text.splitlines()) - - # Fallback: estimate from occurrence ranges - max_line = 0 - for occurrence in document.occurrences: - try: - if occurrence.range and occurrence.range.start: - line = occurrence.range.start[0] + 1 - max_line = max(max_line, line) - except (AttributeError, IndexError): - continue - - return max_line if max_line > 0 else 100 # Default estimate - - def _is_function_call(self, occurrence) -> bool: - """ - Check if an occurrence represents a function call. - - Based on debug analysis, function calls have roles=0 in our SCIP data, - so we need to identify them by other characteristics. - - Args: - occurrence: SCIP occurrence object - - Returns: - True if this occurrence is a function call - """ - try: - symbol = occurrence.symbol - roles = getattr(occurrence, 'symbol_roles', 0) - - # Check if it's a definition (role = 1) - these are NOT calls - if roles & 1: - return False - - # Check if it's an import (role = 2) - these are NOT calls - if roles & 2: - return False - - # For roles = 0, check if it looks like a function call by symbol format - if roles == 0: - # Function calls typically have () in the symbol - if '()' in symbol: - # But exclude definitions at line start positions - if hasattr(occurrence, 'range') and occurrence.range: - if hasattr(occurrence.range, 'start') and occurrence.range.start: - line = occurrence.range.start[0] + 1 - col = occurrence.range.start[1] + 1 - # Function definitions usually start at column 1 or 5 (indented) - # Function calls are usually at higher column positions - return col > 5 - return True - - # Traditional role-based detection as fallback - if SCIP_PROTO_AVAILABLE: - return bool(roles & (scip_pb2.SymbolRole.Read | scip_pb2.SymbolRole.Reference)) - else: - # Fallback: Read=8, Reference=4 - return bool(roles & (8 | 4)) - - except (AttributeError, TypeError): - return False - - def _find_containing_function(self, occurrence, function_symbols: Dict[str, SymbolDefinition], document) -> Optional[SymbolDefinition]: - """ - Find which function contains the given occurrence. - - Args: - occurrence: SCIP occurrence object - function_symbols: Map of SCIP symbols to function definitions - document: SCIP document - - Returns: - SymbolDefinition of the containing function, or None - """ - try: - occurrence_line = self._get_occurrence_line(occurrence) - if occurrence_line <= 0: - return None - - # Find the function that contains this line - best_match = None - best_distance = float('inf') - - for scip_symbol, func_def in function_symbols.items(): - # Function should start before or at the occurrence line - if func_def.line <= occurrence_line: - distance = occurrence_line - func_def.line - if distance < best_distance: - best_distance = distance - best_match = func_def - - return best_match - - except Exception as e: - logger.debug(f"Error finding containing function: {e}") - return None - - def _get_occurrence_line(self, occurrence) -> int: - """Extract line number from SCIP occurrence.""" - try: - if hasattr(occurrence, 'range') and occurrence.range: - if hasattr(occurrence.range, 'start') and occurrence.range.start: - return occurrence.range.start[0] + 1 # Convert to 1-based - except (AttributeError, IndexError, TypeError): - pass - return 0 - - def _resolve_call_target(self, target_symbol: str, scip_index, current_document) -> Optional[Dict[str, Any]]: - """Use SCIPSymbolManager to resolve call target information. - - Args: - target_symbol: SCIP symbol being called - scip_index: Full SCIP index for cross-file lookup - current_document: Current document for local symbol context - - Returns: - Dictionary with call target information or None - """ - if not self._symbol_parser: - return self._fallback_resolve_target(target_symbol, current_document) - - try: - # Use SCIPSymbolManager to parse symbol - symbol_info = self._symbol_parser.parse_symbol(target_symbol) - if not symbol_info: - return None - - # Extract clear symbol name from descriptors - target_name = self._extract_symbol_name_from_descriptors(symbol_info.descriptors) - - # Handle based on manager type - if symbol_info.manager == 'local': - # Local call: use existing file path extraction - file_path = self._symbol_parser.get_file_path_from_symbol(target_symbol) - target_line = self._find_local_symbol_location(target_symbol, current_document) - return { - 'name': target_name, - 'scope': 'local', - 'file': file_path or current_document.relative_path, - 'line': target_line - } - - elif symbol_info.manager in ['stdlib', 'pip', 'npm']: - # External call: get info from parsed results - return { - 'name': target_name, - 'scope': 'external', - 'package': symbol_info.package, - 'module': self._extract_module_from_descriptors(symbol_info.descriptors) - } - - return None - - except Exception as e: - logger.debug(f"Error resolving call target {target_symbol}: {e}") - return None - - - def _find_symbol_definition(self, target_symbol: str, scip_index) -> tuple[Optional[str], int]: - """ - Find the definition location of a symbol in the SCIP index. - - Args: - target_symbol: SCIP symbol to find - scip_index: Full SCIP index - - Returns: - Tuple of (file_path, line_number) or (None, 0) if not found - """ - try: - for document in scip_index.documents: - for occurrence in document.occurrences: - if (occurrence.symbol == target_symbol and - self._is_definition(occurrence)): - line = self._get_occurrence_line(occurrence) - return document.relative_path, line - except Exception as e: - logger.debug(f"Error finding symbol definition: {e}") - - return None, 0 - - def _extract_symbol_name_from_descriptors(self, descriptors: str) -> str: - """Extract symbol name from SCIP descriptors.""" - # utils.py/helper_function() -> helper_function - # MyClass/method() -> method - if '/' in descriptors: - symbol_part = descriptors.split('/')[-1] - return symbol_part.rstrip('().') - return descriptors.rstrip('().') - - def _extract_module_from_descriptors(self, descriptors: str) -> Optional[str]: - """Extract module name from descriptors.""" - # os/ -> os, pathlib/Path -> pathlib - if '/' in descriptors: - return descriptors.split('/')[0] - return descriptors.strip('/') - - def _fallback_resolve_target(self, target_symbol: str, current_document) -> Optional[Dict[str, Any]]: - """Fallback resolution when SCIPSymbolManager is not available.""" - try: - # Parse the target symbol using legacy method - target_name, target_class = self._parse_symbol_identity(target_symbol) - if not target_name: - return None - - # Basic resolution for legacy formats - if target_symbol.startswith('local:'): - target_location = self._find_local_symbol_location(target_symbol, current_document) - return { - 'name': target_name, - 'scope': 'local', - 'file': current_document.relative_path, - 'line': target_location - } - - return { - 'name': target_name, - 'scope': 'unknown', - 'file': 'unknown', - 'line': 0 - } - - except Exception as e: - logger.debug(f"Fallback resolution failed for {target_symbol}: {e}") - return None - - def _find_local_symbol_location(self, target_symbol: str, document) -> int: - """Find the line number for a local symbol definition.""" - try: - for occurrence in document.occurrences: - if (occurrence.symbol == target_symbol and - self._is_definition(occurrence)): - return self._get_occurrence_line(occurrence) - except Exception as e: - logger.debug(f"Error finding local symbol location: {e}") - return 0 - - - - def _extract_imports(self, document, imports: ImportGroup): - """Use SCIPSymbolManager to correctly parse imports.""" - if not self._symbol_parser: - logger.debug("No symbol parser available, skipping import extraction") - return - - try: - seen_modules = set() - - # Method 1: Extract from occurrences with Import role (traditional approach) - for occurrence in document.occurrences: - # Only process Import role symbols - if not self._is_import_occurrence(occurrence): - continue - - symbol_info = self._symbol_parser.parse_symbol(occurrence.symbol) - if not symbol_info: - continue - - # Handle based on manager type - if symbol_info.manager == 'stdlib': - module_name = self._extract_module_from_descriptors(symbol_info.descriptors) - if module_name and module_name not in seen_modules: - imports.add_import(module_name, 'standard_library') - seen_modules.add(module_name) - - elif symbol_info.manager == 'pip': - # pip packages: package name is the module name - package_name = symbol_info.package - if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'third_party') - seen_modules.add(package_name) - - elif symbol_info.manager == 'local': - # Local imports: extract module path from descriptors - module_path = self._extract_local_module_path(symbol_info.descriptors) - if module_path and module_path not in seen_modules: - # For Zig imports, classify by module name - if any('.zig' in part for part in symbol_info.descriptors.split('/')): - import_type = self._classify_zig_import(module_path) - else: - import_type = 'local' - imports.add_import(module_path, import_type) - seen_modules.add(module_path) - - logger.debug(f"Extracted {len(seen_modules)} unique imports from SCIP occurrences") - - except Exception as e: - logger.debug(f"Error extracting imports from occurrences: {e}") - - def _extract_imports_from_external_symbols(self, scip_index, imports: ImportGroup): - """Extract imports from SCIP index external symbols (for strategies like Objective-C).""" - try: - if not hasattr(scip_index, 'external_symbols'): - logger.debug("No external_symbols in SCIP index") - return - - seen_modules = set() - - for symbol_info in scip_index.external_symbols: - if not symbol_info.symbol: - continue - - # Parse the external symbol - parsed_symbol = self._symbol_parser.parse_symbol(symbol_info.symbol) if self._symbol_parser else None - if not parsed_symbol: - # Fallback: try to extract framework name from symbol string - framework_name = self._extract_framework_from_symbol_string(symbol_info.symbol) - if framework_name and framework_name not in seen_modules: - # Classify based on symbol pattern - import_type = self._classify_external_symbol(symbol_info.symbol) - imports.add_import(framework_name, import_type) - seen_modules.add(framework_name) - logger.debug(f"Extracted external dependency: {framework_name} ({import_type})") - continue - - # Handle based on manager type - if parsed_symbol.manager in ['system', 'unknown']: - # For Objective-C system frameworks - package_name = parsed_symbol.package - if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'standard_library') - seen_modules.add(package_name) - - elif parsed_symbol.manager in ['cocoapods', 'carthage']: - # Third-party Objective-C dependencies - package_name = parsed_symbol.package - if package_name and package_name not in seen_modules: - imports.add_import(package_name, 'third_party') - seen_modules.add(package_name) - - logger.debug(f"Extracted {len(seen_modules)} unique imports from external symbols") - - except Exception as e: - logger.debug(f"Error extracting imports from external symbols: {e}") - - def _extract_framework_from_symbol_string(self, symbol_string: str) -> Optional[str]: - """Extract framework name from SCIP symbol string.""" - try: - # Handle symbols like "scip-unknown unknown Foundation Foundation *." - parts = symbol_string.split() - if len(parts) >= 4: - # The package name is typically the 3rd or 4th part - for part in parts[2:5]: # Check parts 2, 3, 4 - if part and part != 'unknown' and not part.endswith('.'): - return part - return None - except Exception: - return None - - def _classify_external_symbol(self, symbol_string: str) -> str: - """Classify external symbol as standard_library, third_party, or local.""" - try: - # Check for known system frameworks - system_frameworks = { - 'Foundation', 'UIKit', 'CoreData', 'CoreGraphics', 'QuartzCore', - 'AVFoundation', 'CoreLocation', 'MapKit', 'CoreAnimation', - 'Security', 'SystemConfiguration', 'CFNetwork', 'CoreFoundation', - 'AppKit', 'Cocoa', 'WebKit', 'JavaScriptCore' - } - - for framework in system_frameworks: - if framework in symbol_string: - return 'standard_library' - - # Check for third-party indicators - if any(indicator in symbol_string.lower() for indicator in ['cocoapods', 'carthage', 'pods']): - return 'third_party' - - return 'standard_library' # Default for external symbols - - except Exception: - return 'standard_library' - - def _parse_external_module(self, external_symbol: str) -> Optional[Dict[str, str]]: - """Parse external SCIP symbol to extract module information.""" - try: - if not external_symbol.startswith('external:'): - return None - - # Remove 'external:' prefix and parse path - symbol_path = external_symbol[9:] - - # Extract base module path (before '/' or '#') - if '/' in symbol_path: - module_path = symbol_path.split('/')[0] - elif '#' in symbol_path: - module_path = symbol_path.split('#')[0] - else: - module_path = symbol_path - - # Clean up module path - module_path = module_path.rstrip('.') - if not module_path: - return None - - # Categorize the import - category = self._categorize_import(module_path) - - return { - 'module': module_path, - 'category': category - } - - except Exception as e: - logger.debug(f"Error parsing external module {external_symbol}: {e}") - return None - - def _categorize_import(self, module_path: str) -> str: - """Categorize import as standard_library, third_party, or local.""" - # Standard library modules (common ones) - stdlib_modules = { - 'os', 'sys', 'json', 'time', 'datetime', 'logging', 'pathlib', - 'typing', 'dataclasses', 'functools', 'itertools', 'collections', - 're', 'math', 'random', 'threading', 'subprocess', 'shutil', - 'contextlib', 'traceback', 'warnings', 'weakref', 'copy', - 'pickle', 'base64', 'hashlib', 'hmac', 'uuid', 'urllib', - 'http', 'socketserver', 'email', 'mimetypes', 'csv', 'configparser', - 'argparse', 'getopt', 'tempfile', 'glob', 'fnmatch', 'linecache', - 'pprint', 'textwrap', 'string', 'struct', 'codecs', 'unicodedata', - 'io', 'gzip', 'bz2', 'lzma', 'zipfile', 'tarfile' - } - - # Local imports (relative imports or project-specific patterns) - if module_path.startswith('.'): - return 'local' - - # Check for common project patterns - if any(pattern in module_path for pattern in ['src.', 'lib.', 'app.', 'project.']): - return 'local' - - # Standard library check - base_module = module_path.split('.')[0] - if base_module in stdlib_modules: - return 'standard_library' - - # Everything else is third_party - return 'third_party' - - - def _is_import_occurrence(self, occurrence) -> bool: - """Check if occurrence represents an import.""" - # Import role = 2 (based on debug results) - return hasattr(occurrence, 'symbol_roles') and (occurrence.symbol_roles & 2) - - def _extract_local_module_path(self, descriptors: str) -> Optional[str]: - """Extract module path from local descriptors.""" - # utils.py/helper_function() -> utils - # services/user_service.py/UserService -> services.user_service - # test/sample-projects/zig/code-index-example/src/main.zig/std. -> std - if '/' in descriptors: - parts = descriptors.split('/') - if len(parts) >= 2: - # For Zig: extract the symbol name (last part after the file path) - if any('.zig' in part for part in parts): - # Zig import: symbol name is the last part - symbol_name = parts[-1].rstrip('.') - return symbol_name - # For Python: traditional handling - file_part = parts[0] - if file_part.endswith('.py'): - return file_part[:-3].replace('/', '.') - return file_part.replace('/', '.') - return None - - def _classify_zig_import(self, module_name: str) -> str: - """Classify Zig import as standard_library, third_party, or local.""" - # Zig standard library modules - zig_stdlib = { - 'std', 'builtin', 'testing', 'math', 'fmt', 'mem', 'ascii', - 'unicode', 'json', 'crypto', 'compress', 'hash', 'http', - 'net', 'fs', 'os', 'process', 'thread', 'atomic', 'debug', - 'log', 'rand', 'sort', 'time', 'zig' - } - - # Local imports (relative paths) - if module_name.startswith('./') or module_name.startswith('../') or module_name.endswith('.zig'): - return 'local' - - # Standard library - if module_name in zig_stdlib: - return 'standard_library' - - # Everything else is third_party - return 'third_party' - - def _extract_class_name_from_descriptors(self, descriptors: str) -> Optional[str]: - """Extract class name from descriptors.""" - # test_empty_functions.py/TestClass# -> TestClass - # test_empty_functions.py/TestClass/method() -> TestClass (if this is class symbol) - parts = descriptors.split('/') - if len(parts) >= 2: - class_part = parts[1] - # Remove trailing # if present (class symbols end with #) - return class_part.rstrip('#') - return None - - def _is_class_member(self, descriptors: str, class_name: str) -> bool: - """Check if descriptors belongs to specified class member.""" - # test_empty_functions.py/TestClass/method_one() contains TestClass - return f"/{class_name}/" in descriptors - - def _extract_member_name(self, descriptors: str, class_name: str) -> Optional[str]: - """Extract class member name.""" - # test_empty_functions.py/TestClass/method_one() -> method_one - if f"/{class_name}/" in descriptors: - after_class = descriptors.split(f"/{class_name}/", 1)[1] - return after_class.rstrip('().') - return None - - def _is_method_kind(self, kind: int) -> bool: - """Check if SCIP kind represents a method or function.""" - method_kinds = {'function', 'method'} - kind_name = self._get_scip_kind_name(kind) - return kind_name in method_kinds - - def _infer_location_from_symbol_structure(self, scip_symbol: str, document) -> Optional[LocationInfo]: - """Infer location based on symbol structure using SCIPSymbolManager.""" - symbol_info = self._symbol_parser.parse_symbol(scip_symbol) - if not symbol_info: - return None - - try: - # Strategy 1: If class member, estimate based on class location - if '/' in symbol_info.descriptors: - parts = symbol_info.descriptors.split('/') - if len(parts) >= 3: # file.py/ClassName/member - class_symbol = f"{symbol_info.scheme} {symbol_info.manager} {symbol_info.package} {'/'.join(parts[:2])}" - class_location = self._find_symbol_location_in_document(class_symbol, document) - if class_location: - # Members usually 2-10 lines after class definition - return LocationInfo( - line=class_location.line + 3, - column=class_location.column + 4 - ) - - # Strategy 2: Estimate based on file path (if symbol belongs to current file) - if symbol_info.manager == 'local': - file_path = self._symbol_parser.get_file_path_from_symbol(scip_symbol) - if file_path and file_path in document.relative_path: - return self._estimate_position_in_file(symbol_info.descriptors, document) - - except Exception as e: - logger.debug(f"Symbol location inference failed: {e}") - - return None - - def _find_symbol_location_in_document(self, target_symbol: str, document) -> Optional[LocationInfo]: - """Find location of target symbol in document.""" - for occurrence in document.occurrences: - if occurrence.symbol == target_symbol: - location = self._parse_occurrence_location(occurrence) - if location: - return location - return None - - def _estimate_position_in_file(self, descriptors: str, document) -> Optional[LocationInfo]: - """Estimate position based on descriptors and document structure.""" - # Simple heuristic: estimate line based on symbol type - if 'class' in descriptors.lower(): - return LocationInfo(line=max(1, len(document.occurrences) // 4), column=1) - elif any(marker in descriptors.lower() for marker in ['function', 'method']): - return LocationInfo(line=max(5, len(document.occurrences) // 2), column=1) - else: - return LocationInfo(line=1, column=1) - - def _get_default_location_by_symbol_type(self, scip_symbol: str) -> LocationInfo: - """Provide reasonable default location based on symbol type.""" - symbol_lower = scip_symbol.lower() - if 'class' in symbol_lower: - return LocationInfo(line=1, column=1) # Classes usually at file start - elif any(marker in symbol_lower for marker in ['function', 'method']): - return LocationInfo(line=5, column=1) # Functions usually after imports - else: - return LocationInfo(line=1, column=1) # Other symbols default position - - def _create_empty_analysis(self, file_path: str) -> FileAnalysis: - """Create empty analysis result for missing files.""" - return FileAnalysis( - file_path=file_path, - language='unknown', - line_count=0, - size_bytes=0 - ) - - def _create_error_analysis(self, file_path: str, error_message: str) -> FileAnalysis: - """Create error analysis result.""" - logger.error(f"Analysis error for {file_path}: {error_message}") - result = FileAnalysis( - file_path=file_path, - language='unknown', - line_count=0, - size_bytes=0 - ) - # Could add error information to metadata if needed - return result - - def _extract_function_parameters(self, scip_symbol: str, symbol_info, document) -> List[str]: - """ - Extract function parameter names from SCIP data. - - Args: - scip_symbol: SCIP symbol identifier - symbol_info: SCIP symbol information - document: SCIP document containing occurrences - - Returns: - List of parameter names - """ - try: - # Try to extract from documentation (Python strategy stores params here) - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - for doc_line in symbol_info.documentation: - if doc_line.startswith('Parameters: '): - param_str = doc_line[12:] # Remove 'Parameters: ' - return [p.strip() for p in param_str.split(',') if p.strip()] - - # Try to extract from symbol information signature - if hasattr(symbol_info, 'signature') and symbol_info.signature: - return self._parse_signature_parameters(symbol_info.signature) - - # Fallback: try to extract from symbol occurrences and surrounding context - return self._extract_parameters_from_occurrences(scip_symbol, document) - - except Exception as e: - logger.debug(f"Failed to extract parameters for {scip_symbol}: {e}") - return [] - - def _parse_signature_parameters(self, signature: str) -> List[str]: - """Parse parameter names from function signature.""" - try: - # Basic signature parsing - handle common patterns - if '(' in signature and ')' in signature: - param_section = signature.split('(')[1].split(')')[0] - if not param_section.strip(): - return [] - - params = [] - for param in param_section.split(','): - param = param.strip() - if param: - # Extract parameter name (before type annotation if present) - param_name = param.split(':')[0].strip() - if param_name and param_name != 'self': - params.append(param_name) - elif param_name == 'self': - params.append('self') - - return params - - except Exception as e: - logger.debug(f"Error parsing signature parameters: {e}") - - return [] - - def _extract_parameters_from_occurrences(self, scip_symbol: str, document) -> List[str]: - """Extract parameters by analyzing symbol occurrences in the document.""" - # This is a simplified implementation - # A more sophisticated approach would analyze the AST or source code directly - return [] - - def _extract_return_type(self, scip_symbol: str, symbol_info) -> Optional[str]: - """Extract return type from SCIP data.""" - try: - if hasattr(symbol_info, 'signature') and symbol_info.signature: - signature = symbol_info.signature - if '->' in signature: - return_part = signature.split('->')[-1].strip() - return return_part if return_part else None - except Exception as e: - logger.debug(f"Error extracting return type for {scip_symbol}: {e}") - return None - - def _is_async_function(self, scip_symbol: str, symbol_info) -> bool: - """Check if function is async based on SCIP data.""" - try: - # Check documentation for async marker (Python AST analyzer stores this) - if hasattr(symbol_info, 'documentation') and symbol_info.documentation: - for doc_line in symbol_info.documentation: - if doc_line == 'Async function': - return True - - # Fallback: check signature - if hasattr(symbol_info, 'signature') and symbol_info.signature: - return 'async' in symbol_info.signature.lower() - except Exception as e: - logger.debug(f"Error checking async status for {scip_symbol}: {e}") - return False - - def _extract_class_members(self, class_scip_symbol: str, document) -> tuple[List[str], List[str]]: - """Use SCIPSymbolManager to parse class members.""" - methods = [] - attributes = [] - - if not self._symbol_parser: - return methods, attributes - - try: - # Parse class symbol to get descriptors - class_info = self._symbol_parser.parse_symbol(class_scip_symbol) - if not class_info: - return methods, attributes - - # Extract class name from descriptors: file.py/ClassName -> ClassName - class_name = self._extract_class_name_from_descriptors(class_info.descriptors) - if not class_name: - return methods, attributes - - # Find all class members by looking for matching descriptors - for symbol_info in document.symbols: - if not self._symbol_parser: - continue - - member_info = self._symbol_parser.parse_symbol(symbol_info.symbol) - if not member_info or member_info.manager != 'local': - continue - - # Check if this symbol belongs to the class - if self._is_class_member(member_info.descriptors, class_name): - member_name = self._extract_member_name(member_info.descriptors, class_name) - if member_name: - # Classify based on SCIP kind - if self._is_method_kind(symbol_info.kind): - methods.append(member_name) - else: - attributes.append(member_name) - - except Exception as e: - logger.debug(f"Error extracting class members for {class_scip_symbol}: {e}") - - return methods, attributes - - def _extract_inheritance(self, class_scip_symbol: str, symbol_info) -> List[str]: - """Extract class inheritance information from SCIP data.""" - # This would require more sophisticated SCIP relationship analysis - # For now, return empty list - return [] - - def _extract_variable_type(self, scip_symbol: str, symbol_info) -> Optional[str]: - """Extract variable type from SCIP data.""" - try: - if hasattr(symbol_info, 'signature') and symbol_info.signature: - # Try to extract type annotation - signature = symbol_info.signature - if ':' in signature: - type_part = signature.split(':')[1].strip() - return type_part if type_part else None - except Exception as e: - logger.debug(f"Error extracting variable type for {scip_symbol}: {e}") - return None - - def _is_global_variable(self, scip_symbol: str, document) -> Optional[bool]: - """Check if variable is global based on SCIP symbol structure.""" - try: - # Global variables typically don't have class context - if '#' not in scip_symbol: - return True - return False - except Exception as e: - logger.debug(f"Error checking global status for {scip_symbol}: {e}") - return None - - def _extract_constant_value(self, scip_symbol: str, symbol_info) -> Optional[str]: - """Extract constant value from SCIP data.""" - try: - if hasattr(symbol_info, 'signature') and symbol_info.signature: - signature = symbol_info.signature - if '=' in signature: - value_part = signature.split('=')[1].strip() - return value_part if value_part else None - except Exception as e: - logger.debug(f"Error extracting constant value for {scip_symbol}: {e}") - return None - - def extract_scip_relationships(self, file_path: str, scip_index) -> Dict[str, List[tuple]]: - """ - Extract SCIP relationships from a file using the enhanced analysis pipeline. - - This method provides integration between the symbol analyzer and the new - SCIP relationship management system introduced in the implementation plan. - - Args: - file_path: Relative path to the file to analyze - scip_index: SCIP index containing all project data - - Returns: - Dictionary mapping source_symbol_id -> [(target_symbol_id, relationship_type), ...] - Compatible with SCIPRelationshipManager input format - - Raises: - ValueError: If file analysis fails or file not found - """ - try: - # Perform complete file analysis - file_analysis = self.analyze_file(file_path, scip_index) - - # Extract all SCIP relationships using the enhanced data structures - relationships = file_analysis.to_scip_relationships(self._symbol_parser) - - logger.debug(f"Extracted SCIP relationships for {file_path}: " - f"{len(relationships)} symbols with relationships, " - f"{sum(len(rels) for rels in relationships.values())} total relationships") - - return relationships - - except Exception as e: - logger.error(f"Failed to extract SCIP relationships from {file_path}: {e}") - raise ValueError(f"SCIP relationship extraction failed: {e}") - - def batch_extract_relationships(self, file_paths: List[str], scip_index) -> Dict[str, Dict[str, List[tuple]]]: - """ - Extract SCIP relationships from multiple files efficiently. - - This method provides batch processing capabilities for the relationship - management system, optimizing performance for large codebases. - - Args: - file_paths: List of relative file paths to analyze - scip_index: SCIP index containing all project data - - Returns: - Dictionary mapping file_path -> {source_symbol_id -> [(target_symbol_id, relationship_type), ...]} - """ - results = {} - - for i, file_path in enumerate(file_paths, 1): - try: - relationships = self.extract_scip_relationships(file_path, scip_index) - results[file_path] = relationships - - if i % 10 == 0 or i == len(file_paths): - logger.debug(f"Batch relationship extraction progress: {i}/{len(file_paths)} files") - - except Exception as e: - logger.warning(f"Failed to extract relationships from {file_path}: {e}") - results[file_path] = {} # Empty result for failed files - continue - - total_files = len(results) - total_relationships = sum( - sum(len(rels) for rels in file_rels.values()) - for file_rels in results.values() - ) - - logger.info(f"Batch relationship extraction completed: {total_files} files, {total_relationships} total relationships") - - return results \ No newline at end of file diff --git a/src/code_index_mcp/tools/scip/symbol_definitions.py b/src/code_index_mcp/tools/scip/symbol_definitions.py deleted file mode 100644 index 4bfecd5..0000000 --- a/src/code_index_mcp/tools/scip/symbol_definitions.py +++ /dev/null @@ -1,291 +0,0 @@ -""" -Symbol Definitions - Core data structures for enhanced symbol analysis - -This module defines the data structures used by SCIPSymbolAnalyzer to represent -accurate symbol information and call relationships in a format optimized for LLM consumption. -""" - -from typing import Dict, List, Optional, Any -from dataclasses import dataclass, field - -from .relationship_info import SymbolRelationships - - -class SymbolLocationError(Exception): - """Raised when symbol location cannot be determined from SCIP data.""" - pass - - -class SymbolResolutionError(Exception): - """Raised when symbol cannot be resolved or parsed.""" - pass - - -@dataclass -class LocationInfo: - """Precise location information for a symbol.""" - line: int - column: int - confidence: str = 'high' # 'high', 'fallback', 'estimated' - - def to_dict(self) -> Dict[str, int]: - """Convert to dictionary format for JSON output.""" - return {"line": self.line, "column": self.column} - - -# CallRelationships class removed - now using unified SymbolRelationships - - -@dataclass -class SymbolDefinition: - """Enhanced symbol definition with accurate metadata.""" - name: str - line: int - column: int - symbol_type: str # 'function', 'method', 'class', 'variable', 'constant' - - # Optional metadata - class_name: Optional[str] = None - parameters: List[str] = field(default_factory=list) - return_type: Optional[str] = None - is_async: bool = False - - # Unified relationships (for all symbol types) - relationships: SymbolRelationships = field(default_factory=lambda: SymbolRelationships()) - - # Additional class-specific fields - methods: List[str] = field(default_factory=list) # For classes - attributes: List[str] = field(default_factory=list) # For classes - inherits_from: List[str] = field(default_factory=list) # For classes - - # Variable/constant-specific fields - is_global: Optional[bool] = None # For variables - type: Optional[str] = None # For variables - value: Optional[str] = None # For constants - - # Internal tracking - scip_symbol: str = "" # Original SCIP symbol for debugging - - def is_callable(self) -> bool: - """Check if this symbol represents a callable (function/method).""" - return self.symbol_type in ['function', 'method'] - - def is_class_member(self) -> bool: - """Check if this symbol belongs to a class.""" - return self.class_name is not None - - def to_function_dict(self) -> Dict[str, Any]: - """Convert to function format for JSON output.""" - result = { - "name": self.name, - "line": self.line, - "column": self.column, - "class": self.class_name, - "parameters": self.parameters, - "return_type": self.return_type, - "is_async": self.is_async - } - - # Add relationships if they exist - relationships_dict = self.relationships.to_dict() - if relationships_dict: - result["relationships"] = relationships_dict - - return result - - def to_class_dict(self) -> Dict[str, Any]: - """Convert to class format for JSON output.""" - result = { - "name": self.name, - "line": self.line, - "column": self.column, - "methods": self.methods, - "attributes": self.attributes, - "inherits_from": self.inherits_from - } - - # Add relationships if they exist - relationships_dict = self.relationships.to_dict() - if relationships_dict: - result["relationships"] = relationships_dict - - return result - - def to_variable_dict(self) -> Dict[str, Any]: - """Convert to variable format for JSON output.""" - result = { - "name": self.name, - "line": self.line, - "column": self.column, - "is_global": self.is_global, - "type": self.type - } - - # Add relationships if they exist - relationships_dict = self.relationships.to_dict() - if relationships_dict: - result["relationships"] = relationships_dict - - return result - - def to_constant_dict(self) -> Dict[str, Any]: - """Convert to constant format for JSON output.""" - return { - "name": self.name, - "line": self.line, - "column": self.column, - "value": self.value - } - - def to_scip_relationships(self, symbol_manager=None, language="", file_path="") -> List[tuple]: - """Convert symbol relationships to SCIP format.""" - scip_relationships = [] - - # Convert all relationships to SCIP tuples - for rel in self.relationships.calls: - scip_relationships.append((rel.target_symbol_id, "calls")) - for rel in self.relationships.inherits_from: - scip_relationships.append((rel.target_symbol_id, "inherits")) - for rel in self.relationships.implements: - scip_relationships.append((rel.target_symbol_id, "implements")) - for rel in self.relationships.references: - scip_relationships.append((rel.target_symbol_id, "references")) - - return scip_relationships - - -@dataclass -class ImportGroup: - """Organized import information.""" - standard_library: List[str] = field(default_factory=list) - third_party: List[str] = field(default_factory=list) - local: List[str] = field(default_factory=list) - - def add_import(self, module_name: str, import_type: str = 'unknown'): - """Add an import to the appropriate group.""" - if import_type == 'standard_library': - if module_name not in self.standard_library: - self.standard_library.append(module_name) - elif import_type == 'third_party': - if module_name not in self.third_party: - self.third_party.append(module_name) - elif import_type == 'local': - if module_name not in self.local: - self.local.append(module_name) - - def to_dict(self) -> Dict[str, List[str]]: - """Convert to dictionary format for JSON output.""" - return { - "standard_library": self.standard_library, - "third_party": self.third_party, - "local": self.local - } - - -@dataclass -class FileAnalysis: - """Complete file analysis result matching the exact output specification.""" - file_path: str - language: str - line_count: int - size_bytes: int = 0 - - # Symbol collections organized by type - functions: List[SymbolDefinition] = field(default_factory=list) - classes: List[SymbolDefinition] = field(default_factory=list) - variables: List[SymbolDefinition] = field(default_factory=list) - constants: List[SymbolDefinition] = field(default_factory=list) - - # Dependency information - imports: ImportGroup = field(default_factory=lambda: ImportGroup()) - - - def add_symbol(self, symbol: SymbolDefinition): - """Add a symbol to the appropriate collection based on its type.""" - if symbol.symbol_type == 'function' or symbol.symbol_type == 'method': - self.functions.append(symbol) - elif symbol.symbol_type == 'class': - self.classes.append(symbol) - elif symbol.symbol_type == 'variable': - self.variables.append(symbol) - elif symbol.symbol_type == 'constant': - self.constants.append(symbol) - - def get_function_by_name(self, name: str) -> Optional[SymbolDefinition]: - """Find a function by name.""" - for func in self.functions: - if func.name == name: - return func - return None - - def get_class_by_name(self, name: str) -> Optional[SymbolDefinition]: - """Find a class by name.""" - for cls in self.classes: - if cls.name == name: - return cls - return None - - - def to_dict(self) -> Dict[str, Any]: - """Convert to final JSON output format - simplified for token efficiency.""" - return { - "file_path": self.file_path, - "language": self.language, - "basic_info": { - "line_count": self.line_count - }, - "symbols": { - "functions": [func.to_function_dict() for func in self.functions], - "classes": [cls.to_class_dict() for cls in self.classes], - "variables": [var.to_variable_dict() for var in self.variables], - "constants": [const.to_constant_dict() for const in self.constants] - }, - "status": "success" - } - - def to_scip_relationships(self, symbol_manager=None) -> Dict[str, List[tuple]]: - """ - Extract all SCIP relationships from this file analysis. - - This method provides a unified interface to get all symbol relationships - in SCIP format, compatible with the relationship management system. - - Args: - symbol_manager: Optional symbol manager for generating proper symbol IDs - - Returns: - Dictionary mapping source_symbol_id -> [(target_symbol_id, relationship_type), ...] - """ - all_relationships = {} - - # Process all symbol types - all_symbols = self.functions + self.classes + self.variables + self.constants - - for symbol in all_symbols: - # Create source symbol ID - if symbol_manager: - source_symbol_id = symbol_manager.create_local_symbol( - language=self.language, - file_path=self.file_path, - symbol_path=[symbol.name], - descriptor=self._get_symbol_descriptor(symbol) - ) - else: - source_symbol_id = f"local {symbol.name}{self._get_symbol_descriptor(symbol)}" - - # Get relationships for this symbol - symbol_relationships = symbol.to_scip_relationships(symbol_manager, self.language, self.file_path) - - if symbol_relationships: - all_relationships[source_symbol_id] = symbol_relationships - - return all_relationships - - def _get_symbol_descriptor(self, symbol: SymbolDefinition) -> str: - """Get SCIP descriptor suffix for a symbol.""" - if symbol.symbol_type in ['function', 'method']: - return "()." - elif symbol.symbol_type == 'class': - return "#" - else: - return "" \ No newline at end of file diff --git a/uv.lock b/uv.lock index a2c9dde..6642d2e 100644 --- a/uv.lock +++ b/uv.lock @@ -52,10 +52,9 @@ name = "code-index-mcp" version = "2.1.2" source = { editable = "." } dependencies = [ - { name = "libclang" }, { name = "mcp" }, + { name = "msgpack" }, { name = "pathspec" }, - { name = "protobuf" }, { name = "tree-sitter" }, { name = "tree-sitter-java" }, { name = "tree-sitter-javascript" }, @@ -66,10 +65,9 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "libclang", specifier = ">=16.0.0" }, { name = "mcp", specifier = ">=0.3.0" }, + { name = "msgpack", specifier = ">=1.0.0" }, { name = "pathspec", specifier = ">=0.12.1" }, - { name = "protobuf", specifier = ">=4.21.0" }, { name = "tree-sitter", specifier = ">=0.20.0" }, { name = "tree-sitter-java", specifier = ">=0.20.0" }, { name = "tree-sitter-javascript", specifier = ">=0.20.0" }, @@ -151,23 +149,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] -[[package]] -name = "libclang" -version = "18.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6e/5c/ca35e19a4f142adffa27e3d652196b7362fa612243e2b916845d801454fc/libclang-18.1.1.tar.gz", hash = "sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250", size = 39612 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/49/f5e3e7e1419872b69f6f5e82ba56e33955a74bd537d8a1f5f1eff2f3668a/libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a", size = 25836045 }, - { url = "https://files.pythonhosted.org/packages/e2/e5/fc61bbded91a8830ccce94c5294ecd6e88e496cc85f6704bf350c0634b70/libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5", size = 26502641 }, - { url = "https://files.pythonhosted.org/packages/db/ed/1df62b44db2583375f6a8a5e2ca5432bbdc3edb477942b9b7c848c720055/libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8", size = 26420207 }, - { url = "https://files.pythonhosted.org/packages/1d/fc/716c1e62e512ef1c160e7984a73a5fc7df45166f2ff3f254e71c58076f7c/libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b", size = 24515943 }, - { url = "https://files.pythonhosted.org/packages/3c/3d/f0ac1150280d8d20d059608cf2d5ff61b7c3b7f7bcf9c0f425ab92df769a/libclang-18.1.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592", size = 23784972 }, - { url = "https://files.pythonhosted.org/packages/fe/2f/d920822c2b1ce9326a4c78c0c2b4aa3fde610c7ee9f631b600acb5376c26/libclang-18.1.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe", size = 20259606 }, - { url = "https://files.pythonhosted.org/packages/2d/c2/de1db8c6d413597076a4259cea409b83459b2db997c003578affdd32bf66/libclang-18.1.1-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f", size = 24921494 }, - { url = "https://files.pythonhosted.org/packages/0b/2d/3f480b1e1d31eb3d6de5e3ef641954e5c67430d5ac93b7fa7e07589576c7/libclang-18.1.1-py2.py3-none-win_amd64.whl", hash = "sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb", size = 26415083 }, - { url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112 }, -] - [[package]] name = "mcp" version = "1.4.1" @@ -188,26 +169,60 @@ wheels = [ ] [[package]] -name = "pathspec" -version = "0.12.1" +name = "msgpack" +version = "1.1.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043 } +sdist = { url = "https://files.pythonhosted.org/packages/45/b1/ea4f68038a18c77c9467400d166d74c4ffa536f34761f7983a104357e614/msgpack-1.1.1.tar.gz", hash = "sha256:77b79ce34a2bdab2594f490c8e80dd62a02d650b91a75159a63ec413b8d104cd", size = 173555 } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 }, + { url = "https://files.pythonhosted.org/packages/33/52/f30da112c1dc92cf64f57d08a273ac771e7b29dea10b4b30369b2d7e8546/msgpack-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:353b6fc0c36fde68b661a12949d7d49f8f51ff5fa019c1e47c87c4ff34b080ed", size = 81799 }, + { url = "https://files.pythonhosted.org/packages/e4/35/7bfc0def2f04ab4145f7f108e3563f9b4abae4ab0ed78a61f350518cc4d2/msgpack-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:79c408fcf76a958491b4e3b103d1c417044544b68e96d06432a189b43d1215c8", size = 78278 }, + { url = "https://files.pythonhosted.org/packages/e8/c5/df5d6c1c39856bc55f800bf82778fd4c11370667f9b9e9d51b2f5da88f20/msgpack-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78426096939c2c7482bf31ef15ca219a9e24460289c00dd0b94411040bb73ad2", size = 402805 }, + { url = "https://files.pythonhosted.org/packages/20/8e/0bb8c977efecfe6ea7116e2ed73a78a8d32a947f94d272586cf02a9757db/msgpack-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b17ba27727a36cb73aabacaa44b13090feb88a01d012c0f4be70c00f75048b4", size = 408642 }, + { url = "https://files.pythonhosted.org/packages/59/a1/731d52c1aeec52006be6d1f8027c49fdc2cfc3ab7cbe7c28335b2910d7b6/msgpack-1.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a17ac1ea6ec3c7687d70201cfda3b1e8061466f28f686c24f627cae4ea8efd0", size = 395143 }, + { url = "https://files.pythonhosted.org/packages/2b/92/b42911c52cda2ba67a6418ffa7d08969edf2e760b09015593c8a8a27a97d/msgpack-1.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:88d1e966c9235c1d4e2afac21ca83933ba59537e2e2727a999bf3f515ca2af26", size = 395986 }, + { url = "https://files.pythonhosted.org/packages/61/dc/8ae165337e70118d4dab651b8b562dd5066dd1e6dd57b038f32ebc3e2f07/msgpack-1.1.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f6d58656842e1b2ddbe07f43f56b10a60f2ba5826164910968f5933e5178af75", size = 402682 }, + { url = "https://files.pythonhosted.org/packages/58/27/555851cb98dcbd6ce041df1eacb25ac30646575e9cd125681aa2f4b1b6f1/msgpack-1.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:96decdfc4adcbc087f5ea7ebdcfd3dee9a13358cae6e81d54be962efc38f6338", size = 406368 }, + { url = "https://files.pythonhosted.org/packages/d4/64/39a26add4ce16f24e99eabb9005e44c663db00e3fce17d4ae1ae9d61df99/msgpack-1.1.1-cp310-cp310-win32.whl", hash = "sha256:6640fd979ca9a212e4bcdf6eb74051ade2c690b862b679bfcb60ae46e6dc4bfd", size = 65004 }, + { url = "https://files.pythonhosted.org/packages/7d/18/73dfa3e9d5d7450d39debde5b0d848139f7de23bd637a4506e36c9800fd6/msgpack-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:8b65b53204fe1bd037c40c4148d00ef918eb2108d24c9aaa20bc31f9810ce0a8", size = 71548 }, + { url = "https://files.pythonhosted.org/packages/7f/83/97f24bf9848af23fe2ba04380388216defc49a8af6da0c28cc636d722502/msgpack-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:71ef05c1726884e44f8b1d1773604ab5d4d17729d8491403a705e649116c9558", size = 82728 }, + { url = "https://files.pythonhosted.org/packages/aa/7f/2eaa388267a78401f6e182662b08a588ef4f3de6f0eab1ec09736a7aaa2b/msgpack-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:36043272c6aede309d29d56851f8841ba907a1a3d04435e43e8a19928e243c1d", size = 79279 }, + { url = "https://files.pythonhosted.org/packages/f8/46/31eb60f4452c96161e4dfd26dbca562b4ec68c72e4ad07d9566d7ea35e8a/msgpack-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a32747b1b39c3ac27d0670122b57e6e57f28eefb725e0b625618d1b59bf9d1e0", size = 423859 }, + { url = "https://files.pythonhosted.org/packages/45/16/a20fa8c32825cc7ae8457fab45670c7a8996d7746ce80ce41cc51e3b2bd7/msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a8b10fdb84a43e50d38057b06901ec9da52baac6983d3f709d8507f3889d43f", size = 429975 }, + { url = "https://files.pythonhosted.org/packages/86/ea/6c958e07692367feeb1a1594d35e22b62f7f476f3c568b002a5ea09d443d/msgpack-1.1.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba0c325c3f485dc54ec298d8b024e134acf07c10d494ffa24373bea729acf704", size = 413528 }, + { url = "https://files.pythonhosted.org/packages/75/05/ac84063c5dae79722bda9f68b878dc31fc3059adb8633c79f1e82c2cd946/msgpack-1.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:88daaf7d146e48ec71212ce21109b66e06a98e5e44dca47d853cbfe171d6c8d2", size = 413338 }, + { url = "https://files.pythonhosted.org/packages/69/e8/fe86b082c781d3e1c09ca0f4dacd457ede60a13119b6ce939efe2ea77b76/msgpack-1.1.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8b55ea20dc59b181d3f47103f113e6f28a5e1c89fd5b67b9140edb442ab67f2", size = 422658 }, + { url = "https://files.pythonhosted.org/packages/3b/2b/bafc9924df52d8f3bb7c00d24e57be477f4d0f967c0a31ef5e2225e035c7/msgpack-1.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4a28e8072ae9779f20427af07f53bbb8b4aa81151054e882aee333b158da8752", size = 427124 }, + { url = "https://files.pythonhosted.org/packages/a2/3b/1f717e17e53e0ed0b68fa59e9188f3f610c79d7151f0e52ff3cd8eb6b2dc/msgpack-1.1.1-cp311-cp311-win32.whl", hash = "sha256:7da8831f9a0fdb526621ba09a281fadc58ea12701bc709e7b8cbc362feabc295", size = 65016 }, + { url = "https://files.pythonhosted.org/packages/48/45/9d1780768d3b249accecc5a38c725eb1e203d44a191f7b7ff1941f7df60c/msgpack-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fd1b58e1431008a57247d6e7cc4faa41c3607e8e7d4aaf81f7c29ea013cb458", size = 72267 }, + { url = "https://files.pythonhosted.org/packages/e3/26/389b9c593eda2b8551b2e7126ad3a06af6f9b44274eb3a4f054d48ff7e47/msgpack-1.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ae497b11f4c21558d95de9f64fff7053544f4d1a17731c866143ed6bb4591238", size = 82359 }, + { url = "https://files.pythonhosted.org/packages/ab/65/7d1de38c8a22cf8b1551469159d4b6cf49be2126adc2482de50976084d78/msgpack-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:33be9ab121df9b6b461ff91baac6f2731f83d9b27ed948c5b9d1978ae28bf157", size = 79172 }, + { url = "https://files.pythonhosted.org/packages/0f/bd/cacf208b64d9577a62c74b677e1ada005caa9b69a05a599889d6fc2ab20a/msgpack-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f64ae8fe7ffba251fecb8408540c34ee9df1c26674c50c4544d72dbf792e5ce", size = 425013 }, + { url = "https://files.pythonhosted.org/packages/4d/ec/fd869e2567cc9c01278a736cfd1697941ba0d4b81a43e0aa2e8d71dab208/msgpack-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a494554874691720ba5891c9b0b39474ba43ffb1aaf32a5dac874effb1619e1a", size = 426905 }, + { url = "https://files.pythonhosted.org/packages/55/2a/35860f33229075bce803a5593d046d8b489d7ba2fc85701e714fc1aaf898/msgpack-1.1.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb643284ab0ed26f6957d969fe0dd8bb17beb567beb8998140b5e38a90974f6c", size = 407336 }, + { url = "https://files.pythonhosted.org/packages/8c/16/69ed8f3ada150bf92745fb4921bd621fd2cdf5a42e25eb50bcc57a5328f0/msgpack-1.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d275a9e3c81b1093c060c3837e580c37f47c51eca031f7b5fb76f7b8470f5f9b", size = 409485 }, + { url = "https://files.pythonhosted.org/packages/c6/b6/0c398039e4c6d0b2e37c61d7e0e9d13439f91f780686deb8ee64ecf1ae71/msgpack-1.1.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4fd6b577e4541676e0cc9ddc1709d25014d3ad9a66caa19962c4f5de30fc09ef", size = 412182 }, + { url = "https://files.pythonhosted.org/packages/b8/d0/0cf4a6ecb9bc960d624c93effaeaae75cbf00b3bc4a54f35c8507273cda1/msgpack-1.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb29aaa613c0a1c40d1af111abf025f1732cab333f96f285d6a93b934738a68a", size = 419883 }, + { url = "https://files.pythonhosted.org/packages/62/83/9697c211720fa71a2dfb632cad6196a8af3abea56eece220fde4674dc44b/msgpack-1.1.1-cp312-cp312-win32.whl", hash = "sha256:870b9a626280c86cff9c576ec0d9cbcc54a1e5ebda9cd26dab12baf41fee218c", size = 65406 }, + { url = "https://files.pythonhosted.org/packages/c0/23/0abb886e80eab08f5e8c485d6f13924028602829f63b8f5fa25a06636628/msgpack-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:5692095123007180dca3e788bb4c399cc26626da51629a31d40207cb262e67f4", size = 72558 }, + { url = "https://files.pythonhosted.org/packages/a1/38/561f01cf3577430b59b340b51329803d3a5bf6a45864a55f4ef308ac11e3/msgpack-1.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3765afa6bd4832fc11c3749be4ba4b69a0e8d7b728f78e68120a157a4c5d41f0", size = 81677 }, + { url = "https://files.pythonhosted.org/packages/09/48/54a89579ea36b6ae0ee001cba8c61f776451fad3c9306cd80f5b5c55be87/msgpack-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8ddb2bcfd1a8b9e431c8d6f4f7db0773084e107730ecf3472f1dfe9ad583f3d9", size = 78603 }, + { url = "https://files.pythonhosted.org/packages/a0/60/daba2699b308e95ae792cdc2ef092a38eb5ee422f9d2fbd4101526d8a210/msgpack-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:196a736f0526a03653d829d7d4c5500a97eea3648aebfd4b6743875f28aa2af8", size = 420504 }, + { url = "https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d592d06e3cc2f537ceeeb23d38799c6ad83255289bb84c2e5792e5a8dea268a", size = 423749 }, + { url = "https://files.pythonhosted.org/packages/40/1b/54c08dd5452427e1179a40b4b607e37e2664bca1c790c60c442c8e972e47/msgpack-1.1.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4df2311b0ce24f06ba253fda361f938dfecd7b961576f9be3f3fbd60e87130ac", size = 404458 }, + { url = "https://files.pythonhosted.org/packages/2e/60/6bb17e9ffb080616a51f09928fdd5cac1353c9becc6c4a8abd4e57269a16/msgpack-1.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e4141c5a32b5e37905b5940aacbc59739f036930367d7acce7a64e4dec1f5e0b", size = 405976 }, + { url = "https://files.pythonhosted.org/packages/ee/97/88983e266572e8707c1f4b99c8fd04f9eb97b43f2db40e3172d87d8642db/msgpack-1.1.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b1ce7f41670c5a69e1389420436f41385b1aa2504c3b0c30620764b15dded2e7", size = 408607 }, + { url = "https://files.pythonhosted.org/packages/bc/66/36c78af2efaffcc15a5a61ae0df53a1d025f2680122e2a9eb8442fed3ae4/msgpack-1.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4147151acabb9caed4e474c3344181e91ff7a388b888f1e19ea04f7e73dc7ad5", size = 424172 }, + { url = "https://files.pythonhosted.org/packages/8c/87/a75eb622b555708fe0427fab96056d39d4c9892b0c784b3a721088c7ee37/msgpack-1.1.1-cp313-cp313-win32.whl", hash = "sha256:500e85823a27d6d9bba1d057c871b4210c1dd6fb01fbb764e37e4e8847376323", size = 65347 }, + { url = "https://files.pythonhosted.org/packages/ca/91/7dc28d5e2a11a5ad804cf2b7f7a5fcb1eb5a4966d66a5d2b41aee6376543/msgpack-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:6d489fba546295983abd142812bda76b57e33d0b9f5d5b71c09a583285506f69", size = 72341 }, ] [[package]] -name = "protobuf" -version = "6.31.1" +name = "pathspec" +version = "0.12.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/52/f3/b9655a711b32c19720253f6f06326faf90580834e2e83f840472d752bc8b/protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a", size = 441797 } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043 } wheels = [ - { url = "https://files.pythonhosted.org/packages/f3/6f/6ab8e4bf962fd5570d3deaa2d5c38f0a363f57b4501047b5ebeb83ab1125/protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9", size = 423603 }, - { url = "https://files.pythonhosted.org/packages/44/3a/b15c4347dd4bf3a1b0ee882f384623e2063bb5cf9fa9d57990a4f7df2fb6/protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447", size = 435283 }, - { url = "https://files.pythonhosted.org/packages/6a/c9/b9689a2a250264a84e66c46d8862ba788ee7a641cdca39bccf64f59284b7/protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402", size = 425604 }, - { url = "https://files.pythonhosted.org/packages/76/a1/7a5a94032c83375e4fe7e7f56e3976ea6ac90c5e85fac8576409e25c39c3/protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39", size = 322115 }, - { url = "https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6", size = 321070 }, - { url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724 }, + { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 }, ] [[package]] From 3b1eb7410e8bb0232807d26ec52c597e85c3df93 Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Mon, 25 Aug 2025 13:48:39 +0800 Subject: [PATCH 7/8] Refactor TypeScript and Zig parsing strategies to utilize tree-sitter for improved parsing accuracy and performance. Remove regex-based fallback in Zig strategy. Introduce centralized file filtering logic to streamline file processing across services. Update file watcher service to leverage new filtering methods, enhancing exclusion handling for directories and files. Add utility functions for file filtering and update project configuration tool to reflect changes in supported extensions and filtering logic. --- LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md | 345 ------------------ README.md | 50 +-- README_ja.md | 54 +-- README_zh.md | 54 +-- src/code_index_mcp/constants.py | 41 +++ src/code_index_mcp/indexing/__init__.py | 3 +- src/code_index_mcp/indexing/index_provider.py | 83 ++--- .../indexing/json_index_builder.py | 40 +- .../indexing/json_index_manager.py | 126 +++++-- .../indexing/strategies/base_strategy.py | 40 +- .../indexing/strategies/fallback_strategy.py | 19 +- .../indexing/strategies/go_strategy.py | 60 +-- .../indexing/strategies/java_strategy.py | 136 ++----- .../strategies/javascript_strategy.py | 307 +++------------- .../strategies/objective_c_strategy.py | 53 ++- .../indexing/strategies/python_strategy.py | 62 +++- .../indexing/strategies/strategy_factory.py | 145 ++++---- .../strategies/typescript_strategy.py | 338 ++++------------- .../indexing/strategies/zig_strategy.py | 222 ++++------- .../services/file_watcher_service.py | 152 +------- .../tools/config/project_config_tool.py | 36 +- src/code_index_mcp/utils/__init__.py | 4 +- src/code_index_mcp/utils/file_filter.py | 177 +++++++++ 23 files changed, 924 insertions(+), 1623 deletions(-) delete mode 100644 LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md create mode 100644 src/code_index_mcp/utils/file_filter.py diff --git a/LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md b/LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md deleted file mode 100644 index 7710532..0000000 --- a/LLM_OPTIMIZED_INDEX_REPLACEMENT_PLAN.md +++ /dev/null @@ -1,345 +0,0 @@ -# LLM-Optimized Index Replacement Plan - -## Current Architecture Analysis - -### Actual Implementation Process -1. **Project Initialization**: LLM calls `set_project_path()` to establish project root -2. **File Watcher Activation**: Automatic file monitoring starts with debounced re-indexing -3. **Codebase Traversal**: System scans all files using extension whitelist (SUPPORTED_EXTENSIONS) -4. **Language-Specific Processing**: Different strategies for each language's unique characteristics -5. **Dual Storage**: Index stored in temporary path + in-memory for fast access -6. **Query Tools**: LLMs call analysis tools that use the built index - -### SCIP-Based System Issues -- **Complex Protocol**: SCIP protobuf format designed for IDEs, not LLM consumption -- **Over-Engineering**: Multi-layer abstraction (strategies/factories) creates complexity -- **Token Inefficiency**: Verbose SCIP format wastes LLM context tokens -- **Parsing Overhead**: Complex symbol ID generation and validation -- **Cross-Document Complexity**: Relationship building adds minimal LLM value - -### Current Flow Analysis -``` -set_project_path() → File Watcher Activation → Codebase Traversal (Extension Whitelist) → -Language-Specific Strategies → SCIP Builder → Index Storage (Temp + Memory) → -Query Tools Access Index -``` - -### Reusable Components -- **Extension Whitelist**: SUPPORTED_EXTENSIONS constant defining indexable file types -- **File Watcher Service**: Robust debounced file monitoring with auto re-indexing -- **Language Strategy System**: Multi-language support with unique characteristics per language -- **Dual Storage Pattern**: Temporary file storage + in-memory caching for performance -- **Service Architecture**: Clean 3-layer pattern (MCP → Services → Tools) -- **Tree-sitter Parsing**: High-quality AST parsing for supported languages - -## Replacement Architecture - -### Core Principle -Clean slate approach: Delete all SCIP components and build simple, LLM-optimized JSON indexing system from scratch. Preserve three-layer architecture by only replacing the tool layer. - -### New Index Format Design - -#### Design Rationale -The index should optimize for **LLM query patterns** rather than IDE features: - -1. **Function Tracing Focus**: LLMs primarily need to understand "what calls what" -2. **Fast Lookups**: Hash-based access for instant symbol resolution -3. **Minimal Redundancy**: Avoid duplicate data that wastes tokens -4. **Query-Friendly Structure**: Organize data how LLMs will actually access it -5. **Incremental Updates**: Support efficient file-by-file rebuilds - -#### Multi-Language Index Format -```json -{ - "metadata": { - "project_path": "/absolute/path/to/project", - "indexed_files": 275, - "index_version": "1.0.0", - "timestamp": "2025-01-15T10:30:00Z", - "languages": ["python", "javascript", "java", "objective-c"] - }, - - "symbols": { - "src/main.py::process_data": { - "type": "function", - "file": "src/main.py", - "line": 42, - "signature": "def process_data(items: List[str]) -> None:", - "called_by": ["src/main.py::main"] - }, - "src/main.py::MyClass": { - "type": "class", - "file": "src/main.py", - "line": 10 - }, - "src/main.py::MyClass.process": { - "type": "method", - "file": "src/main.py", - "line": 20, - "signature": "def process(self, data: str) -> bool:", - "called_by": ["src/main.py::process_data"] - }, - "src/MyClass.java::com.example.MyClass": { - "type": "class", - "file": "src/MyClass.java", - "line": 5, - "package": "com.example" - }, - "src/MyClass.java::com.example.MyClass.process": { - "type": "method", - "file": "src/MyClass.java", - "line": 10, - "signature": "public void process(String data)", - "called_by": ["src/Main.java::com.example.Main.main"] - }, - "src/main.js::regularFunction": { - "type": "function", - "file": "src/main.js", - "line": 5, - "signature": "function regularFunction(data)", - "called_by": ["src/main.js::main"] - }, - "src/main.js::MyClass.method": { - "type": "method", - "file": "src/main.js", - "line": 15, - "signature": "method(data)", - "called_by": ["src/main.js::regularFunction"] - } - }, - - "files": { - "src/main.py": { - "language": "python", - "line_count": 150, - "symbols": { - "functions": ["process_data", "helper"], - "classes": ["MyClass"] - }, - "imports": ["os", "json", "typing"] - }, - "src/MyClass.java": { - "language": "java", - "line_count": 80, - "symbols": { - "classes": ["MyClass"] - }, - "package": "com.example", - "imports": ["java.util.List", "java.io.File"] - }, - "src/main.js": { - "language": "javascript", - "line_count": 120, - "symbols": { - "functions": ["regularFunction", "helperFunction"], - "classes": ["MyClass"] - }, - "imports": ["fs", "path"], - "exports": ["regularFunction", "MyClass"] - } - } -} -``` - -#### Key Design Decisions - -**1. Universal Qualified Symbol Names** -- Use `"file::symbol"` for standalone symbols, `"file::scope.symbol"` for nested -- **Why**: Eliminates name collisions across all languages, consistent naming -- **LLM Benefit**: Unambiguous symbol identification with clear hierarchy - -**2. Multi-Language Consistency** -- Same symbol format for Python classes, Java packages, JavaScript exports -- **Why**: Single query pattern works across all languages -- **LLM Benefit**: Learn once, query any language the same way - -**3. Called-By Only Relationships** -- Track only `called_by` arrays, not `calls` -- **Why**: Simpler implementation, linear build performance, focuses on usage -- **LLM Benefit**: Direct answers to "where is function X used?" queries - -**4. Language-Specific Fields** -- Java: `package` field, JavaScript: `exports` array, etc. -- **Why**: Preserve important language semantics without complexity -- **LLM Benefit**: Access language-specific information when needed - -**5. Simplified File Structure** -- Organized `symbols` object with arrays by type (functions, classes) -- **Why**: Fast file-level queries, clear organization -- **LLM Benefit**: Immediate file overview showing what symbols exist - -**6. Scope Resolution Strategy** -- Python: `MyClass.method`, Java: `com.example.MyClass.method` -- **Why**: Natural language patterns, includes necessary context -- **LLM Benefit**: Symbol names match how developers think about code - -### Simplified Flow -``` -set_project_path() → File Watcher Activation → Extension Whitelist Traversal → -Language-Specific Simple Parsers → JSON Index Update → Dual Storage (Temp + Memory) → -Query Tools Access Optimized Index -``` - -## Implementation Plan - -### Phase 1: Clean Slate - Remove SCIP System -- **Delete all SCIP tools**: Remove `src/code_index_mcp/scip/` directory completely -- **Remove protobuf dependencies**: Clean up `scip_pb2.py` and related imports -- **Strip SCIP from services**: Remove SCIP references from business logic layers -- **Clean constants**: Remove `SCIP_INDEX_FILE` and related SCIP constants -- **Update dependencies**: Remove protobuf from `pyproject.toml` - -### Phase 2: Tool Layer Replacement -- **Keep three-layer architecture**: Only modify the tool layer, preserve services/MCP layers -- **New simple index format**: Implement lightweight JSON-based indexing tools -- **Language parsers**: Create simple parsers in tool layer (Python `ast`, simplified tree-sitter) -- **Storage tools**: Implement dual storage tools (temp + memory) for new format -- **Query tools**: Build fast lookup tools for the new index structure - -### Phase 3: Service Layer Integration -- **Minimal service changes**: Services delegate to new tools instead of SCIP tools -- **Preserve business logic**: Keep existing service workflows and validation -- **Maintain interfaces**: Services still expose same functionality to MCP layer -- **File watcher integration**: Connect file watcher to new index rebuild tools - -### Phase 4: MCP Layer Compatibility -- **Zero MCP changes**: Existing `@mcp.tool` functions unchanged -- **Same interfaces**: Tools return data in expected formats -- **Backward compatibility**: Existing LLM workflows continue working -- **Performance gains**: Faster responses with same functionality - -### Phase 5: Build from Scratch Mentality -- **New index design**: Simple, LLM-optimized format built fresh -- **Clean codebase**: Remove all SCIP complexity and start simple -- **Fresh dependencies**: Only essential libraries (no protobuf, simplified tree-sitter) -- **Focused scope**: Build only what's needed for LLM use cases - -## Technical Specifications - -### Index Storage -- **Dual Storage**: Temporary path (`%TEMP%/code_indexer//`) + in-memory caching -- **Format**: JSON with msgpack binary serialization for performance -- **Location**: Follow existing pattern (discoverable via constants.py) -- **Extension Filtering**: Use existing SUPPORTED_EXTENSIONS whitelist -- **Size**: ~10-50KB for typical projects vs ~1-5MB SCIP -- **Access**: Direct dict lookups vs protobuf traversal -- **File Watcher Integration**: Automatic updates when files change - -### Language Support -- **Python**: Built-in `ast` module for optimal performance and accuracy -- **JavaScript/TypeScript**: Existing tree-sitter parsers (proven reliability) -- **Other Languages**: Reuse existing tree-sitter implementations -- **Simplify**: Remove SCIP-specific symbol generation overhead -- **Focus**: Extract symbols and `called_by` relationships only - -### Query Performance -- **Target**: <100ms for any query operation -- **Method**: Hash-based lookups vs linear SCIP traversal -- **Caching**: In-memory symbol registry for instant access - -### File Watching -- **Keep**: Existing watchdog-based file monitoring -- **Optimize**: Batch incremental updates vs full rebuilds -- **Debounce**: Maintain 4-6 second debounce for change batching - -## Migration Strategy - -### Backward Compatibility -- **Zero breaking changes**: Same MCP tool interfaces and return formats -- **Preserve workflows**: File watcher, project setup, and query patterns unchanged -- **Service contracts**: Business logic layer contracts remain stable -- **LLM experience**: Existing LLM usage patterns continue working - -### Rollback Plan -- **Git branch strategy**: Preserve SCIP implementation in separate branch -- **Incremental deployment**: Can revert individual components if needed -- **Performance monitoring**: Compare old vs new system metrics -- **Fallback mechanism**: Quick switch back to SCIP if issues arise - -### Testing Strategy -- Compare output accuracy between SCIP and simple index -- Benchmark query performance improvements -- Validate function tracing completeness -- Test incremental update correctness - -## Expected Benefits - -### Performance Improvements -- **Index Build**: 5-10x faster (no protobuf, no complex call analysis) -- **Query Speed**: 10-100x faster (direct hash lookups) -- **Memory Usage**: 80% reduction (simple JSON vs protobuf) -- **Build Complexity**: Linear O(n) vs complex relationship resolution - -### Maintenance Benefits -- **Code Complexity**: 70% reduction (remove entire SCIP system) -- **Dependencies**: Remove protobuf, simplify tree-sitter usage -- **Debugging**: Human-readable JSON vs binary protobuf -- **Call Analysis**: Simple `called_by` tracking vs complex call graph building - -### LLM Integration Benefits -- **Fast Responses**: Sub-100ms query times for any symbol lookup -- **Token Efficiency**: Qualified names eliminate ambiguity -- **Simple Format**: Direct JSON access patterns -- **Focused Data**: Only essential information for code understanding - -## Risk Mitigation - -### Functionality Loss -- **Risk**: Missing advanced SCIP features -- **Mitigation**: Focus on core LLM use cases (function tracing) -- **Validation**: Compare query completeness with existing system - -### Performance Regression -- **Risk**: New implementation slower than expected -- **Mitigation**: Benchmark against SCIP at each phase -- **Fallback**: Maintain SCIP implementation as backup - -### Migration Complexity -- **Risk**: Difficult transition from SCIP -- **Mitigation**: Phased rollout with feature flags -- **Safety**: Comprehensive testing before production use - -## Success Metrics - -### Performance Targets -- Index build time: <5 seconds for 1000 files -- Query response time: <100ms for any operation -- Memory usage: <50MB for typical projects -- Token efficiency: 90% reduction in LLM context usage - -### Quality Targets -- Function detection accuracy: >95% vs SCIP -- Call chain completeness: >90% vs SCIP -- Incremental update correctness: 100% -- File watcher reliability: Zero missed changes - -## Implementation Timeline - -### Week 1-2: Foundation -- Core index structure and storage -- Basic JSON schema implementation -- Simple parser extraction from existing code - -### Week 3-4: Language Integration -- Tree-sitter parser simplification -- Multi-language symbol extraction -- Function call relationship building - -### Week 5-6: MCP Tools -- LLM-optimized tool implementation -- Performance optimization -- Query response formatting - -### Week 7-8: Integration and Testing -- File watcher integration -- Comprehensive testing -- Migration tooling - -### Week 9-10: Production Deployment -- Feature flag rollout -- Performance monitoring -- SCIP deprecation planning - -## Conclusion - -This replacement plan transforms the code-index-mcp from a complex SCIP-based system into a lean, LLM-optimized indexing solution. By focusing on the core use case of function tracing and rapid codebase understanding, we achieve significant performance improvements while maintaining all essential functionality. The simplified architecture reduces maintenance burden and enables faster iteration on LLM-specific features. \ No newline at end of file diff --git a/README.md b/README.md index e893f5b..f51ea87 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ The easiest way to get started with any MCP-compatible application: 2. **Restart your application** – `uvx` automatically handles installation and execution -3. **Start using**: +3. **Start using** (give these prompts to your AI assistant): ``` Set the project path to /Users/dev/my-react-app Find all TypeScript files in this project @@ -62,13 +62,16 @@ The easiest way to get started with any MCP-compatible application: ## Key Features ### 🔍 **Intelligent Search & Analysis** -- **SCIP-Powered**: Industry-standard code intelligence format used by major IDEs +- **Dual-Strategy Architecture**: Specialized tree-sitter parsing for 7 core languages, fallback strategy for 50+ file types +- **Direct Tree-sitter Integration**: No regex fallbacks for specialized languages - fail fast with clear errors - **Advanced Search**: Auto-detects and uses the best available tool (ugrep, ripgrep, ag, or grep) -- **Universal Understanding**: Single system comprehends all programming languages +- **Universal File Support**: Comprehensive coverage from advanced AST parsing to basic file indexing - **File Analysis**: Deep insights into structure, imports, classes, methods, and complexity metrics ### 🗂️ **Multi-Language Support** -- **50+ File Types**: Java, Python, JavaScript/TypeScript, C/C++, Go, Rust, C#, Swift, Kotlin, Ruby, PHP, and more +- **7 Languages with Tree-sitter AST Parsing**: Python, JavaScript, TypeScript, Java, Go, Objective-C, Zig +- **50+ File Types with Fallback Strategy**: C/C++, Rust, Ruby, PHP, and all other programming languages +- **Document & Config Files**: Markdown, JSON, YAML, XML with appropriate handling - **Web Frontend**: Vue, React, Svelte, HTML, CSS, SCSS - **Database**: SQL variants, NoSQL, stored procedures, migrations - **Configuration**: JSON, YAML, XML, Markdown @@ -81,36 +84,32 @@ The easiest way to get started with any MCP-compatible application: - **Rich Metadata**: Captures symbols, references, definitions, and relationships ### ⚡ **Performance & Efficiency** -- **SCIP Indexing**: Fast protobuf-based unified indexing system +- **Tree-sitter AST Parsing**: Native syntax parsing for accurate symbol extraction - **Persistent Caching**: Stores indexes for lightning-fast subsequent access - **Smart Filtering**: Intelligent exclusion of build directories and temporary files - **Memory Efficient**: Optimized for large codebases +- **Direct Dependencies**: No fallback mechanisms - fail fast with clear error messages ## Supported File Types
📁 Programming Languages (Click to expand) -**System & Low-Level:** -- C/C++ (`.c`, `.cpp`, `.h`, `.hpp`) -- Rust (`.rs`) -- Zig (`.zig`, `.zon`) -- Go (`.go`) - -**Object-Oriented:** -- Java (`.java`) -- C# (`.cs`) -- Kotlin (`.kt`) -- Scala (`.scala`) -- Objective-C/C++ (`.m`, `.mm`) -- Swift (`.swift`) - -**Scripting & Dynamic:** -- Python (`.py`) -- JavaScript/TypeScript (`.js`, `.ts`, `.jsx`, `.tsx`, `.mjs`, `.cjs`) -- Ruby (`.rb`) -- PHP (`.php`) -- Shell (`.sh`, `.bash`) +**Languages with Specialized Tree-sitter Strategies:** +- **Python** (`.py`, `.pyw`) - Full AST analysis with class/method extraction and call tracking +- **JavaScript** (`.js`, `.jsx`, `.mjs`, `.cjs`) - ES6+ class and function parsing with tree-sitter +- **TypeScript** (`.ts`, `.tsx`) - Complete type-aware symbol extraction with interfaces +- **Java** (`.java`) - Full class hierarchy, method signatures, and call relationships +- **Go** (`.go`) - Struct methods, receiver types, and function analysis +- **Objective-C** (`.m`, `.mm`) - Class/instance method distinction with +/- notation +- **Zig** (`.zig`, `.zon`) - Function and struct parsing with tree-sitter AST + +**All Other Programming Languages:** +All other programming languages use the **FallbackParsingStrategy** which provides basic file indexing and metadata extraction. This includes: +- **System & Low-Level:** C/C++ (`.c`, `.cpp`, `.h`, `.hpp`), Rust (`.rs`) +- **Object-Oriented:** C# (`.cs`), Kotlin (`.kt`), Scala (`.scala`), Swift (`.swift`) +- **Scripting & Dynamic:** Ruby (`.rb`), PHP (`.php`), Shell (`.sh`, `.bash`) +- **And 40+ more file types** - All handled through the fallback strategy for basic indexing
@@ -212,6 +211,7 @@ Then configure: + ## Available Tools ### 🏗️ **Project Management** diff --git a/README_ja.md b/README_ja.md index 2d33bde..76c419a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -44,7 +44,7 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす 2. **アプリケーションを再起動** – `uvx`がインストールと実行を自動処理 -3. **使用開始**: +3. **使用開始**(AIアシスタントにこれらのプロンプトを与える): ``` プロジェクトパスを/Users/dev/my-react-appに設定 このプロジェクトのすべてのTypeScriptファイルを検索 @@ -62,13 +62,16 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす ## 主な機能 ### 🔍 **インテリジェント検索・解析** -- **SCIPパワー**:主要IDEで使用される業界標準コードインテリジェンスフォーマット +- **二重戦略アーキテクチャ**:7つのコア言語に特化したTree-sitter解析、50+ファイルタイプにフォールバック戦略 +- **直接Tree-sitter統合**:特化言語で正規表現フォールバックなし - 明確なエラーメッセージで高速フェイル - **高度な検索**:最適なツール(ugrep、ripgrep、ag、grep)を自動検出・使用 -- **汎用理解**:単一システムですべてのプログラミング言語を理解 +- **汎用ファイルサポート**:高度なAST解析から基本ファイルインデックスまでの包括的カバレッジ - **ファイル解析**:構造、インポート、クラス、メソッド、複雑度メトリクスへの深い洞察 ### 🗂️ **多言語サポート** -- **50+ファイルタイプ**:Java、Python、JavaScript/TypeScript、C/C++、Go、Rust、C#、Swift、Kotlin、Ruby、PHPなど +- **7言語でTree-sitter AST解析**:Python、JavaScript、TypeScript、Java、Go、Objective-C、Zig +- **50+ファイルタイプでフォールバック戦略**:C/C++、Rust、Ruby、PHPおよびすべての他のプログラミング言語 +- **文書・設定ファイル**:Markdown、JSON、YAML、XML適切な処理 - **Webフロントエンド**:Vue、React、Svelte、HTML、CSS、SCSS - **データベース**:SQLバリアント、NoSQL、ストアドプロシージャ、マイグレーション - **設定ファイル**:JSON、YAML、XML、Markdown @@ -81,36 +84,32 @@ Code Index MCPは、AIモデルと複雑なコードベースの橋渡しをす - **豊富なメタデータ**:シンボル、参照、定義、関連性をキャプチャ ### ⚡ **パフォーマンス・効率性** -- **スマートインデックス作成**:ビルドディレクトリをインテリジェントにフィルタリングしながら再帰的スキャン +- **Tree-sitter AST解析**:正確なシンボル抽出のためのネイティブ構文解析 - **永続キャッシュ**:超高速な後続アクセスのためのインデックス保存 -- **遅延ロード**:最適化された起動のため必要時のみツール検出 -- **メモリ効率**:大規模コードベース向けのインテリジェントキャッシュ戦略 +- **スマートフィルタリング**:ビルドディレクトリと一時ファイルのインテリジェント除外 +- **メモリ効率**:大規模コードベース向けに最適化 +- **直接依存関係**:フォールバック機構なし - 明確なエラーメッセージで高速フェイル ## サポートされているファイルタイプ
📁 プログラミング言語(クリックで展開) -**システム・低レベル言語:** -- C/C++ (`.c`, `.cpp`, `.h`, `.hpp`) -- Rust (`.rs`) -- Zig (`.zig`) -- Go (`.go`) - -**オブジェクト指向言語:** -- Java (`.java`) -- C# (`.cs`) -- Kotlin (`.kt`) -- Scala (`.scala`) -- Objective-C/C++ (`.m`, `.mm`) -- Swift (`.swift`) - -**スクリプト・動的言語:** -- Python (`.py`) -- JavaScript/TypeScript (`.js`, `.ts`, `.jsx`, `.tsx`, `.mjs`, `.cjs`) -- Ruby (`.rb`) -- PHP (`.php`) -- Shell (`.sh`, `.bash`) +**特化Tree-sitter戦略言語:** +- **Python** (`.py`, `.pyw`) - クラス/メソッド抽出と呼び出し追跡を含む完全AST解析 +- **JavaScript** (`.js`, `.jsx`, `.mjs`, `.cjs`) - Tree-sitterを使用したES6+クラスと関数解析 +- **TypeScript** (`.ts`, `.tsx`) - インターフェースを含む完全な型認識シンボル抽出 +- **Java** (`.java`) - 完全なクラス階層、メソッドシグネチャ、呼び出し関係 +- **Go** (`.go`) - 構造体メソッド、レシーバータイプ、関数解析 +- **Objective-C** (`.m`, `.mm`) - +/-記法を使用したクラス/インスタンスメソッド区別 +- **Zig** (`.zig`, `.zon`) - Tree-sitter ASTを使用した関数と構造体解析 + +**すべての他のプログラミング言語:** +すべての他のプログラミング言語は**フォールバック解析戦略**を使用し、基本ファイルインデックスとメタデータ抽出を提供します。これには以下が含まれます: +- **システム・低レベル言語:** C/C++ (`.c`, `.cpp`, `.h`, `.hpp`)、Rust (`.rs`) +- **オブジェクト指向言語:** C# (`.cs`)、Kotlin (`.kt`)、Scala (`.scala`)、Swift (`.swift`) +- **スクリプト・動的言語:** Ruby (`.rb`)、PHP (`.php`)、Shell (`.sh`, `.bash`) +- **および40+ファイルタイプ** - すべてフォールバック戦略による基本インデックス処理
@@ -234,6 +233,7 @@ pip install code-index-mcp + ## 利用可能なツール ### 🏗️ **プロジェクト管理** diff --git a/README_zh.md b/README_zh.md index 1700e89..5a61fbb 100644 --- a/README_zh.md +++ b/README_zh.md @@ -44,7 +44,7 @@ 2. **重新啟動應用程式** – `uvx` 會自動處理安裝和執行 -3. **開始使用**: +3. **開始使用**(向您的 AI 助理提供這些提示): ``` 設定專案路徑為 /Users/dev/my-react-app 在這個專案中找到所有 TypeScript 檔案 @@ -62,13 +62,16 @@ ## 主要特性 ### 🔍 **智慧搜尋與分析** -- **SCIP 驅動**:業界標準程式碼智能格式,被主流 IDE 採用 +- **雙策略架構**:7 種核心語言使用專業化 Tree-sitter 解析,50+ 種檔案類型使用備用策略 +- **直接 Tree-sitter 整合**:專業化語言無正則表達式備用 - 快速失敗並提供清晰錯誤訊息 - **進階搜尋**:自動偵測並使用最佳工具(ugrep、ripgrep、ag 或 grep) -- **通用理解**:單一系統理解所有程式語言 +- **通用檔案支援**:從進階 AST 解析到基本檔案索引的全面覆蓋 - **檔案分析**:深入了解結構、匯入、類別、方法和複雜度指標 ### 🗂️ **多語言支援** -- **50+ 種檔案類型**:Java、Python、JavaScript/TypeScript、C/C++、Go、Rust、C#、Swift、Kotlin、Ruby、PHP 等 +- **7 種語言使用 Tree-sitter AST 解析**:Python、JavaScript、TypeScript、Java、Go、Objective-C、Zig +- **50+ 種檔案類型使用備用策略**:C/C++、Rust、Ruby、PHP 和所有其他程式語言 +- **文件與配置檔案**:Markdown、JSON、YAML、XML 適當處理 - **網頁前端**:Vue、React、Svelte、HTML、CSS、SCSS - **資料庫**:SQL 變體、NoSQL、存儲過程、遷移腳本 - **配置檔案**:JSON、YAML、XML、Markdown @@ -81,36 +84,32 @@ - **豐富元資料**:捕獲符號、引用、定義和關聯性 ### ⚡ **效能與效率** -- **智慧索引**:遞迴掃描並智慧篩選建構目錄 +- **Tree-sitter AST 解析**:原生語法解析以實現準確的符號提取 - **持久快取**:儲存索引以實現超快速的後續存取 -- **延遲載入**:僅在需要時偵測工具以優化啟動速度 -- **記憶體高效**:針對大型程式碼庫的智慧快取策略 +- **智慧篩選**:智能排除建構目錄和暫存檔案 +- **記憶體高效**:針對大型程式碼庫優化 +- **直接依賴**:無備用機制 - 快速失敗並提供清晰錯誤訊息 ## 支援的檔案類型
📁 程式語言(點擊展開) -**系統與低階語言:** -- C/C++ (`.c`, `.cpp`, `.h`, `.hpp`) -- Rust (`.rs`) -- Zig (`.zig`) -- Go (`.go`) - -**物件導向語言:** -- Java (`.java`) -- C# (`.cs`) -- Kotlin (`.kt`) -- Scala (`.scala`) -- Objective-C/C++ (`.m`, `.mm`) -- Swift (`.swift`) - -**腳本與動態語言:** -- Python (`.py`) -- JavaScript/TypeScript (`.js`, `.ts`, `.jsx`, `.tsx`, `.mjs`, `.cjs`) -- Ruby (`.rb`) -- PHP (`.php`) -- Shell (`.sh`, `.bash`) +**專業化 Tree-sitter 策略語言:** +- **Python** (`.py`, `.pyw`) - 完整 AST 分析,包含類別/方法提取和呼叫追蹤 +- **JavaScript** (`.js`, `.jsx`, `.mjs`, `.cjs`) - ES6+ 類別和函數解析使用 Tree-sitter +- **TypeScript** (`.ts`, `.tsx`) - 完整類型感知符號提取,包含介面 +- **Java** (`.java`) - 完整類別階層、方法簽名和呼叫關係 +- **Go** (`.go`) - 結構方法、接收者類型和函數分析 +- **Objective-C** (`.m`, `.mm`) - 類別/實例方法區分,使用 +/- 標記法 +- **Zig** (`.zig`, `.zon`) - 函數和結構解析使用 Tree-sitter AST + +**所有其他程式語言:** +所有其他程式語言使用 **備用解析策略**,提供基本檔案索引和元資料提取。包括: +- **系統與低階語言:** C/C++ (`.c`, `.cpp`, `.h`, `.hpp`)、Rust (`.rs`) +- **物件導向語言:** C# (`.cs`)、Kotlin (`.kt`)、Scala (`.scala`)、Swift (`.swift`) +- **腳本與動態語言:** Ruby (`.rb`)、PHP (`.php`)、Shell (`.sh`, `.bash`) +- **以及 40+ 種檔案類型** - 全部通過備用策略處理進行基本索引
@@ -234,6 +233,7 @@ pip install code-index-mcp + ## 可用工具 ### 🏗️ **專案管理** diff --git a/src/code_index_mcp/constants.py b/src/code_index_mcp/constants.py index 81b3d9b..d1d4235 100644 --- a/src/code_index_mcp/constants.py +++ b/src/code_index_mcp/constants.py @@ -74,3 +74,44 @@ '.liquibase', '.flyway', # Migration tools ] +# Centralized filtering configuration +FILTER_CONFIG = { + "exclude_directories": { + # Version control + '.git', '.svn', '.hg', '.bzr', + + # Package managers & dependencies + 'node_modules', '__pycache__', '.venv', 'venv', + 'vendor', 'bower_components', + + # Build outputs + 'dist', 'build', 'target', 'out', 'bin', 'obj', + + # IDE & editors + '.idea', '.vscode', '.vs', '.sublime-workspace', + + # Testing & coverage + '.pytest_cache', '.coverage', '.tox', '.nyc_output', + 'coverage', 'htmlcov', + + # OS artifacts + '.DS_Store', 'Thumbs.db', 'desktop.ini' + }, + + "exclude_files": { + # Temporary files + '*.tmp', '*.temp', '*.swp', '*.swo', + + # Backup files + '*.bak', '*~', '*.orig', + + # Log files + '*.log', + + # Lock files + 'package-lock.json', 'yarn.lock', 'Pipfile.lock' + }, + + "supported_extensions": SUPPORTED_EXTENSIONS +} + diff --git a/src/code_index_mcp/indexing/__init__.py b/src/code_index_mcp/indexing/__init__.py index 51259ee..512ad3f 100644 --- a/src/code_index_mcp/indexing/__init__.py +++ b/src/code_index_mcp/indexing/__init__.py @@ -11,8 +11,9 @@ ) # New JSON-based indexing system -from .json_index_builder import JSONIndexBuilder, SymbolInfo, FileInfo, IndexMetadata +from .json_index_builder import JSONIndexBuilder, IndexMetadata from .json_index_manager import JSONIndexManager, get_index_manager +from .models import SymbolInfo, FileInfo __all__ = [ 'generate_qualified_name', diff --git a/src/code_index_mcp/indexing/index_provider.py b/src/code_index_mcp/indexing/index_provider.py index a87ddcf..660bb8d 100644 --- a/src/code_index_mcp/indexing/index_provider.py +++ b/src/code_index_mcp/indexing/index_provider.py @@ -1,43 +1,18 @@ """ -索引提供者接口定义 +Index provider interface definitions. -定义所有索引访问的标准接口,确保不同实现的一致性。 +Defines standard interfaces for all index access, ensuring consistency across different implementations. """ from typing import List, Optional, Dict, Any, Protocol from dataclasses import dataclass - -@dataclass -class SymbolInfo: - """符号信息标准数据结构""" - name: str - kind: str # 'class', 'function', 'method', 'variable', etc. - location: Dict[str, int] # {'line': int, 'column': int} - scope: str - documentation: List[str] - - -# Define FileInfo here to avoid circular imports -@dataclass -class FileInfo: - """文件信息标准数据结构""" - relative_path: str - language: str - absolute_path: str - - def __hash__(self): - return hash(self.relative_path) - - def __eq__(self, other): - if isinstance(other, FileInfo): - return self.relative_path == other.relative_path - return False +from .models import SymbolInfo, FileInfo @dataclass class IndexMetadata: - """索引元数据标准结构""" + """Standard index metadata structure.""" version: str format_type: str created_at: float @@ -49,68 +24,68 @@ class IndexMetadata: class IIndexProvider(Protocol): """ - 索引提供者标准接口 + Standard index provider interface. - 所有索引实现都必须遵循这个接口,确保一致的访问方式。 + All index implementations must follow this interface to ensure consistent access patterns. """ def get_file_list(self) -> List[FileInfo]: """ - 获取所有索引文件列表 + Get list of all indexed files. Returns: - 文件信息列表 + List of file information objects """ ... def get_file_info(self, file_path: str) -> Optional[FileInfo]: """ - 获取特定文件信息 + Get information for a specific file. Args: - file_path: 文件相对路径 + file_path: Relative file path Returns: - 文件信息,如果文件不在索引中则返回None + File information, or None if file is not in index """ ... def query_symbols(self, file_path: str) -> List[SymbolInfo]: """ - 查询文件中的符号信息 + Query symbol information in a file. Args: - file_path: 文件相对路径 + file_path: Relative file path Returns: - 符号信息列表 + List of symbol information objects """ ... - def search_files(self, pattern: str) -> List[FileInfo]: + def search_files(self, pattern: str) -> List[str]: """ - 按模式搜索文件 + Search files by pattern. Args: - pattern: glob模式或正则表达式 + pattern: Glob pattern or regular expression Returns: - 匹配的文件列表 + List of matching file paths """ ... def get_metadata(self) -> IndexMetadata: """ - 获取索引元数据 + Get index metadata. Returns: - 索引元数据信息 + Index metadata information """ ... def is_available(self) -> bool: """ - 检查索引是否可用 + Check if index is available. Returns: True if index is available and functional @@ -120,31 +95,31 @@ def is_available(self) -> bool: class IIndexManager(Protocol): """ - 索引管理器接口 + Index manager interface. - 定义索引生命周期管理的标准接口。 + Defines standard interface for index lifecycle management. """ def initialize(self) -> bool: - """初始化索引管理器""" + """Initialize the index manager.""" ... def get_provider(self) -> Optional[IIndexProvider]: - """获取当前活跃的索引提供者""" + """Get the current active index provider.""" ... def refresh_index(self, force: bool = False) -> bool: - """刷新索引""" + """Refresh the index.""" ... def save_index(self) -> bool: - """保存索引状态""" + """Save index state.""" ... def clear_index(self) -> None: - """清理索引状态""" + """Clear index state.""" ... def get_index_status(self) -> Dict[str, Any]: - """获取索引状态信息""" + """Get index status information.""" ... diff --git a/src/code_index_mcp/indexing/json_index_builder.py b/src/code_index_mcp/indexing/json_index_builder.py index 8e4ddec..0f95c5b 100644 --- a/src/code_index_mcp/indexing/json_index_builder.py +++ b/src/code_index_mcp/indexing/json_index_builder.py @@ -14,7 +14,6 @@ from .strategies import StrategyFactory from .models import SymbolInfo, FileInfo -from ..constants import SUPPORTED_EXTENSIONS logger = logging.getLogger(__name__) @@ -43,10 +42,24 @@ class JSONIndexBuilder: 4. Assembling the final JSON index """ - def __init__(self, project_path: str): + def __init__(self, project_path: str, additional_excludes: Optional[List[str]] = None): + from ..utils import FileFilter + + # Input validation + if not isinstance(project_path, str): + raise ValueError(f"Project path must be a string, got {type(project_path)}") + + project_path = project_path.strip() + if not project_path: + raise ValueError("Project path cannot be empty") + + if not os.path.isdir(project_path): + raise ValueError(f"Project path does not exist: {project_path}") + self.project_path = project_path self.in_memory_index: Optional[Dict[str, Any]] = None self.strategy_factory = StrategyFactory() + self.file_filter = FileFilter(additional_excludes) logger.info(f"Initialized JSON index builder for {project_path}") strategy_info = self.strategy_factory.get_strategy_info() @@ -149,31 +162,24 @@ def clear_index(self): def _get_supported_files(self) -> List[str]: """ - Get all supported files in the project. + Get all supported files in the project using centralized filtering. Returns: List of file paths that can be parsed """ supported_files = [] - supported_extensions = set(SUPPORTED_EXTENSIONS) + base_path = Path(self.project_path) try: for root, dirs, files in os.walk(self.project_path): - # Skip hidden directories and common ignore patterns - dirs[:] = [d for d in dirs if not d.startswith('.') and d not in { - '__pycache__', 'node_modules', '.git', '.svn', '.hg', - '.vscode', '.idea', 'target', 'build', 'dist' - }] + # Filter directories in-place using centralized logic + dirs[:] = [d for d in dirs if not self.file_filter.should_exclude_directory(d)] + # Filter files using centralized logic for file in files: - if file.startswith('.'): - continue - - file_path = os.path.join(root, file) - ext = Path(file_path).suffix.lower() - - if ext in supported_extensions: - supported_files.append(file_path) + file_path = Path(root) / file + if self.file_filter.should_process_path(file_path, base_path): + supported_files.append(str(file_path)) except Exception as e: logger.error(f"Error scanning directory {self.project_path}: {e}") diff --git a/src/code_index_mcp/indexing/json_index_manager.py b/src/code_index_mcp/indexing/json_index_manager.py index d24eb03..d4564f3 100644 --- a/src/code_index_mcp/indexing/json_index_manager.py +++ b/src/code_index_mcp/indexing/json_index_manager.py @@ -35,6 +35,16 @@ def set_project_path(self, project_path: str) -> bool: """Set the project path and initialize index storage.""" with self._lock: try: + # Input validation + if not project_path or not isinstance(project_path, str): + logger.error(f"Invalid project path: {project_path}") + return False + + project_path = project_path.strip() + if not project_path: + logger.error("Project path cannot be empty") + return False + if not os.path.isdir(project_path): logger.error(f"Project path does not exist: {project_path}") return False @@ -114,6 +124,15 @@ def refresh_index(self) -> bool: def find_files(self, pattern: str = "*") -> List[str]: """Find files matching a pattern.""" with self._lock: + # Input validation + if not isinstance(pattern, str): + logger.error(f"Pattern must be a string, got {type(pattern)}") + return [] + + pattern = pattern.strip() + if not pattern: + pattern = "*" + if not self.index_builder or not self.index_builder.in_memory_index: logger.warning("Index not loaded") return [] @@ -133,12 +152,35 @@ def find_files(self, pattern: str = "*") -> List[str]: return [] def get_file_summary(self, file_path: str) -> Optional[Dict[str, Any]]: - """Get summary information for a file.""" + """ + Get summary information for a file. + + This method attempts to retrieve comprehensive file information including + symbol counts, functions, classes, methods, and imports. If the index + is not loaded, it will attempt auto-initialization to restore from the + most recent index state. + + Args: + file_path: Relative path to the file + + Returns: + Dictionary containing file summary information, or None if not found + """ with self._lock: - # Auto-initialize if not ready but project path can be inferred + # Input validation + if not isinstance(file_path, str): + logger.error(f"File path must be a string, got {type(file_path)}") + return None + + file_path = file_path.strip() + if not file_path: + logger.error("File path cannot be empty") + return None + + # Try to load cached index if not ready if not self.index_builder or not self.index_builder.in_memory_index: - if not self._auto_initialize_from_context(): - logger.warning("Index not loaded and cannot auto-initialize") + if not self._try_load_cached_index(): + logger.warning("Index not loaded and no cached index available") return None try: @@ -275,18 +317,22 @@ def _is_index_fresh(self) -> bool: return False try: + from ..utils import FileFilter + file_filter = FileFilter() + # Simple freshness check - index exists and is recent index_mtime = os.path.getmtime(self.index_path) + base_path = Path(self.project_path) # Check if any source files are newer than index for root, dirs, files in os.walk(self.project_path): - # Skip excluded directories - dirs[:] = [d for d in dirs if d not in {'.git', '__pycache__', 'node_modules', '.venv'}] + # Filter directories using centralized logic + dirs[:] = [d for d in dirs if not file_filter.should_exclude_directory(d)] for file in files: - if any(file.endswith(ext) for ext in ['.py', '.js', '.ts', '.java']): - file_path = os.path.join(root, file) - if os.path.getmtime(file_path) > index_mtime: + file_path = Path(root) / file + if file_filter.should_process_path(file_path, base_path): + if os.path.getmtime(str(file_path)) > index_mtime: return False return True @@ -295,45 +341,47 @@ def _is_index_fresh(self) -> bool: logger.warning(f"Error checking index freshness: {e}") return False - def _auto_initialize_from_context(self) -> bool: + def _try_load_cached_index(self, expected_project_path: Optional[str] = None) -> bool: """ - Auto-initialize from the most recent project context. - This handles the case where MCP tools run in separate processes. + Try to load a cached index file if available. + + This is a simplified version of auto-initialization that only loads + a cached index if we can verify it matches the expected project. + + Args: + expected_project_path: Optional path to verify against cached index + + Returns: + True if cached index was loaded successfully, False otherwise. """ try: - import glob - import tempfile - - # Find the most recent index file - pattern = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, "*", INDEX_FILE) - index_files = glob.glob(pattern) - - if not index_files: - logger.debug("No index files found for auto-initialization") - return False - - # Get the most recently modified index - latest_file = max(index_files, key=os.path.getmtime) - logger.info(f"Auto-initializing from latest index: {latest_file}") + # First try to load from current index_path if set + if self.index_path and os.path.exists(self.index_path): + return self.load_index() - # Extract project path from the index - with open(latest_file, 'r', encoding='utf-8') as f: - import json - index_data = json.load(f) - project_path = index_data.get('metadata', {}).get('project_path') + # If expected project path provided, try to find its cache + if expected_project_path: + project_hash = hashlib.md5(expected_project_path.encode()).hexdigest()[:12] + temp_dir = os.path.join(tempfile.gettempdir(), SETTINGS_DIR, project_hash) + index_path = os.path.join(temp_dir, INDEX_FILE) - if not project_path or not os.path.exists(project_path): - logger.warning(f"Invalid project path in index: {project_path}") - return False - - # Initialize with this project path - if self.set_project_path(project_path): - return self.load_index() + if os.path.exists(index_path): + # Verify the cached index matches the expected project + with open(index_path, 'r', encoding='utf-8') as f: + index_data = json.load(f) + cached_project = index_data.get('metadata', {}).get('project_path') + + if cached_project == expected_project_path: + self.temp_dir = temp_dir + self.index_path = index_path + return self.load_index() + else: + logger.warning(f"Cached index project mismatch: {cached_project} != {expected_project_path}") return False except Exception as e: - logger.warning(f"Auto-initialization failed: {e}") + logger.debug(f"Failed to load cached index: {e}") return False def cleanup(self): diff --git a/src/code_index_mcp/indexing/strategies/base_strategy.py b/src/code_index_mcp/indexing/strategies/base_strategy.py index 531478c..691dce0 100644 --- a/src/code_index_mcp/indexing/strategies/base_strategy.py +++ b/src/code_index_mcp/indexing/strategies/base_strategy.py @@ -5,87 +5,83 @@ import os from abc import ABC, abstractmethod from typing import Dict, List, Tuple, Optional -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo class ParsingStrategy(ABC): """Abstract base class for language parsing strategies.""" - + @abstractmethod def get_language_name(self) -> str: """Return the language name this strategy handles.""" - pass - + @abstractmethod def get_supported_extensions(self) -> List[str]: """Return list of file extensions this strategy supports.""" - pass - + @abstractmethod def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """ Parse file content and extract symbols. - + Args: file_path: Path to the file being parsed content: File content as string - + Returns: Tuple of (symbols_dict, file_info) - symbols_dict: Maps symbol_id -> SymbolInfo - file_info: FileInfo with metadata about the file """ - pass - + def _create_symbol_id(self, file_path: str, symbol_name: str) -> str: """ Create a unique symbol ID. - + Args: file_path: Path to the file containing the symbol symbol_name: Name of the symbol - + Returns: Unique symbol identifier in format "relative_path::symbol_name" """ relative_path = self._get_relative_path(file_path) return f"{relative_path}::{symbol_name}" - + def _get_relative_path(self, file_path: str) -> str: """Convert absolute file path to relative path.""" parts = file_path.replace('\\', '/').split('/') - + # Priority order: test > src (outermost project roots first) for root_dir in ['test', 'src']: if root_dir in parts: root_index = parts.index(root_dir) relative_parts = parts[root_index:] return '/'.join(relative_parts) - + # Fallback: use just filename return os.path.basename(file_path) - + def _extract_line_number(self, content: str, symbol_position: int) -> int: """ Extract line number from character position in content. - + Args: content: File content symbol_position: Character position in content - + Returns: Line number (1-based) """ return content[:symbol_position].count('\n') + 1 - + def _get_file_name(self, file_path: str) -> str: """Get just the filename from a full path.""" return os.path.basename(file_path) - + def _safe_extract_text(self, content: str, start: int, end: int) -> str: """Safely extract text from content, handling bounds.""" try: return content[start:end].strip() except (IndexError, TypeError): - return "" \ No newline at end of file + return "" diff --git a/src/code_index_mcp/indexing/strategies/fallback_strategy.py b/src/code_index_mcp/indexing/strategies/fallback_strategy.py index 01d7135..21653bd 100644 --- a/src/code_index_mcp/indexing/strategies/fallback_strategy.py +++ b/src/code_index_mcp/indexing/strategies/fallback_strategy.py @@ -5,26 +5,25 @@ import os from typing import Dict, List, Tuple from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo class FallbackParsingStrategy(ParsingStrategy): """Fallback parser for unsupported languages and file types.""" - + def __init__(self, language_name: str = "unknown"): self.language_name = language_name - + def get_language_name(self) -> str: return self.language_name - + def get_supported_extensions(self) -> List[str]: return [] # Fallback supports any extension - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """Basic parsing: extract file information without symbol parsing.""" symbols = {} - + # For document files, we can at least index their existence file_info = FileInfo( language=self.language_name, @@ -32,7 +31,7 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo symbols={"functions": [], "classes": []}, imports=[] ) - + # For document files (e.g. .md, .txt, .json), we can add a symbol representing the file itself if self.language_name in ['markdown', 'text', 'json', 'yaml', 'xml', 'config', 'css', 'html']: filename = os.path.basename(file_path) @@ -43,5 +42,5 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo line=1, signature=f"{self.language_name} file: {filename}" ) - - return symbols, file_info \ No newline at end of file + + return symbols, file_info diff --git a/src/code_index_mcp/indexing/strategies/go_strategy.py b/src/code_index_mcp/indexing/strategies/go_strategy.py index 2116ee7..b3a95cb 100644 --- a/src/code_index_mcp/indexing/strategies/go_strategy.py +++ b/src/code_index_mcp/indexing/strategies/go_strategy.py @@ -5,19 +5,18 @@ import re from typing import Dict, List, Tuple, Optional from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo class GoParsingStrategy(ParsingStrategy): """Go-specific parsing strategy using regex patterns.""" - + def get_language_name(self) -> str: return "go" - + def get_supported_extensions(self) -> List[str]: return ['.go'] - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """Parse Go file using regex patterns.""" symbols = {} @@ -25,22 +24,22 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo classes = [] # Go doesn't have classes, but we'll track structs/interfaces imports = [] package = None - + lines = content.splitlines() - + for i, line in enumerate(lines): line = line.strip() - + # Package declaration if line.startswith('package '): package = line.split('package ')[1].strip() - + # Import statements elif line.startswith('import '): import_match = re.search(r'import\s+"([^"]+)"', line) if import_match: imports.append(import_match.group(1)) - + # Function declarations elif line.startswith('func '): func_match = re.match(r'func\s+(\w+)\s*\(', line) @@ -54,7 +53,7 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo signature=line ) functions.append(func_name) - + # Method declarations (func (receiver) methodName) method_match = re.match(r'func\s+\([^)]+\)\s+(\w+)\s*\(', line) if method_match: @@ -67,7 +66,7 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo signature=line ) functions.append(method_name) - + # Struct declarations elif re.match(r'type\s+\w+\s+struct\s*\{', line): struct_match = re.match(r'type\s+(\w+)\s+struct', line) @@ -80,7 +79,7 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo line=i + 1 ) classes.append(struct_name) - + # Interface declarations elif re.match(r'type\s+\w+\s+interface\s*\{', line): interface_match = re.match(r'type\s+(\w+)\s+interface', line) @@ -93,10 +92,10 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo line=i + 1 ) classes.append(interface_name) - + # Phase 2: Add call relationship analysis self._analyze_go_calls(content, symbols, file_path) - + file_info = FileInfo( language=self.get_language_name(), line_count=len(lines), @@ -104,26 +103,31 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo imports=imports, package=package ) - + return symbols, file_info - + def _analyze_go_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): """Analyze Go function calls for relationships.""" lines = content.splitlines() current_function = None - + is_function_declaration_line = False + for i, line in enumerate(lines): original_line = line line = line.strip() - + # Track current function context if line.startswith('func '): func_name = self._extract_go_function_name(line) if func_name: current_function = self._create_symbol_id(file_path, func_name) - + is_function_declaration_line = True + else: + is_function_declaration_line = False + # Find function calls: functionName() or obj.methodName() - if current_function and ('(' in line and ')' in line): + # Skip the function declaration line itself to avoid false self-calls + if current_function and not is_function_declaration_line and ('(' in line and ')' in line): called_functions = self._extract_go_called_functions(line) for called_func in called_functions: # Find the called function in symbols and add relationship @@ -131,32 +135,30 @@ def _analyze_go_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_p if called_func in symbol_id.split("::")[-1]: if current_function not in symbol_info.called_by: symbol_info.called_by.append(current_function) - + def _extract_go_function_name(self, line: str) -> Optional[str]: """Extract function name from Go function declaration.""" try: # func functionName(...) or func (receiver) methodName(...) - import re match = re.match(r'func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(', line) if match: return match.group(1) except: pass return None - + def _extract_go_called_functions(self, line: str) -> List[str]: """Extract function names that are being called in this line.""" - import re called_functions = [] - + # Find patterns like: functionName( or obj.methodName( patterns = [ r'(\w+)\s*\(', # functionName( r'\.(\w+)\s*\(', # .methodName( ] - + for pattern in patterns: matches = re.findall(pattern, line) called_functions.extend(matches) - - return called_functions \ No newline at end of file + + return called_functions diff --git a/src/code_index_mcp/indexing/strategies/java_strategy.py b/src/code_index_mcp/indexing/strategies/java_strategy.py index b258862..b1c9845 100644 --- a/src/code_index_mcp/indexing/strategies/java_strategy.py +++ b/src/code_index_mcp/indexing/strategies/java_strategy.py @@ -1,47 +1,35 @@ """ -Java parsing strategy using tree-sitter with regex fallback. +Java parsing strategy using tree-sitter. """ import logging import re from typing import Dict, List, Tuple, Optional from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo logger = logging.getLogger(__name__) -try: - import tree_sitter - import tree_sitter_java - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - logger.warning("tree-sitter-java not available, using regex fallback") +import tree_sitter +from tree_sitter_java import language class JavaParsingStrategy(ParsingStrategy): """Java-specific parsing strategy.""" - + def __init__(self): - if TREE_SITTER_AVAILABLE: - self.java_language = tree_sitter.Language(tree_sitter_java.language()) - else: - self.java_language = None - + self.java_language = tree_sitter.Language(language()) + def get_language_name(self) -> str: return "java" - + def get_supported_extensions(self) -> List[str]: return ['.java'] - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse Java file using tree-sitter or regex fallback.""" - if TREE_SITTER_AVAILABLE and self.java_language: - return self._tree_sitter_parse(file_path, content) - else: - return self._regex_parse(file_path, content) - + """Parse Java file using tree-sitter.""" + return self._tree_sitter_parse(file_path, content) + def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """Parse using tree-sitter.""" symbols = {} @@ -49,16 +37,16 @@ def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, Sy classes = [] imports = [] package = None - + parser = tree_sitter.Parser(self.java_language) - + try: tree = parser.parse(content.encode('utf8')) # Phase 1: Extract symbol definitions self._traverse_java_node(tree.root_node, content, file_path, symbols, functions, classes, imports) # Phase 2: Analyze method calls and build relationships self._analyze_java_calls(tree, content, symbols, file_path) - + # Extract package info for node in tree.root_node.children: if node.type == 'package_declaration': @@ -66,7 +54,7 @@ def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, Sy break except Exception as e: logger.warning(f"Error parsing Java file {file_path}: {e}") - + file_info = FileInfo( language=self.get_language_name(), line_count=len(content.splitlines()), @@ -74,71 +62,11 @@ def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, Sy imports=imports, package=package ) - - return symbols, file_info - - def _regex_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse using regex patterns.""" - symbols = {} - functions = [] - classes = [] - imports = [] - package = None - - lines = content.splitlines() - - for i, line in enumerate(lines): - line = line.strip() - - # Package declaration - if line.startswith('package '): - package = line.split('package ')[1].split(';')[0].strip() - - # Import statements - elif line.startswith('import '): - import_name = line.split('import ')[1].split(';')[0].strip() - imports.append(import_name) - - # Class declarations - elif re.match(r'(public\s+|private\s+|protected\s+)?(class|interface|enum)\s+\w+', line): - class_match = re.search(r'(class|interface|enum)\s+(\w+)', line) - if class_match: - class_name = class_match.group(2) - symbol_id = self._create_symbol_id(file_path, class_name) - symbols[symbol_id] = SymbolInfo( - type=class_match.group(1), # class, interface, or enum - file=file_path, - line=i + 1 - ) - classes.append(class_name) - - # Method declarations - elif re.match(r'\s*(public|private|protected).*\s+\w+\s*\(.*\)\s*\{?', line): - method_match = re.search(r'\s+(\w+)\s*\(', line) - if method_match: - method_name = method_match.group(1) - # Skip keywords like 'if', 'for', etc. - if method_name not in ['if', 'for', 'while', 'switch', 'try', 'catch']: - symbol_id = self._create_symbol_id(file_path, method_name) - symbols[symbol_id] = SymbolInfo( - type="method", - file=file_path, - line=i + 1, - signature=line.strip() - ) - functions.append(method_name) - - file_info = FileInfo( - language=self.get_language_name(), - line_count=len(lines), - symbols={"functions": functions, "classes": classes}, - imports=imports, - package=package - ) - + return symbols, file_info - - def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + + + def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], functions: List[str], classes: List[str], imports: List[str]): """Traverse Java AST node.""" if node.type == 'class_declaration': @@ -151,7 +79,7 @@ def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[ line=node.start_point[0] + 1 ) classes.append(name) - + elif node.type == 'method_declaration': name = self._get_java_method_name(node, content) if name: @@ -163,43 +91,43 @@ def _traverse_java_node(self, node, content: str, file_path: str, symbols: Dict[ signature=self._get_java_method_signature(node, content) ) functions.append(name) - + # Continue traversing children for child in node.children: self._traverse_java_node(child, content, file_path, symbols, functions, classes, imports) - + def _get_java_class_name(self, node, content: str) -> Optional[str]: for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None - + def _get_java_method_name(self, node, content: str) -> Optional[str]: for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None - + def _get_java_method_signature(self, node, content: str) -> str: return content[node.start_byte:node.end_byte].split('\n')[0].strip() - + def _extract_java_package(self, node, content: str) -> Optional[str]: for child in node.children: if child.type == 'scoped_identifier': return content[child.start_byte:child.end_byte] return None - + def _analyze_java_calls(self, tree, content: str, symbols: Dict[str, SymbolInfo], file_path: str): """Analyze Java method calls for relationships.""" self._find_java_calls(tree.root_node, content, symbols, file_path) - + def _find_java_calls(self, node, content: str, symbols: Dict[str, SymbolInfo], file_path: str, current_method: str = None): """Recursively find Java method calls.""" if node.type == 'method_declaration': method_name = self._get_java_method_name(node, content) if method_name: current_method = self._create_symbol_id(file_path, method_name) - + elif node.type == 'method_invocation': if current_method: called_method = self._get_called_method_name(node, content) @@ -209,14 +137,14 @@ def _find_java_calls(self, node, content: str, symbols: Dict[str, SymbolInfo], f if called_method in symbol_id.split("::")[-1]: if current_method not in symbol_info.called_by: symbol_info.called_by.append(current_method) - + # Continue traversing children for child in node.children: self._find_java_calls(child, content, symbols, file_path, current_method) - + def _get_called_method_name(self, node, content: str) -> Optional[str]: """Extract called method name from method invocation node.""" for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] - return None \ No newline at end of file + return None diff --git a/src/code_index_mcp/indexing/strategies/javascript_strategy.py b/src/code_index_mcp/indexing/strategies/javascript_strategy.py index dcdc970..63c78f7 100644 --- a/src/code_index_mcp/indexing/strategies/javascript_strategy.py +++ b/src/code_index_mcp/indexing/strategies/javascript_strategy.py @@ -4,36 +4,26 @@ import logging from typing import Dict, List, Tuple, Optional +import tree_sitter +from tree_sitter_javascript import language from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo logger = logging.getLogger(__name__) -try: - import tree_sitter - import tree_sitter_javascript - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - logger.warning("tree-sitter not available, JavaScript parsing will be limited") - class JavaScriptParsingStrategy(ParsingStrategy): """JavaScript-specific parsing strategy using tree-sitter.""" - + def __init__(self): - if TREE_SITTER_AVAILABLE: - self.js_language = tree_sitter.Language(tree_sitter_javascript.language()) - else: - self.js_language = None - + self.js_language = tree_sitter.Language(language()) + def get_language_name(self) -> str: return "javascript" - + def get_supported_extensions(self) -> List[str]: return ['.js', '.jsx', '.mjs', '.cjs'] - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """Parse JavaScript file using tree-sitter.""" symbols = {} @@ -41,19 +31,11 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo classes = [] imports = [] exports = [] - - if not TREE_SITTER_AVAILABLE or not self.js_language: - logger.info(f"Tree-sitter not available, using fallback for {file_path}") - return self._fallback_parse(file_path, content) - - try: - parser = tree_sitter.Parser(self.js_language) - tree = parser.parse(content.encode('utf8')) - self._traverse_js_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) - except Exception as e: - logger.warning(f"Error parsing JavaScript file {file_path}: {e}, falling back to regex parsing") - return self._fallback_parse(file_path, content) - + + parser = tree_sitter.Parser(self.js_language) + tree = parser.parse(content.encode('utf8')) + self._traverse_js_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) + file_info = FileInfo( language=self.get_language_name(), line_count=len(content.splitlines()), @@ -61,10 +43,10 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo imports=imports, exports=exports ) - + return symbols, file_info - - def _traverse_js_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + + def _traverse_js_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], functions: List[str], classes: List[str], imports: List[str], exports: List[str]): """Traverse JavaScript AST node.""" if node.type == 'function_declaration': @@ -79,7 +61,33 @@ def _traverse_js_node(self, node, content: str, file_path: str, symbols: Dict[st signature=signature ) functions.append(name) - + + # Handle arrow functions and function expressions in lexical declarations (const/let) + elif node.type in ['lexical_declaration', 'variable_declaration']: + # Look for const/let/var name = arrow_function or function_expression + for child in node.children: + if child.type == 'variable_declarator': + name_node = None + value_node = None + for declarator_child in child.children: + if declarator_child.type == 'identifier': + name_node = declarator_child + elif declarator_child.type in ['arrow_function', 'function_expression', 'function']: + value_node = declarator_child + + if name_node and value_node: + name = content[name_node.start_byte:name_node.end_byte] + symbol_id = self._create_symbol_id(file_path, name) + # Create signature from the declaration + signature = content[child.start_byte:child.end_byte].split('\n')[0].strip() + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=child.start_point[0] + 1, # Use child position, not parent + signature=signature + ) + functions.append(name) + elif node.type == 'class_declaration': name = self._get_class_name(node, content) if name: @@ -90,12 +98,13 @@ def _traverse_js_node(self, node, content: str, file_path: str, symbols: Dict[st line=node.start_point[0] + 1 ) classes.append(name) - + elif node.type == 'method_definition': method_name = self._get_method_name(node, content) class_name = self._find_parent_class(node, content) if method_name and class_name: - symbol_id = self._create_symbol_id(file_path, f"{class_name}.{method_name}") + full_name = f"{class_name}.{method_name}" + symbol_id = self._create_symbol_id(file_path, full_name) signature = self._get_js_function_signature(node, content) symbols[symbol_id] = SymbolInfo( type="method", @@ -103,32 +112,34 @@ def _traverse_js_node(self, node, content: str, file_path: str, symbols: Dict[st line=node.start_point[0] + 1, signature=signature ) - + # Add method to functions list for consistency + functions.append(full_name) + # Continue traversing children for child in node.children: self._traverse_js_node(child, content, file_path, symbols, functions, classes, imports, exports) - + def _get_function_name(self, node, content: str) -> Optional[str]: """Extract function name from tree-sitter node.""" for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None - + def _get_class_name(self, node, content: str) -> Optional[str]: """Extract class name from tree-sitter node.""" for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None - + def _get_method_name(self, node, content: str) -> Optional[str]: """Extract method name from tree-sitter node.""" for child in node.children: if child.type == 'property_identifier': return content[child.start_byte:child.end_byte] return None - + def _find_parent_class(self, node, content: str) -> Optional[str]: """Find the parent class of a method.""" parent = node.parent @@ -137,217 +148,7 @@ def _find_parent_class(self, node, content: str) -> Optional[str]: return self._get_class_name(parent, content) parent = parent.parent return None - + def _get_js_function_signature(self, node, content: str) -> str: """Extract JavaScript function signature.""" return content[node.start_byte:node.end_byte].split('\n')[0].strip() - - def _fallback_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Enhanced fallback parsing when tree-sitter is not available.""" - symbols = {} - functions = [] - classes = [] - imports = [] - - # Phase 1: Extract symbols using enhanced regex-based parsing - lines = content.splitlines() - current_class = None - - for i, line in enumerate(lines): - original_line = line - line = line.strip() - - # Import/require statements - if line.startswith('const ') and 'require(' in line: - import_name = self._extract_js_require(line) - if import_name: - imports.append(import_name) - elif line.startswith('import ') and ' from ' in line: - import_name = self._extract_js_import(line) - if import_name: - imports.append(import_name) - - # Class declarations - elif line.startswith('class '): - class_name = self._extract_js_class_name(line) - if class_name: - current_class = class_name - symbol_id = self._create_symbol_id(file_path, class_name) - symbols[symbol_id] = SymbolInfo( - type="class", - file=file_path, - line=i + 1 - ) - classes.append(class_name) - - # Function declarations (standalone) - elif line.startswith('function '): - func_name = self._extract_js_function_name(line) - if func_name: - symbol_id = self._create_symbol_id(file_path, func_name) - symbols[symbol_id] = SymbolInfo( - type="function", - file=file_path, - line=i + 1, - signature=line - ) - functions.append(func_name) - - # Method declarations (inside classes) - async method() { or method() { - elif current_class and (line.endswith('{') or '{' in line) and '(' in line and ')' in line: - method_name = self._extract_js_method_name(line) - if method_name and not line.startswith('//') and 'function' not in line: - symbol_id = self._create_symbol_id(file_path, f"{current_class}.{method_name}") - symbols[symbol_id] = SymbolInfo( - type="method", - file=file_path, - line=i + 1, - signature=line.replace('{', '').strip() - ) - functions.append(method_name) # Add to functions list for summary - - # Reset class context on closing brace (simplified) - elif line == '}' and current_class: - # Very basic heuristic - this could be improved - if original_line.strip() == '}' and i < len(lines) - 1: - current_class = None - - # Phase 2: Add call relationship analysis - self._analyze_js_calls(content, symbols, file_path) - - file_info = FileInfo( - language=self.get_language_name(), - line_count=len(lines), - symbols={"functions": functions, "classes": classes}, - imports=imports - ) - - return symbols, file_info - - def _extract_js_function_name(self, line: str) -> Optional[str]: - """Extract function name from JavaScript function declaration.""" - try: - # function functionName(...) or function functionName(...) - parts = line.split('(')[0].split() - if len(parts) >= 2 and parts[0] == 'function': - return parts[1] - except: - pass - return None - - def _extract_js_class_name(self, line: str) -> Optional[str]: - """Extract class name from JavaScript class declaration.""" - try: - # class ClassName { or class ClassName extends ... - parts = line.split() - if len(parts) >= 2 and parts[0] == 'class': - class_name = parts[1] - # Remove any trailing characters like { or extends - if '{' in class_name: - class_name = class_name.split('{')[0] - if 'extends' in class_name: - class_name = class_name.split('extends')[0] - return class_name.strip() - except: - pass - return None - - def _extract_js_method_name(self, line: str) -> Optional[str]: - """Extract method name from JavaScript method declaration.""" - try: - # async methodName(params) { or methodName(params) { - line = line.strip() - if line.startswith('async '): - line = line[6:].strip() - - if '(' in line: - method_name = line.split('(')[0].strip() - # Remove access modifiers and keywords - for modifier in ['static', 'get', 'set']: - if method_name.startswith(modifier + ' '): - method_name = method_name[len(modifier):].strip() - - return method_name if method_name and method_name.replace('_', '').isalnum() else None - except: - pass - return None - - def _extract_js_require(self, line: str) -> Optional[str]: - """Extract module name from require statement.""" - try: - # const something = require('module') or require('module') - if 'require(' in line: - start = line.find("require('") + 9 - if start == 8: # require(" format - start = line.find('require("') + 9 - if start > 8: - end = line.find("'", start) - if end == -1: - end = line.find('"', start) - if end > start: - return line[start:end] - except: - pass - return None - - def _extract_js_import(self, line: str) -> Optional[str]: - """Extract module name from ES6 import statement.""" - try: - # import { something } from 'module' or import something from 'module' - if ' from ' in line: - module_part = line.split(' from ')[-1].strip() - module_name = module_part.strip('\'"').replace("'", "").replace('"', '').replace(';', '') - return module_name - except: - pass - return None - - def _analyze_js_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): - """Analyze JavaScript function calls for relationships.""" - lines = content.splitlines() - current_function = None - - for i, line in enumerate(lines): - original_line = line - line = line.strip() - - # Track current function context - if 'function ' in line or (line.endswith('{') and '(' in line and ')' in line): - func_name = self._extract_function_from_line(line) - if func_name: - current_function = self._create_symbol_id(file_path, func_name) - - # Find function calls: functionName() or obj.methodName() - if current_function and ('(' in line and ')' in line): - called_functions = self._extract_called_functions(line) - for called_func in called_functions: - # Find the called function in symbols and add relationship - for symbol_id, symbol_info in symbols.items(): - if called_func in symbol_id.split("::")[-1]: - if current_function not in symbol_info.called_by: - symbol_info.called_by.append(current_function) - - def _extract_function_from_line(self, line: str) -> Optional[str]: - """Extract function name from a line that defines a function.""" - if 'function ' in line: - return self._extract_js_function_name(line) - elif line.endswith('{') and '(' in line: - return self._extract_js_method_name(line) - return None - - def _extract_called_functions(self, line: str) -> List[str]: - """Extract function names that are being called in this line.""" - import re - called_functions = [] - - # Find patterns like: functionName( or obj.methodName( - patterns = [ - r'(\w+)\s*\(', # functionName( - r'\.(\w+)\s*\(', # .methodName( - ] - - for pattern in patterns: - matches = re.findall(pattern, line) - called_functions.extend(matches) - - return called_functions \ No newline at end of file diff --git a/src/code_index_mcp/indexing/strategies/objective_c_strategy.py b/src/code_index_mcp/indexing/strategies/objective_c_strategy.py index c3e4a64..4226f1c 100644 --- a/src/code_index_mcp/indexing/strategies/objective_c_strategy.py +++ b/src/code_index_mcp/indexing/strategies/objective_c_strategy.py @@ -5,38 +5,37 @@ import re from typing import Dict, List, Tuple, Optional from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo class ObjectiveCParsingStrategy(ParsingStrategy): """Objective-C parsing strategy using regex patterns.""" - + def get_language_name(self) -> str: return "objective-c" - + def get_supported_extensions(self) -> List[str]: return ['.m', '.mm'] - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """Parse Objective-C file using regex patterns.""" symbols = {} functions = [] classes = [] imports = [] - + lines = content.splitlines() current_class = None - + for i, line in enumerate(lines): line = line.strip() - + # Import statements if line.startswith('#import ') or line.startswith('#include '): import_match = re.search(r'#(?:import|include)\s+[<"]([^>"]+)[>"]', line) if import_match: imports.append(import_match.group(1)) - + # Interface declarations elif line.startswith('@interface '): interface_match = re.match(r'@interface\s+(\w+)', line) @@ -50,13 +49,13 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo line=i + 1 ) classes.append(class_name) - + # Implementation declarations elif line.startswith('@implementation '): impl_match = re.match(r'@implementation\s+(\w+)', line) if impl_match: current_class = impl_match.group(1) - + # Method declarations elif line.startswith(('- (', '+ (')): method_match = re.search(r'[+-]\s*\([^)]+\)\s*(\w+)', line) @@ -71,7 +70,7 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo signature=line ) functions.append(full_name) - + # C function declarations elif re.match(r'\w+.*\s+\w+\s*\([^)]*\)\s*\{?', line) and not line.startswith(('if', 'for', 'while')): func_match = re.search(r'\s(\w+)\s*\([^)]*\)', line) @@ -85,38 +84,38 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo signature=line ) functions.append(func_name) - + # End of class elif line == '@end': current_class = None - + # Phase 2: Add call relationship analysis self._analyze_objc_calls(content, symbols, file_path) - + file_info = FileInfo( language=self.get_language_name(), line_count=len(lines), symbols={"functions": functions, "classes": classes}, imports=imports ) - + return symbols, file_info - + def _analyze_objc_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): """Analyze Objective-C method calls for relationships.""" lines = content.splitlines() current_function = None - + for i, line in enumerate(lines): original_line = line line = line.strip() - + # Track current method context if line.startswith('- (') or line.startswith('+ ('): func_name = self._extract_objc_method_name(line) if func_name: current_function = self._create_symbol_id(file_path, func_name) - + # Find method calls: [obj methodName] or functionName() if current_function and ('[' in line and ']' in line or ('(' in line and ')' in line)): called_functions = self._extract_objc_called_functions(line) @@ -126,32 +125,30 @@ def _analyze_objc_calls(self, content: str, symbols: Dict[str, SymbolInfo], file if called_func in symbol_id.split("::")[-1]: if current_function not in symbol_info.called_by: symbol_info.called_by.append(current_function) - + def _extract_objc_method_name(self, line: str) -> Optional[str]: """Extract method name from Objective-C method declaration.""" try: # - (returnType)methodName:(params) or + (returnType)methodName - import re match = re.search(r'[+-]\s*\([^)]*\)\s*(\w+)', line) if match: return match.group(1) except: pass return None - + def _extract_objc_called_functions(self, line: str) -> List[str]: """Extract method names that are being called in this line.""" - import re called_functions = [] - + # Find patterns like: [obj methodName] or functionName( patterns = [ r'\[\s*\w+\s+(\w+)\s*[\]:]', # [obj methodName] r'(\w+)\s*\(', # functionName( ] - + for pattern in patterns: matches = re.findall(pattern, line) called_functions.extend(matches) - - return called_functions \ No newline at end of file + + return called_functions diff --git a/src/code_index_mcp/indexing/strategies/python_strategy.py b/src/code_index_mcp/indexing/strategies/python_strategy.py index 89062bd..2cf62cd 100644 --- a/src/code_index_mcp/indexing/strategies/python_strategy.py +++ b/src/code_index_mcp/indexing/strategies/python_strategy.py @@ -6,8 +6,7 @@ import logging from typing import Dict, List, Tuple, Optional from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo logger = logging.getLogger(__name__) @@ -51,11 +50,22 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo def _visit_ast_node(self, node: ast.AST, symbols: Dict, functions: List, classes: List, imports: List, file_path: str, content: str): """Visit AST nodes and extract symbols.""" + # Track processed nodes to avoid duplicates + processed_nodes = set() + + # First pass: handle classes and mark their methods as processed for child in ast.walk(node): - if isinstance(child, ast.FunctionDef): + if isinstance(child, ast.ClassDef): + self._handle_class(child, symbols, classes, file_path, functions) + # Mark all methods in this class as processed + for class_child in child.body: + if isinstance(class_child, ast.FunctionDef): + processed_nodes.add(id(class_child)) + + # Second pass: handle standalone functions and imports + for child in ast.walk(node): + if isinstance(child, ast.FunctionDef) and id(child) not in processed_nodes: self._handle_function(child, symbols, functions, file_path) - elif isinstance(child, ast.ClassDef): - self._handle_class(child, symbols, classes, file_path) elif isinstance(child, (ast.Import, ast.ImportFrom)): self._handle_import(child, imports) @@ -79,7 +89,7 @@ def _handle_function(self, node: ast.FunctionDef, symbols: Dict, functions: List ) functions.append(func_name) - def _handle_class(self, node: ast.ClassDef, symbols: Dict, classes: List, file_path: str): + def _handle_class(self, node: ast.ClassDef, symbols: Dict, classes: List, file_path: str, functions: List = None): """Handle class definition.""" class_name = node.name symbol_id = self._create_symbol_id(file_path, class_name) @@ -111,6 +121,10 @@ def _handle_class(self, node: ast.ClassDef, symbols: Dict, classes: List, file_p signature=method_signature, docstring=method_docstring ) + + # Add method to functions list if provided + if functions is not None: + functions.append(method_name) def _handle_import(self, node, imports: List): """Handle import statements.""" @@ -155,12 +169,25 @@ def __init__(self, symbols: Dict[str, SymbolInfo], file_path: str): self.symbols = symbols self.file_path = file_path self.current_function_stack = [] + self.current_class = None + + def visit_ClassDef(self, node: ast.ClassDef): + """Visit class definition and track context.""" + self.current_class = node.name + self.generic_visit(node) + self.current_class = None def visit_FunctionDef(self, node: ast.FunctionDef): """Visit function definition and track context.""" - # Create symbol ID for this function using relative path - relative_path = self._get_relative_path(self.file_path) - function_id = f"{relative_path}::{node.name}" + # File path is already relative after our fix + relative_path = self.file_path + + # Handle methods within classes + if self.current_class: + function_id = f"{relative_path}::{self.current_class}.{node.name}" + else: + function_id = f"{relative_path}::{node.name}" + self.current_function_stack.append(function_id) # Visit all child nodes within this function @@ -188,12 +215,17 @@ def visit_Call(self, node: ast.Call): # Look for the called function in our symbols and add relationship for symbol_id, symbol_info in self.symbols.items(): - if (symbol_info.type in ["function", "method"] and - called_function in symbol_id.split("::")[-1]): # Match function name part - # Add caller to the called function's called_by list - if caller_function not in symbol_info.called_by: - symbol_info.called_by.append(caller_function) - break + if symbol_info.type in ["function", "method"]: + # Extract just the function/method name from the symbol ID + symbol_name = symbol_id.split("::")[-1] + + # Check for exact match or method name match (ClassName.method) + if (symbol_name == called_function or + symbol_name.endswith(f".{called_function}")): + # Add caller to the called function's called_by list + if caller_function not in symbol_info.called_by: + symbol_info.called_by.append(caller_function) + break except Exception: # Silently handle parsing errors for complex call patterns pass diff --git a/src/code_index_mcp/indexing/strategies/strategy_factory.py b/src/code_index_mcp/indexing/strategies/strategy_factory.py index 4564138..c7116d9 100644 --- a/src/code_index_mcp/indexing/strategies/strategy_factory.py +++ b/src/code_index_mcp/indexing/strategies/strategy_factory.py @@ -2,6 +2,7 @@ Strategy factory for creating appropriate parsing strategies. """ +import threading from typing import Dict, List from .base_strategy import ParsingStrategy from .python_strategy import PythonParsingStrategy @@ -16,12 +17,14 @@ class StrategyFactory: """Factory for creating appropriate parsing strategies.""" - + def __init__(self): - # Initialize all strategies + # Initialize all strategies with thread safety self._strategies: Dict[str, ParsingStrategy] = {} + self._initialized = False + self._lock = threading.RLock() self._initialize_strategies() - + # File type mappings for fallback parser self._file_type_mappings = { # Web and markup @@ -32,17 +35,17 @@ def __init__(self): '.json': 'json', '.jsonc': 'json', '.xml': 'xml', '.yml': 'yaml', '.yaml': 'yaml', - + # Frontend frameworks '.vue': 'vue', '.svelte': 'svelte', '.astro': 'astro', - + # Template engines '.hbs': 'handlebars', '.handlebars': 'handlebars', '.ejs': 'ejs', '.pug': 'pug', - + # Database and SQL '.sql': 'sql', '.ddl': 'sql', '.dml': 'sql', '.mysql': 'sql', '.postgresql': 'sql', '.psql': 'sql', @@ -56,7 +59,7 @@ def __init__(self): '.cql': 'sql', '.cypher': 'sql', '.sparql': 'sql', '.gql': 'graphql', '.liquibase': 'sql', '.flyway': 'sql', - + # Config and text files '.txt': 'text', '.ini': 'config', '.cfg': 'config', '.conf': 'config', @@ -66,7 +69,7 @@ def __init__(self): '.gitignore': 'config', '.dockerignore': 'config', '.editorconfig': 'config', - + # Other programming languages (will use fallback) '.c': 'c', '.cpp': 'cpp', '.h': 'h', '.hpp': 'hpp', '.cxx': 'cpp', '.cc': 'cpp', '.hxx': 'hpp', '.hh': 'hpp', @@ -90,91 +93,109 @@ def __init__(self): '.clj': 'clojure', '.cljs': 'clojure', '.vim': 'vim', } - + def _initialize_strategies(self): - """Initialize all parsing strategies.""" - # Python - python_strategy = PythonParsingStrategy() - for ext in python_strategy.get_supported_extensions(): - self._strategies[ext] = python_strategy - - # JavaScript - js_strategy = JavaScriptParsingStrategy() - for ext in js_strategy.get_supported_extensions(): - self._strategies[ext] = js_strategy - - # TypeScript - ts_strategy = TypeScriptParsingStrategy() - for ext in ts_strategy.get_supported_extensions(): - self._strategies[ext] = ts_strategy - - # Java - java_strategy = JavaParsingStrategy() - for ext in java_strategy.get_supported_extensions(): - self._strategies[ext] = java_strategy - - # Go - go_strategy = GoParsingStrategy() - for ext in go_strategy.get_supported_extensions(): - self._strategies[ext] = go_strategy - - # Objective-C - objc_strategy = ObjectiveCParsingStrategy() - for ext in objc_strategy.get_supported_extensions(): - self._strategies[ext] = objc_strategy - - # Zig - zig_strategy = ZigParsingStrategy() - for ext in zig_strategy.get_supported_extensions(): - self._strategies[ext] = zig_strategy - + """Initialize all parsing strategies with thread safety.""" + with self._lock: + if self._initialized: + return + + try: + # Python + python_strategy = PythonParsingStrategy() + for ext in python_strategy.get_supported_extensions(): + self._strategies[ext] = python_strategy + + # JavaScript + js_strategy = JavaScriptParsingStrategy() + for ext in js_strategy.get_supported_extensions(): + self._strategies[ext] = js_strategy + + # TypeScript + ts_strategy = TypeScriptParsingStrategy() + for ext in ts_strategy.get_supported_extensions(): + self._strategies[ext] = ts_strategy + + # Java + java_strategy = JavaParsingStrategy() + for ext in java_strategy.get_supported_extensions(): + self._strategies[ext] = java_strategy + + # Go + go_strategy = GoParsingStrategy() + for ext in go_strategy.get_supported_extensions(): + self._strategies[ext] = go_strategy + + # Objective-C + objc_strategy = ObjectiveCParsingStrategy() + for ext in objc_strategy.get_supported_extensions(): + self._strategies[ext] = objc_strategy + + # Zig + zig_strategy = ZigParsingStrategy() + for ext in zig_strategy.get_supported_extensions(): + self._strategies[ext] = zig_strategy + + self._initialized = True + + except Exception as e: + # Reset state on failure to allow retry + self._strategies.clear() + self._initialized = False + raise e + def get_strategy(self, file_extension: str) -> ParsingStrategy: """ Get appropriate strategy for file extension. - + Args: file_extension: File extension (e.g., '.py', '.js') - + Returns: Appropriate parsing strategy """ - # Check for specialized strategies first - if file_extension in self._strategies: - return self._strategies[file_extension] - - # Use fallback strategy with appropriate language name - language_name = self._file_type_mappings.get(file_extension, 'unknown') - return FallbackParsingStrategy(language_name) - + with self._lock: + # Ensure initialization is complete + if not self._initialized: + self._initialize_strategies() + + # Check for specialized strategies first + if file_extension in self._strategies: + return self._strategies[file_extension] + + # Use fallback strategy with appropriate language name + language_name = self._file_type_mappings.get(file_extension, 'unknown') + return FallbackParsingStrategy(language_name) + def get_all_supported_extensions(self) -> List[str]: """Get all supported extensions across strategies.""" specialized = list(self._strategies.keys()) fallback = list(self._file_type_mappings.keys()) return specialized + fallback - + def get_specialized_extensions(self) -> List[str]: """Get extensions that have specialized parsers.""" return list(self._strategies.keys()) - + def get_fallback_extensions(self) -> List[str]: """Get extensions that use fallback parsing.""" return list(self._file_type_mappings.keys()) - + def get_strategy_info(self) -> Dict[str, List[str]]: """Get information about available strategies.""" info = {} - + # Group extensions by strategy type for ext, strategy in self._strategies.items(): strategy_name = strategy.get_language_name() if strategy_name not in info: info[strategy_name] = [] info[strategy_name].append(ext) - + # Add fallback info fallback_languages = set(self._file_type_mappings.values()) for lang in fallback_languages: extensions = [ext for ext, mapped_lang in self._file_type_mappings.items() if mapped_lang == lang] info[f"fallback_{lang}"] = extensions - - return info \ No newline at end of file + + return info diff --git a/src/code_index_mcp/indexing/strategies/typescript_strategy.py b/src/code_index_mcp/indexing/strategies/typescript_strategy.py index 43be6f6..efd2ec9 100644 --- a/src/code_index_mcp/indexing/strategies/typescript_strategy.py +++ b/src/code_index_mcp/indexing/strategies/typescript_strategy.py @@ -5,35 +5,26 @@ import logging from typing import Dict, List, Tuple, Optional from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo logger = logging.getLogger(__name__) -try: - import tree_sitter - import tree_sitter_typescript - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - logger.warning("tree-sitter not available, TypeScript parsing will be limited") +import tree_sitter +from tree_sitter_typescript import language_typescript class TypeScriptParsingStrategy(ParsingStrategy): """TypeScript-specific parsing strategy using tree-sitter.""" - + def __init__(self): - if TREE_SITTER_AVAILABLE: - self.ts_language = tree_sitter.Language(tree_sitter_typescript.language_typescript()) - else: - self.ts_language = None - + self.ts_language = tree_sitter.Language(language_typescript()) + def get_language_name(self) -> str: return "typescript" - + def get_supported_extensions(self) -> List[str]: return ['.ts', '.tsx'] - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: """Parse TypeScript file using tree-sitter.""" symbols = {} @@ -41,19 +32,14 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo classes = [] imports = [] exports = [] - - if not TREE_SITTER_AVAILABLE or not self.ts_language: - logger.info(f"Tree-sitter not available, using fallback for {file_path}") - return self._fallback_parse(file_path, content) - - try: - parser = tree_sitter.Parser(self.ts_language) - tree = parser.parse(content.encode('utf8')) - self._traverse_ts_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) - except Exception as e: - logger.warning(f"Error parsing TypeScript file {file_path}: {e}, falling back to regex parsing") - return self._fallback_parse(file_path, content) - + + parser = tree_sitter.Parser(self.ts_language) + tree = parser.parse(content.encode('utf8')) + # Phase 1: Extract symbols + self._traverse_ts_node(tree.root_node, content, file_path, symbols, functions, classes, imports, exports) + # Phase 2: Analyze function calls using tree-sitter + self._analyze_ts_calls_with_tree_sitter(tree.root_node, content, file_path, symbols) + file_info = FileInfo( language=self.get_language_name(), line_count=len(content.splitlines()), @@ -61,10 +47,10 @@ def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo imports=imports, exports=exports ) - + return symbols, file_info - - def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + + def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], functions: List[str], classes: List[str], imports: List[str], exports: List[str]): """Traverse TypeScript AST node.""" if node.type == 'function_declaration': @@ -79,7 +65,7 @@ def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[st signature=signature ) functions.append(name) - + elif node.type == 'class_declaration': name = self._get_class_name(node, content) if name: @@ -90,7 +76,7 @@ def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[st line=node.start_point[0] + 1 ) classes.append(name) - + elif node.type == 'interface_declaration': name = self._get_interface_name(node, content) if name: @@ -101,12 +87,13 @@ def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[st line=node.start_point[0] + 1 ) classes.append(name) # Group interfaces with classes for simplicity - + elif node.type == 'method_definition': method_name = self._get_method_name(node, content) class_name = self._find_parent_class(node, content) if method_name and class_name: - symbol_id = self._create_symbol_id(file_path, f"{class_name}.{method_name}") + full_name = f"{class_name}.{method_name}" + symbol_id = self._create_symbol_id(file_path, full_name) signature = self._get_ts_function_signature(node, content) symbols[symbol_id] = SymbolInfo( type="method", @@ -114,39 +101,41 @@ def _traverse_ts_node(self, node, content: str, file_path: str, symbols: Dict[st line=node.start_point[0] + 1, signature=signature ) - + # Add method to functions list for consistency + functions.append(full_name) + # Continue traversing children for child in node.children: self._traverse_ts_node(child, content, file_path, symbols, functions, classes, imports, exports) - + def _get_function_name(self, node, content: str) -> Optional[str]: """Extract function name from tree-sitter node.""" for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None - + def _get_class_name(self, node, content: str) -> Optional[str]: """Extract class name from tree-sitter node.""" for child in node.children: if child.type == 'identifier': return content[child.start_byte:child.end_byte] return None - + def _get_interface_name(self, node, content: str) -> Optional[str]: """Extract interface name from tree-sitter node.""" for child in node.children: if child.type == 'type_identifier': return content[child.start_byte:child.end_byte] return None - + def _get_method_name(self, node, content: str) -> Optional[str]: """Extract method name from tree-sitter node.""" for child in node.children: if child.type == 'property_identifier': return content[child.start_byte:child.end_byte] return None - + def _find_parent_class(self, node, content: str) -> Optional[str]: """Find the parent class of a method.""" parent = node.parent @@ -155,222 +144,57 @@ def _find_parent_class(self, node, content: str) -> Optional[str]: return self._get_class_name(parent, content) or self._get_interface_name(parent, content) parent = parent.parent return None - + def _get_ts_function_signature(self, node, content: str) -> str: """Extract TypeScript function signature.""" return content[node.start_byte:node.end_byte].split('\n')[0].strip() - - def _fallback_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Fallback parsing when tree-sitter is not available.""" - symbols = {} - functions = [] - classes = [] - imports = [] - - # Phase 1: Extract symbols using regex-based parsing for TypeScript - lines = content.splitlines() - current_class = None - - for i, line in enumerate(lines): - original_line = line - line = line.strip() - - # Import statements - if line.startswith('import ') and ' from ' in line: - import_match = self._extract_ts_import(line) - if import_match: - imports.extend(import_match) - - # Class declarations - elif line.startswith('class '): - class_name = self._extract_ts_class_name(line) - if class_name: - current_class = class_name - symbol_id = self._create_symbol_id(file_path, class_name) - symbols[symbol_id] = SymbolInfo( - type="class", - file=file_path, - line=i + 1 - ) - classes.append(class_name) - - # Interface declarations - elif line.startswith('interface '): - interface_name = self._extract_ts_interface_name(line) - if interface_name: - symbol_id = self._create_symbol_id(file_path, interface_name) - symbols[symbol_id] = SymbolInfo( - type="interface", - file=file_path, - line=i + 1 - ) - classes.append(interface_name) - current_class = interface_name - - # Function declarations (standalone) - elif line.startswith('function ') or ' function ' in line: - func_name = self._extract_ts_function_name(line) - if func_name: - symbol_id = self._create_symbol_id(file_path, func_name) - symbols[symbol_id] = SymbolInfo( - type="function", - file=file_path, - line=i + 1, - signature=line - ) - functions.append(func_name) - - # Method declarations (inside classes) - elif current_class and ('(' in line and ')' in line and ':' in line): - method_name = self._extract_ts_method_name(line) - if method_name and not line.startswith('//') and 'function' not in line: - symbol_id = self._create_symbol_id(file_path, f"{current_class}.{method_name}") - symbols[symbol_id] = SymbolInfo( - type="method", - file=file_path, - line=i + 1, - signature=line - ) - functions.append(method_name) # Add to functions list for summary - - # Reset class context on closing brace (simplified) - elif line == '}' and current_class: - current_class = None - - # Phase 2: Add call relationship analysis (similar to Python approach) - self._analyze_ts_calls(content, symbols, file_path) - - file_info = FileInfo( - language=self.get_language_name(), - line_count=len(lines), - symbols={"functions": functions, "classes": classes}, - imports=imports - ) - - return symbols, file_info - - def _extract_ts_function_name(self, line: str) -> Optional[str]: - """Extract function name from TypeScript function declaration.""" - try: - # function functionName(...): ReturnType or function functionName(...) - if 'function ' in line: - parts = line.split('function ')[1].split('(')[0].strip() - return parts if parts and parts.isidentifier() else None - except: - pass - return None - - def _extract_ts_class_name(self, line: str) -> Optional[str]: - """Extract class name from TypeScript class declaration.""" - try: - # class ClassName { or class ClassName extends ... or class ClassName implements ... - parts = line.split() - if len(parts) >= 2 and parts[0] == 'class': - class_name = parts[1] - # Remove any trailing characters - for separator in ['{', 'extends', 'implements']: - if separator in class_name: - class_name = class_name.split(separator)[0] - return class_name.strip() - except: - pass - return None - - def _extract_ts_interface_name(self, line: str) -> Optional[str]: - """Extract interface name from TypeScript interface declaration.""" - try: - # interface InterfaceName { or interface InterfaceName extends ... - parts = line.split() - if len(parts) >= 2 and parts[0] == 'interface': - interface_name = parts[1] - # Remove any trailing characters - for separator in ['{', 'extends']: - if separator in interface_name: - interface_name = interface_name.split(separator)[0] - return interface_name.strip() - except: - pass - return None - - def _extract_ts_method_name(self, line: str) -> Optional[str]: - """Extract method name from TypeScript method declaration.""" - try: - # async methodName(params): ReturnType or methodName(params): ReturnType - line = line.strip() - if line.startswith('async '): - line = line[6:].strip() - - if '(' in line: - method_name = line.split('(')[0].strip() - # Remove access modifiers - for modifier in ['public', 'private', 'protected', 'static']: - if method_name.startswith(modifier + ' '): - method_name = method_name[len(modifier):].strip() - - return method_name if method_name and method_name.replace('_', '').isalnum() else None - except: - pass - return None - - def _extract_ts_import(self, line: str) -> List[str]: - """Extract imports from TypeScript import statement.""" - imports = [] - try: - # import { something } from 'module' or import something from 'module' - if ' from ' in line: - module_part = line.split(' from ')[-1].strip() - module_name = module_part.strip('\'"').replace("'", "").replace('"', '').replace(';', '') - imports.append(module_name) - except: - pass - return imports - - def _analyze_ts_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): - """Analyze TypeScript function calls for relationships.""" - lines = content.splitlines() - current_function = None - - for i, line in enumerate(lines): - original_line = line - line = line.strip() - - # Track current function context - if 'function ' in line or (': ' in line and '(' in line and ')' in line): - func_name = self._extract_function_from_line(line) - if func_name: - current_function = self._create_symbol_id(file_path, func_name) - - # Find function calls: functionName() or obj.methodName() - if current_function and ('(' in line and ')' in line): - called_functions = self._extract_called_functions(line) - for called_func in called_functions: - # Find the called function in symbols and add relationship - for symbol_id, symbol_info in symbols.items(): - if called_func in symbol_id.split("::")[-1]: + + + def _analyze_ts_calls_with_tree_sitter(self, node, content: str, file_path: str, symbols: Dict[str, SymbolInfo], + current_function: Optional[str] = None, current_class: Optional[str] = None): + """Analyze TypeScript function calls using tree-sitter AST.""" + # Track function/method context + if node.type == 'function_declaration': + func_name = self._get_function_name(node, content) + if func_name: + current_function = f"{file_path}::{func_name}" + elif node.type == 'method_definition': + method_name = self._get_method_name(node, content) + parent_class = self._find_parent_class(node, content) + if method_name and parent_class: + current_function = f"{file_path}::{parent_class}.{method_name}" + elif node.type == 'class_declaration': + current_class = self._get_class_name(node, content) + + # Detect function calls + if node.type == 'call_expression' and current_function: + # Extract the function being called + called_function = None + if node.children: + func_node = node.children[0] + if func_node.type == 'identifier': + # Direct function call + called_function = content[func_node.start_byte:func_node.end_byte] + elif func_node.type == 'member_expression': + # Method call (obj.method or this.method) + for child in func_node.children: + if child.type == 'property_identifier': + called_function = content[child.start_byte:child.end_byte] + break + + # Add relationship if we found the called function + if called_function: + for symbol_id, symbol_info in symbols.items(): + if symbol_info.type in ["function", "method"]: + symbol_name = symbol_id.split("::")[-1] + # Check for exact match or method name match + if (symbol_name == called_function or + symbol_name.endswith(f".{called_function}")): if current_function not in symbol_info.called_by: symbol_info.called_by.append(current_function) - - def _extract_function_from_line(self, line: str) -> Optional[str]: - """Extract function name from a line that defines a function.""" - if 'function ' in line: - return self._extract_ts_function_name(line) - elif ': ' in line and '(' in line: - return self._extract_ts_method_name(line) - return None - - def _extract_called_functions(self, line: str) -> List[str]: - """Extract function names that are being called in this line.""" - import re - called_functions = [] - - # Find patterns like: functionName( or obj.methodName( - patterns = [ - r'(\w+)\s*\(', # functionName( - r'\.(\w+)\s*\(', # .methodName( - ] - - for pattern in patterns: - matches = re.findall(pattern, line) - called_functions.extend(matches) - - return called_functions \ No newline at end of file + break + + # Recursively process children + for child in node.children: + self._analyze_ts_calls_with_tree_sitter(child, content, file_path, symbols, current_function, current_class) + diff --git a/src/code_index_mcp/indexing/strategies/zig_strategy.py b/src/code_index_mcp/indexing/strategies/zig_strategy.py index ca3f5f6..658ca2b 100644 --- a/src/code_index_mcp/indexing/strategies/zig_strategy.py +++ b/src/code_index_mcp/indexing/strategies/zig_strategy.py @@ -1,179 +1,99 @@ """ -Zig parsing strategy using regex patterns with tree-sitter fallback. +Zig parsing strategy using tree-sitter. """ -import re import logging from typing import Dict, List, Tuple, Optional from .base_strategy import ParsingStrategy -from ..models.symbol_info import SymbolInfo -from ..models.file_info import FileInfo +from ..models import SymbolInfo, FileInfo logger = logging.getLogger(__name__) -try: - import tree_sitter - import tree_sitter_zig - TREE_SITTER_AVAILABLE = True -except ImportError: - TREE_SITTER_AVAILABLE = False - logger.warning("tree-sitter-zig not available, using regex fallback") +import tree_sitter +from tree_sitter_zig import language class ZigParsingStrategy(ParsingStrategy): - """Zig parsing strategy using regex patterns with tree-sitter fallback.""" - + """Zig parsing strategy using tree-sitter.""" + def __init__(self): - if TREE_SITTER_AVAILABLE: - self.zig_language = tree_sitter.Language(tree_sitter_zig.language()) - else: - self.zig_language = None - + self.zig_language = tree_sitter.Language(language()) + def get_language_name(self) -> str: return "zig" - + def get_supported_extensions(self) -> List[str]: return ['.zig', '.zon'] - + def parse_file(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse Zig file using regex patterns.""" - # For now, use regex parsing even if tree-sitter is available - # Tree-sitter-zig might not be stable yet - return self._regex_parse(file_path, content) - - def _regex_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: - """Parse Zig file using regex patterns.""" + """Parse Zig file using tree-sitter.""" + return self._tree_sitter_parse(file_path, content) + + + def _tree_sitter_parse(self, file_path: str, content: str) -> Tuple[Dict[str, SymbolInfo], FileInfo]: + """Parse Zig file using tree-sitter.""" symbols = {} functions = [] - classes = [] # Zig uses structs, not classes + classes = [] imports = [] - - lines = content.splitlines() - - for i, line in enumerate(lines): - line = line.strip() - - # Import statements (const x = @import(...)) - if '@import(' in line: - import_match = re.search(r'@import\("([^"]+)"\)', line) - if import_match: - imports.append(import_match.group(1)) - - # Function declarations (pub fn, fn) - elif re.match(r'(pub\s+)?fn\s+\w+', line): - func_match = re.match(r'(?:pub\s+)?fn\s+(\w+)', line) - if func_match: - func_name = func_match.group(1) - symbol_id = self._create_symbol_id(file_path, func_name) - symbols[symbol_id] = SymbolInfo( - type="function", - file=file_path, - line=i + 1, - signature=line - ) - functions.append(func_name) - - # Struct declarations - elif re.match(r'const\s+\w+\s*=\s*struct\s*\{', line): - struct_match = re.match(r'const\s+(\w+)\s*=\s*struct', line) - if struct_match: - struct_name = struct_match.group(1) - symbol_id = self._create_symbol_id(file_path, struct_name) - symbols[symbol_id] = SymbolInfo( - type="struct", - file=file_path, - line=i + 1 - ) - classes.append(struct_name) - - # Union declarations - elif re.match(r'const\s+\w+\s*=\s*union', line): - union_match = re.match(r'const\s+(\w+)\s*=\s*union', line) - if union_match: - union_name = union_match.group(1) - symbol_id = self._create_symbol_id(file_path, union_name) - symbols[symbol_id] = SymbolInfo( - type="union", - file=file_path, - line=i + 1 - ) - classes.append(union_name) - - # Enum declarations - elif re.match(r'const\s+\w+\s*=\s*enum', line): - enum_match = re.match(r'const\s+(\w+)\s*=\s*enum', line) - if enum_match: - enum_name = enum_match.group(1) - symbol_id = self._create_symbol_id(file_path, enum_name) - symbols[symbol_id] = SymbolInfo( - type="enum", - file=file_path, - line=i + 1 - ) - classes.append(enum_name) - - # Phase 2: Add call relationship analysis - self._analyze_zig_calls(content, symbols, file_path) - + + parser = tree_sitter.Parser(self.zig_language) + tree = parser.parse(content.encode('utf8')) + + # Phase 1: Extract symbols using tree-sitter + self._traverse_zig_node(tree.root_node, content, file_path, symbols, functions, classes, imports) + file_info = FileInfo( language=self.get_language_name(), - line_count=len(lines), + line_count=len(content.splitlines()), symbols={"functions": functions, "classes": classes}, imports=imports ) - + return symbols, file_info - - def _analyze_zig_calls(self, content: str, symbols: Dict[str, SymbolInfo], file_path: str): - """Analyze Zig function calls for relationships.""" - lines = content.splitlines() - current_function = None - - for i, line in enumerate(lines): - original_line = line - line = line.strip() - - # Track current function context - if line.startswith('fn '): - func_name = self._extract_zig_function_name(line) - if func_name: - current_function = self._create_symbol_id(file_path, func_name) - - # Find function calls: functionName() or obj.methodName() - if current_function and ('(' in line and ')' in line): - called_functions = self._extract_zig_called_functions(line) - for called_func in called_functions: - # Find the called function in symbols and add relationship - for symbol_id, symbol_info in symbols.items(): - if called_func in symbol_id.split("::")[-1]: - if current_function not in symbol_info.called_by: - symbol_info.called_by.append(current_function) - - def _extract_zig_function_name(self, line: str) -> Optional[str]: - """Extract function name from Zig function declaration.""" - try: - # fn functionName(...) or pub fn functionName(...) - import re - match = re.search(r'fn\s+(\w+)\s*\(', line) - if match: - return match.group(1) - except: - pass + + def _traverse_zig_node(self, node, content: str, file_path: str, symbols: Dict, functions: List, classes: List, imports: List): + """Traverse Zig AST node and extract symbols.""" + if node.type == 'function_declaration': + func_name = self._extract_zig_function_name_from_node(node, content) + if func_name: + line_number = self._extract_line_number(content, node.start_byte) + symbol_id = self._create_symbol_id(file_path, func_name) + symbols[symbol_id] = SymbolInfo( + type="function", + file=file_path, + line=line_number, + signature=self._safe_extract_text(content, node.start_byte, node.end_byte) + ) + functions.append(func_name) + + elif node.type in ['struct_declaration', 'union_declaration', 'enum_declaration']: + type_name = self._extract_zig_type_name_from_node(node, content) + if type_name: + line_number = self._extract_line_number(content, node.start_byte) + symbol_id = self._create_symbol_id(file_path, type_name) + symbols[symbol_id] = SymbolInfo( + type=node.type.replace('_declaration', ''), + file=file_path, + line=line_number + ) + classes.append(type_name) + + # Recurse through children + for child in node.children: + self._traverse_zig_node(child, content, file_path, symbols, functions, classes, imports) + + def _extract_zig_function_name_from_node(self, node, content: str) -> Optional[str]: + """Extract function name from tree-sitter node.""" + for child in node.children: + if child.type == 'identifier': + return self._safe_extract_text(content, child.start_byte, child.end_byte) + return None + + def _extract_zig_type_name_from_node(self, node, content: str) -> Optional[str]: + """Extract type name from tree-sitter node.""" + for child in node.children: + if child.type == 'identifier': + return self._safe_extract_text(content, child.start_byte, child.end_byte) return None - - def _extract_zig_called_functions(self, line: str) -> List[str]: - """Extract function names that are being called in this line.""" - import re - called_functions = [] - - # Find patterns like: functionName( or obj.methodName( - patterns = [ - r'(\w+)\s*\(', # functionName( - r'\.(\w+)\s*\(', # .methodName( - ] - - for pattern in patterns: - matches = re.findall(pattern, line) - called_functions.extend(matches) - - return called_functions \ No newline at end of file + diff --git a/src/code_index_mcp/services/file_watcher_service.py b/src/code_index_mcp/services/file_watcher_service.py index 7526fdd..cac4dd5 100644 --- a/src/code_index_mcp/services/file_watcher_service.py +++ b/src/code_index_mcp/services/file_watcher_service.py @@ -11,7 +11,7 @@ import os import traceback from threading import Timer -from typing import Optional, Callable +from typing import Optional, Callable, List from pathlib import Path try: @@ -50,7 +50,6 @@ def __init__(self): WATCHDOG_AVAILABLE = False from .base_service import BaseService -from ..constants import SUPPORTED_EXTENSIONS class FileWatcherService(BaseService): @@ -311,7 +310,7 @@ class DebounceEventHandler(FileSystemEventHandler): """ def __init__(self, debounce_seconds: float, rebuild_callback: Callable, - base_path: Path, logger: logging.Logger): + base_path: Path, logger: logging.Logger, additional_excludes: Optional[List[str]] = None): """ Initialize the debounce event handler. @@ -320,7 +319,10 @@ def __init__(self, debounce_seconds: float, rebuild_callback: Callable, rebuild_callback: Function to call when rebuild is needed base_path: Base project path for filtering logger: Logger instance for debug messages + additional_excludes: Additional patterns to exclude """ + from ..utils import FileFilter + super().__init__() self.debounce_seconds = debounce_seconds self.rebuild_callback = rebuild_callback @@ -328,18 +330,8 @@ def __init__(self, debounce_seconds: float, rebuild_callback: Callable, self.debounce_timer: Optional[Timer] = None self.logger = logger - # Exclusion patterns for directories and files to ignore - self.exclude_patterns = { - '.git', '.svn', '.hg', - 'node_modules', '__pycache__', '.venv', 'venv', - '.DS_Store', 'Thumbs.db', - 'dist', 'build', 'target', '.idea', '.vscode', - '.pytest_cache', '.coverage', '.tox', - 'bin', 'obj' # Additional build directories - } - - # Convert supported extensions to set for faster lookup - self.supported_extensions = set(SUPPORTED_EXTENSIONS) + # Use centralized file filtering + self.file_filter = FileFilter(additional_excludes) def on_any_event(self, event: FileSystemEvent) -> None: """ @@ -360,7 +352,7 @@ def on_any_event(self, event: FileSystemEvent) -> None: def should_process_event(self, event: FileSystemEvent) -> bool: """ - Determine if event should trigger index rebuild. + Determine if event should trigger index rebuild using centralized filtering. Args: event: The file system event to evaluate @@ -381,139 +373,23 @@ def should_process_event(self, event: FileSystemEvent) -> bool: else: target_path = event.src_path - # Fast path exclusion - check if path is in excluded directory before any processing - if self._is_path_in_excluded_directory(target_path): - return False - - # Unified path checking + # Use centralized filtering logic try: path = Path(target_path) - return self._should_process_path(path) - except Exception: - return False - - def _should_process_path(self, path: Path) -> bool: - """ - Check if a specific path should trigger index rebuild. - - Args: - path: The file path to check - - Returns: - True if path should trigger rebuild, False otherwise - """ - # Skip excluded paths - if self.is_excluded_path(path): - return False - - # Only process supported file types - if not self.is_supported_file_type(path): - return False - - # Skip temporary files - if self.is_temporary_file(path): - return False - - return True - - def _is_path_in_excluded_directory(self, file_path: str) -> bool: - """ - Fast check if a file path is within an excluded directory. - - This method performs a quick string-based check to avoid expensive - Path operations for files in excluded directories like .venv. - - Args: - file_path: The file path to check + should_process = self.file_filter.should_process_path(path, self.base_path) - Returns: - True if the path is in an excluded directory, False otherwise - """ - try: - # Normalize path separators for cross-platform compatibility - normalized_path = file_path.replace('\\', '/') - base_path_normalized = str(self.base_path).replace('\\', '/') - - # Get relative path string - if not normalized_path.startswith(base_path_normalized): - return True # Path outside project - exclude it + # Skip temporary files using centralized logic + if not should_process or self.file_filter.is_temporary_file(path): + return False - relative_path = normalized_path[len(base_path_normalized):].lstrip('/') - - # Quick check: if any excluded pattern appears as a path component - path_parts = relative_path.split('/') - for part in path_parts: - if part in self.exclude_patterns: - return True - - return False - except Exception: - # If any error occurs, err on the side of exclusion - return True - - def is_excluded_path(self, path: Path) -> bool: - """ - Check if path should be excluded from monitoring. - - Args: - path: The file path to check - - Returns: - True if path should be excluded, False otherwise - """ - try: - relative_path = path.relative_to(self.base_path) - parts = relative_path.parts - - # Check if any part of the path matches exclusion patterns - return any(part in self.exclude_patterns for part in parts) - except ValueError: - # Path is not relative to base_path - exclude it return True except Exception: - # Handle any other path processing issues - return True - - def is_supported_file_type(self, path: Path) -> bool: - """ - Check if file type is supported for indexing. - - Args: - path: The file path to check - - Returns: - True if file type is supported, False otherwise - """ - return path.suffix.lower() in self.supported_extensions - - def is_temporary_file(self, path: Path) -> bool: - """ - Check if file is a temporary file. - - Args: - path: The file path to check + return False - Returns: - True if file appears to be temporary, False otherwise - """ - name = path.name.lower() - # Common temporary file patterns - temp_patterns = ['.tmp', '.swp', '.swo', '~', '.bak', '.orig'] - # Check for temporary file extensions - if any(name.endswith(pattern) for pattern in temp_patterns): - return True - # Check for vim/editor temporary files - if name.startswith('.') and (name.endswith('.swp') or name.endswith('.swo')): - return True - - # Check for backup files (e.g., file.py~, file.py.bak) - if '~' in name or '.bak' in name: - return True - return False def reset_debounce_timer(self) -> None: """Reset the debounce timer, canceling any existing timer.""" diff --git a/src/code_index_mcp/tools/config/project_config_tool.py b/src/code_index_mcp/tools/config/project_config_tool.py index cf78da2..c2738dd 100644 --- a/src/code_index_mcp/tools/config/project_config_tool.py +++ b/src/code_index_mcp/tools/config/project_config_tool.py @@ -9,7 +9,6 @@ from pathlib import Path from ...project_settings import ProjectSettings -from ...constants import SUPPORTED_EXTENSIONS class ProjectConfigTool: @@ -178,9 +177,12 @@ def create_default_config(self, project_path: str) -> Dict[str, Any]: Returns: Default configuration dictionary """ + from ...utils import FileFilter + + file_filter = FileFilter() return { "base_path": project_path, - "supported_extensions": SUPPORTED_EXTENSIONS, + "supported_extensions": list(file_filter.supported_extensions), "last_indexed": None, "file_watcher": self.get_file_watcher_config() if self._settings else {} } @@ -253,8 +255,12 @@ def get_basic_project_structure(self, project_path: str) -> Dict[str, Any]: Returns: Basic directory structure dictionary """ + from ...utils import FileFilter + + file_filter = FileFilter() + def build_tree(path: str, max_depth: int = 3, current_depth: int = 0) -> Dict[str, Any]: - """Build directory tree with limited depth.""" + """Build directory tree with limited depth using centralized filtering.""" if current_depth >= max_depth: return {"type": "directory", "truncated": True} @@ -262,24 +268,18 @@ def build_tree(path: str, max_depth: int = 3, current_depth: int = 0) -> Dict[st items = [] path_obj = Path(path) - # Skip hidden directories and common ignore patterns - skip_patterns = {'.git', '.svn', '__pycache__', 'node_modules', '.vscode', '.idea'} - for item in sorted(path_obj.iterdir()): - if item.name.startswith('.') and item.name not in {'.gitignore', '.env'}: - continue - if item.name in skip_patterns: - continue - if item.is_dir(): - items.append({ - "name": item.name, - "type": "directory", - "children": build_tree(str(item), max_depth, current_depth + 1) - }) + # Use centralized directory filtering + if not file_filter.should_exclude_directory(item.name): + items.append({ + "name": item.name, + "type": "directory", + "children": build_tree(str(item), max_depth, current_depth + 1) + }) else: - # Only include supported file types - if item.suffix.lower() in SUPPORTED_EXTENSIONS: + # Use centralized file filtering + if not file_filter.should_exclude_file(item): items.append({ "name": item.name, "type": "file", diff --git a/src/code_index_mcp/utils/__init__.py b/src/code_index_mcp/utils/__init__.py index 7e0d99b..cd3fb92 100644 --- a/src/code_index_mcp/utils/__init__.py +++ b/src/code_index_mcp/utils/__init__.py @@ -12,6 +12,7 @@ from .context_helper import ContextHelper from .validation import ValidationHelper from .response_formatter import ResponseFormatter +from .file_filter import FileFilter __all__ = [ 'handle_mcp_errors', @@ -19,5 +20,6 @@ 'handle_mcp_tool_errors', 'ContextHelper', 'ValidationHelper', - 'ResponseFormatter' + 'ResponseFormatter', + 'FileFilter' ] \ No newline at end of file diff --git a/src/code_index_mcp/utils/file_filter.py b/src/code_index_mcp/utils/file_filter.py new file mode 100644 index 0000000..5cd9938 --- /dev/null +++ b/src/code_index_mcp/utils/file_filter.py @@ -0,0 +1,177 @@ +""" +Centralized file filtering logic for the Code Index MCP server. + +This module provides unified filtering capabilities used across all components +that need to determine which files and directories should be processed or excluded. +""" + +import fnmatch +from pathlib import Path +from typing import List, Optional, Set + +from ..constants import FILTER_CONFIG + + +class FileFilter: + """Centralized file filtering logic.""" + + def __init__(self, additional_excludes: Optional[List[str]] = None): + """ + Initialize the file filter. + + Args: + additional_excludes: Additional directory patterns to exclude + """ + self.exclude_dirs = set(FILTER_CONFIG["exclude_directories"]) + self.exclude_files = set(FILTER_CONFIG["exclude_files"]) + self.supported_extensions = set(FILTER_CONFIG["supported_extensions"]) + + # Add user-defined exclusions + if additional_excludes: + self.exclude_dirs.update(additional_excludes) + + def should_exclude_directory(self, dir_name: str) -> bool: + """ + Check if directory should be excluded from processing. + + Args: + dir_name: Directory name to check + + Returns: + True if directory should be excluded, False otherwise + """ + # Skip hidden directories except for specific allowed ones + if dir_name.startswith('.') and dir_name not in {'.env', '.gitignore'}: + return True + + # Check against exclude patterns + return dir_name in self.exclude_dirs + + def should_exclude_file(self, file_path: Path) -> bool: + """ + Check if file should be excluded from processing. + + Args: + file_path: Path object for the file to check + + Returns: + True if file should be excluded, False otherwise + """ + # Extension check - only process supported file types + if file_path.suffix.lower() not in self.supported_extensions: + return True + + # Hidden files (except specific allowed ones) + if file_path.name.startswith('.') and file_path.name not in {'.gitignore', '.env'}: + return True + + # Filename pattern check using glob patterns + for pattern in self.exclude_files: + if fnmatch.fnmatch(file_path.name, pattern): + return True + + return False + + def should_process_path(self, path: Path, base_path: Path) -> bool: + """ + Unified path processing logic to determine if a file should be processed. + + Args: + path: File path to check + base_path: Project base path for relative path calculation + + Returns: + True if file should be processed, False otherwise + """ + try: + # Ensure we're working with absolute paths + if not path.is_absolute(): + path = base_path / path + + # Get relative path from base + relative_path = path.relative_to(base_path) + + # Check each path component for excluded directories + for part in relative_path.parts[:-1]: # Exclude filename + if self.should_exclude_directory(part): + return False + + # Check file itself + return not self.should_exclude_file(path) + + except (ValueError, OSError): + # Path not relative to base_path or other path errors + return False + + def is_supported_file_type(self, file_path: Path) -> bool: + """ + Check if file type is supported for indexing. + + Args: + file_path: Path to check + + Returns: + True if file type is supported, False otherwise + """ + return file_path.suffix.lower() in self.supported_extensions + + def is_temporary_file(self, file_path: Path) -> bool: + """ + Check if file appears to be a temporary file. + + Args: + file_path: Path to check + + Returns: + True if file appears temporary, False otherwise + """ + name = file_path.name + + # Common temporary file patterns + temp_patterns = ['*.tmp', '*.temp', '*.swp', '*.swo', '*~'] + + for pattern in temp_patterns: + if fnmatch.fnmatch(name, pattern): + return True + + # Files ending in .bak or .orig + if name.endswith(('.bak', '.orig')): + return True + + return False + + def filter_file_list(self, files: List[str], base_path: str) -> List[str]: + """ + Filter a list of file paths, keeping only those that should be processed. + + Args: + files: List of file paths (absolute or relative) + base_path: Project base path + + Returns: + Filtered list of file paths that should be processed + """ + base = Path(base_path) + filtered = [] + + for file_path_str in files: + file_path = Path(file_path_str) + if self.should_process_path(file_path, base): + filtered.append(file_path_str) + + return filtered + + def get_exclude_summary(self) -> dict: + """ + Get summary of current exclusion configuration. + + Returns: + Dictionary with exclusion configuration details + """ + return { + "exclude_directories_count": len(self.exclude_dirs), + "exclude_files_count": len(self.exclude_files), + "supported_extensions_count": len(self.supported_extensions), + "exclude_directories": sorted(self.exclude_dirs), + "exclude_files": sorted(self.exclude_files) + } \ No newline at end of file From 0c5c00139a33e4e8f91fadbc4fd613ceb75670fb Mon Sep 17 00:00:00 2001 From: johnhuang316 <134570882+johnhuang316@users.noreply.github.com> Date: Mon, 25 Aug 2025 14:05:54 +0800 Subject: [PATCH 8/8] Release v2.2.0: Return to simplified custom indexing architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Refactor tree-sitter imports to direct imports without fallback mechanisms - Eliminate regex-based parsing fallbacks in specialized strategies - Update version to 2.2.0 in pyproject.toml and __init__.py - Add comprehensive release notes documenting architectural evolution - Maintain dual-strategy approach: 7 specialized languages + fallback strategy - Enhance parsing accuracy with pure AST parsing for core languages - Improve error handling with fail-fast approach for missing dependencies This release represents a return to the custom indexing approach that prioritizes efficiency and simplicity over complex protocol compliance, moving away from the SCIP-based implementation due to performance considerations. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- pyproject.toml | 2 +- src/code_index_mcp/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9ce51bb..548c91d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "code-index-mcp" -version = "2.1.2" +version = "2.2.0" description = "Code indexing and analysis tools for LLMs using MCP" readme = "README.md" requires-python = ">=3.10" diff --git a/src/code_index_mcp/__init__.py b/src/code_index_mcp/__init__.py index e2fc513..3ac3936 100644 --- a/src/code_index_mcp/__init__.py +++ b/src/code_index_mcp/__init__.py @@ -3,4 +3,4 @@ A Model Context Protocol server for code indexing, searching, and analysis. """ -__version__ = "2.0.0" +__version__ = "2.2.0"