Skip to main content

AirLibrary/Indexing/State/
CreateState.rs

1//! # CreateState
2//!
3//! ## File: Indexing/State/CreateState.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides state creation functions for the File Indexer service, including
8//! the construction of index entries, symbols, and related data structures
9//! used throughout the indexing system.
10//!
11//! ## Primary Responsibility
12//!
13//! Create and initialize index state structures including FileIndex,
14//! FileMetadata, SymbolInfo, and related types.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - Generate index version strings
19//! - Calculate index checksums for integrity verification
20//! - Create new empty indexes
21//! - Backup corrupted indexes
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
//! - `chrono` - Timestamp generation for index metadata
//! - `sha2` - Checksum calculation for index integrity
//! - `hex` - Hex encoding of the checksum digest
//! - `serde` - Serialization/deserialization of index structures
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//!
34//! ## Dependents
35//!
36//! - `Indexing::Store::StoreEntry` - Creates entries for index storage
37//! - `Indexing::Store::UpdateIndex` - Updates index state
38//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
39//!
40//! ## VSCode Pattern Reference
41//!
42//! Inspired by VSCode's indexer state creation in
43//! `src/vs/workbench/services/search/common/`
44//!
45//! ## Security Considerations
46//!
47//! - Checksums prevent tampering with index data
48//! - Version tracking enables corruption detection
49//! - Path traversal protection applied during validation
50//!
51//! ## Performance Considerations
52//!
53//! - Lightweight state creation operations
54//! - Hash calculations are amortized across index operations
55//! - Memory-efficient data structures for large indexes
56//!
57//! ## Error Handling Strategy
58//!
59//! State creation operations use result types and propagate errors up
60//! with clear messages about what failed during creation or validation.
61//!
62//! ## Thread Safety
63//!
64//! State structures are designed to be moved into Arc<RwLock<>> for
65//! thread-safe shared access across indexing and search operations.
66
67use std::{collections::HashMap, path::PathBuf};
68#[cfg(unix)]
69use std::os::unix::fs::PermissionsExt;
70
71use serde::{Deserialize, Serialize};
72use sha2::{Digest, Sha256};
73
74use crate::{AirError, Result};
75
/// Maximum file size allowed for indexing (100 MiB).
///
/// Expressed in bytes: `100 * 1024 * 1024` = 104,857,600.
/// [`ValidateFileSize`] rejects any size strictly greater than this value.
pub const MAX_FILE_SIZE_BYTES:u64 = 100 * 1024 * 1024;
78
/// Symbol information extracted from files for VSCode Outline View.
///
/// A plain data record describing one symbol: its name, kind, and position.
/// It is stored per file in `FileIndex::file_symbols` and embedded in
/// [`SymbolLocation`] for the name-keyed symbol index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolInfo {
	/// Symbol name (function, class, variable, etc.)
	pub name:String,
	/// Symbol kind (function, class, struct, interface, etc.)
	pub kind:SymbolKind,
	/// Line number where symbol is defined
	// NOTE(review): not visible here whether this is 0- or 1-based — confirm
	// against the symbol extractor.
	pub line:u32,
	/// Column number
	// NOTE(review): base (0 or 1) and unit (bytes vs. characters) are not
	// established in this file — confirm.
	pub column:u32,
	/// Full qualified path
	pub full_path:String,
}
93
/// Symbol kind for VSCode compatibility.
///
/// Each variant carries an explicit discriminant, numbered contiguously from
/// `File = 0` through `TypeParameter = 25`.
///
/// NOTE(review): the numbering appears to mirror VSCode's `SymbolKind`
/// enumeration — confirm before relying on the integer values across the
/// boundary. Also note that the derived `Serialize`/`Deserialize` impls do
/// not necessarily put these integers on the wire; the representation depends
/// on the serde data format in use — verify.
#[derive(Debug, Clone, Serialize, Deserialize, Hash, Eq, PartialEq)]
pub enum SymbolKind {
	File = 0,
	Module = 1,
	Namespace = 2,
	Package = 3,
	Class = 4,
	Method = 5,
	Property = 6,
	Field = 7,
	Constructor = 8,
	Enum = 9,
	Interface = 10,
	Function = 11,
	Variable = 12,
	Constant = 13,
	String = 14,
	Number = 15,
	Boolean = 16,
	Array = 17,
	Object = 18,
	Key = 19,
	Null = 20,
	EnumMember = 21,
	Struct = 22,
	Event = 23,
	Operator = 24,
	TypeParameter = 25,
}
124
/// Symbol location for cross-referencing.
///
/// Pairs a [`SymbolInfo`] with the file it was found in. Values of this type
/// are what `FileIndex::symbol_index` stores under each symbol name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolLocation {
	/// File containing the symbol
	pub file_path:PathBuf,
	/// Line number
	// NOTE(review): `symbol.line` holds a line number as well; nothing in this
	// file keeps the two in sync — presumably they are equal, confirm with
	// callers.
	pub line:u32,
	/// Symbol information
	pub symbol:SymbolInfo,
}
135
/// File metadata with comprehensive information.
///
/// One record per indexed file, stored in `FileIndex::files` keyed by path.
/// [`CreateFileMetadata`] builds it from caller-supplied values and stamps
/// `indexed_at` itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMetadata {
	/// File path
	pub path:PathBuf,
	/// File size in bytes
	pub size:u64,
	/// Last modification timestamp
	pub modified:chrono::DateTime<chrono::Utc>,
	/// MIME type
	pub mime_type:String,
	/// Detected programming language
	pub language:Option<String>,
	/// Line count for text files
	pub line_count:Option<u32>,
	/// SHA-256 checksum for change detection
	// NOTE(review): the string encoding (presumably lowercase hex) is decided
	// by the caller and is not visible in this file — confirm.
	pub checksum:String,
	/// Whether file is a symbolic link
	pub is_symlink:bool,
	/// File permissions (format: "rwxrwxrwx")
	///
	/// See [`GetPermissionsString`] for how this string is derived from
	/// `std::fs::Metadata`.
	pub permissions:String,
	/// File encoding (UTF-8, ASCII, etc.)
	pub encoding:Option<String>,
	/// Last indexed timestamp
	///
	/// Set to the current UTC time by [`CreateFileMetadata`].
	pub indexed_at:chrono::DateTime<chrono::Utc>,
	/// Number of symbols extracted
	pub symbol_count:u32,
}
164
/// File index structure with comprehensive metadata.
///
/// Aggregates the per-file metadata map, the content and symbol lookup maps,
/// and the bookkeeping fields (timestamp, version, checksum). A fresh, empty
/// instance is produced by [`CreateNewIndex`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
	/// Indexed files with complete metadata
	pub files:HashMap<PathBuf, FileMetadata>,
	/// Content index for fast text search
	/// Maps words/tokens to file paths where they appear
	pub content_index:HashMap<String, Vec<PathBuf>>,
	/// Symbol index for VSCode Outline View and Go to Symbol
	/// Maps symbol names to their definitions
	pub symbol_index:HashMap<String, Vec<SymbolLocation>>,
	/// Reverse symbol index for cross-referencing
	///
	/// Maps each file path to the symbols found in it. [`ValidateIndexSize`]
	/// counts symbols through this map.
	pub file_symbols:HashMap<PathBuf, Vec<SymbolInfo>>,
	/// Last update timestamp for all indexes
	///
	/// Its Unix-seconds value is one of the inputs hashed by
	/// [`CalculateIndexChecksum`].
	pub last_updated:chrono::DateTime<chrono::Utc>,
	/// Index version for corruption detection
	///
	/// [`CreateNewIndex`] fills this from [`GenerateIndexVersion`].
	pub index_version:String,
	/// Index checksum for integrity verification
	///
	/// Left empty by [`CreateNewIndex`]; [`CalculateIndexChecksum`] computes a
	/// value but does not store it here itself.
	pub index_checksum:String,
}
185
186/// Create a new empty file index
187pub fn CreateNewIndex() -> FileIndex {
188	FileIndex {
189		files:HashMap::new(),
190		content_index:HashMap::new(),
191		symbol_index:HashMap::new(),
192		file_symbols:HashMap::new(),
193		last_updated:chrono::Utc::now(),
194		index_version:GenerateIndexVersion(),
195		index_checksum:String::new(),
196	}
197}
198
199/// Generate index version string
200pub fn GenerateIndexVersion() -> String { format!("{}-{}", env!("CARGO_PKG_VERSION"), chrono::Utc::now().timestamp()) }
201
202/// Calculate index checksum for integrity verification
203pub fn CalculateIndexChecksum(index:&FileIndex) -> Result<String> {
204	let checksum_input = format!(
205		"{}:{}:{}:{}",
206		index.files.len(),
207		index.content_index.len(),
208		index.symbol_index.len(),
209		index.last_updated.timestamp()
210	);
211
212	let mut hasher = Sha256::new();
213	hasher.update(checksum_input.as_bytes());
214	// sha2 0.11: digest output is `hybrid_array::Array` which has no
215	// `LowerHex` impl; `hex::encode` is the 1:1 replacement.
216	Ok(hex::encode(hasher.finalize()))
217}
218
219/// Create file metadata from raw information
220pub fn CreateFileMetadata(
221	path:PathBuf,
222	size:u64,
223	modified:chrono::DateTime<chrono::Utc>,
224	mime_type:String,
225	language:Option<String>,
226	line_count:Option<u32>,
227	checksum:String,
228	is_symlink:bool,
229	permissions:String,
230	encoding:Option<String>,
231	symbol_count:u32,
232) -> FileMetadata {
233	FileMetadata {
234		path,
235		size,
236		modified,
237		mime_type,
238		language,
239		line_count,
240		checksum,
241		is_symlink,
242		permissions,
243		encoding,
244		indexed_at:chrono::Utc::now(),
245		symbol_count,
246	}
247}
248
249/// Create symbol info with validation
250pub fn CreateSymbolInfo(name:String, kind:SymbolKind, line:u32, column:u32, full_path:String) -> SymbolInfo {
251	SymbolInfo { name, kind, line, column, full_path }
252}
253
254/// Create symbol location for cross-referencing
255pub fn CreateSymbolLocation(file_path:PathBuf, line:u32, symbol:SymbolInfo) -> SymbolLocation {
256	SymbolLocation { file_path, line, symbol }
257}
258
/// Render the Unix permission bits of `metadata` as a nine-character string
/// such as `"rwxr-xr--"`.
///
/// The characters are, in order, read/write/execute for the owner, then for
/// the group, then for others. A set bit is shown as its letter and a cleared
/// bit as `'-'`. Only these nine bits of the mode are inspected.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	// (mask, symbol) pairs in display order: owner, group, other.
	const PERMISSION_BITS:[(u32, char); 9] = [
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let mode = metadata.permissions().mode();
	PERMISSION_BITS
		.iter()
		.map(|&(mask, symbol)| if mode & mask != 0 { symbol } else { '-' })
		.collect()
}
280
/// Get file permissions as string for non-Unix systems.
///
/// Permission bits are not read on these platforms, so `_metadata` is ignored
/// and a fixed placeholder with every slot cleared is returned. The
/// placeholder is nine characters long so that it has the same shape as the
/// `"rwxrwxrwx"` format produced on Unix.
#[cfg(not(unix))]
pub fn GetPermissionsString(_metadata:&std::fs::Metadata) -> String {
	// Nine slots: read/write/execute for owner, group, and other.
	"-".repeat(9)
}
284
285/// Validate file size against maximum allowed
286pub fn ValidateFileSize(size:u64) -> Result<()> {
287	if size > MAX_FILE_SIZE_BYTES {
288		return Err(AirError::FileSystem(format!(
289			"File size {} exceeds maximum allowed size of {} bytes",
290			size, MAX_FILE_SIZE_BYTES
291		)));
292	}
293	Ok(())
294}
295
296/// Check if index size is within sane limits
297pub fn ValidateIndexSize(index:&FileIndex) -> Result<()> {
298	const MAX_INDEXED_FILES:usize = 1_000_000;
299	const MAX_SYMBOLS:usize = 10_000_000;
300
301	if index.files.len() > MAX_INDEXED_FILES {
302		return Err(AirError::Internal(format!(
303			"Index exceeds maximum file count: {} > {}",
304			index.files.len(),
305			MAX_INDEXED_FILES
306		)));
307	}
308
309	let total_symbols:usize = index.file_symbols.values().map(|v| v.len()).sum();
310	if total_symbols > MAX_SYMBOLS {
311		return Err(AirError::Internal(format!(
312			"Index exceeds maximum symbol count: {} > {}",
313			total_symbols, MAX_SYMBOLS
314		)));
315	}
316
317	Ok(())
318}