Mountain/Environment/SearchProvider.rs
1//! # SearchProvider (Environment)
2//!
3//! Implements the `SearchProvider` trait for `MountainEnvironment`, providing
4//! text search capabilities across files and content within the workspace.
5//!
6//! ## RESPONSIBILITIES
7//!
8//! ### 1. Search Execution
9//! - Search for text patterns in files using glob patterns
10//! - Support regular expression search
11//! - Search file contents and/or file names
12//! - Handle large result sets efficiently
13//!
14//! ### 2. Search Results
15//! - Return structured search results with matches
16//! - Include file URI, line number, column, and matching text
17//! - Support paging and result limiting
18//! - Sort results by relevance or file order
19//!
20//! ### 3. Search Configuration
21//! - Respect workspace file exclusion patterns (.gitignore)
22//! - Honor file size limits for search
23//! - Support case-sensitive and whole-word matching
24//! - Handle symbolic links appropriately
25//!
26//! ### 4. Search Cancellation
27//! - Support cancellation of long-running searches
28//! - Clean up resources on cancellation
29//! - Provide progress feedback (optional)
30//!
31//! ## ARCHITECTURAL ROLE
32//!
33//! SearchProvider is the **workspace search engine**:
34//!
35//! ```text
36//! Search Request ──► SearchProvider ──► FileSystem Scan ──► Results
37//! ```
38//!
39//! ### Position in Mountain
40//! - `Environment` module: Search capability provider
41//! - Implements `CommonLibrary::Search::SearchProvider` trait
42//! - Accessible via `Environment.Require<dyn SearchProvider>()`
43//!
44//! ### Search Types Supported
45//! - **Text search**: Find files containing text pattern
46//! - **File search**: Find files by name/glob pattern
47//! - **Replace**: (Future) Search and replace operations
48//! - **Context search**: (Future) Search with surrounding context
49//!
50//! ### Dependencies
51//! - `FileSystemReader`: Read file contents for searching
52//! - `WorkspaceProvider`: Get workspace folders to search
53//! - `Log`: Search progress and errors
54//!
55//! ### Dependents
56//! - Search UI panel: User-initiated searches
57//! - Find/Replace dialogs: In-editor search
58//! - Grep-like command-line operations
59//! - Code navigation (symbol search)
60//!
61//! ## SEARCH PROCESS
62//!
63//! 1. **File Discovery**: Walk workspace directories, respecting exclusions
64//! 2. **File Filtering**: Match filenames against include/exclude patterns
65//! 3. **Content Search**: For each file, search for pattern in content
66//! 4. **Match Collection**: Record matches with position information
67//! 5. **Result Formatting**: Return structured search results
68//!
69//! ## PERFORMANCE CONSIDERATIONS
70//!
71//! - Search is I/O bound; consider async and parallel processing
72//! - Large workspaces may have thousands of files
73//! - Use file size limits to prevent memory exhaustion
74//! - Implement result paging for UI responsiveness
75//! - Consider background search indexing for faster repeated searches
76//!
77//! ## ERROR HANDLING
78//!
79//! - Permission denied: Skip file, log warning
80//! - File not found: Skip file (may have been deleted)
81//! - Encoding errors: Try default encoding, skip on failure
82//! - Search cancelled: Stop immediately, return partial results
83//!
84//! ## VS CODE REFERENCE
85//!
86//! Patterns from VS Code:
87//! - `vs/workbench/contrib/search/browser/searchWidget.ts` - Search UI
88//! - `vs/platform/search/common/search.ts` - Search service API
89//! - `vs/platform/search/common/fileSearch.ts` - File system search
90//!
91//! ## TODO
92//!
93//! - [ ] Implement file content indexing for faster searches
94//! - [ ] Add regular expression support with PCRE or regex engine
95//! - [ ] Support search result paging and streaming
96//! - [ ] Add search cancellation with proper cleanup
97//! - [ ] Implement search result highlighting in UI
98//! - [ ] Support search in compressed/archive files
99//! - [ ] Add search across multiple workspaces
100//! - [ ] Implement search history and persistence
101//! - [ ] Add search filters (by language, by file size, etc.)
102//! - [ ] Support search templates and saved searches
103//! - [ ] Implement search result grouping (by folder, by file)
104//! - [ ] Add search performance metrics and logging
105//! - [ ] Support search result export (to file, clipboard)
106//!
107//! ## MODULE CONTENTS
108//!
//! - [`SearchProvider`]: Trait implemented in this module for `MountainEnvironment`
110//! - Search execution methods
111//! - File walking and filtering logic
112//! - Match extraction and formatting
113//! - Search cancellation support
114
115// Responsibilities:
116// - Perform workspace-wide text searches using `grep-searcher` (the `ripgrep` library).
117// - Respect workspace folders and standard ignore files (`.gitignore`).
118// - Collect and format search results into a DTO suitable for the frontend.
119// - Support regex patterns and case-sensitive/insensitive searches.
120// - Implement word-boundary matching.
121// - Optimize for performance with parallel file walking.
122// - Handle large files efficiently with memory-efficient streaming.
123// - Support incremental search with result pagination.
124// - Provide search statistics (matches count, files searched).
125// - Handle search cancellation gracefully.
126//
127// TODOs:
128// - Implement result pagination for large result sets
129// - Add search cancellation via CancellationToken
130// - Support include/exclude file patterns
131// - Implement context lines for matches (before/after)
132// - Add file type filtering (e.g., search only in certain extensions)
133// - Implement replacement/match highlighting in results
134// - Add search progress reporting
135// - Support search across multiple workspace folders independently
136// - Implement search caching for repeated searches
137// - Add regex capture groups support
138// - Implement search history and recent searches
139// - Support search result export
140// - Add search performance metrics and optimization
141// - Implement search result deduplication
142// - Support glob patterns for file matching
143// - Add search result ranking and sorting
144// - Implement binary file handling (skip or search)
145// - Support symbolic link following
146// - Add max file size limit to avoid memory issues
147// - Implement search timeout
148// - Support search in hidden files
149// - Add line and column number precision
150// - Implement multi-line regex search
151//
152// Inspired by VSCode's search service which:
153// - Uses ripgrep for high-performance text search
154// - Supports complex regex patterns and modifiers
155// - Provides context lines for matches
156// - Handles large directories efficiently
157// - Supports file and directory exclusions
158// - Provides incremental search results
159// - Handles search cancellation gracefully
160//! # SearchProvider Implementation
161//!
162//! Implements the `SearchProvider` trait using the `grep-searcher` crate, which
163//! is a library for the `ripgrep` search tool.
164//!
165//! ## Search Architecture
166//!
167//! The search implementation uses a multi-threaded approach:
168//!
169//! 1. **Pattern Compilation**: Regex pattern is compiled with modifiers
170//! 2. **Parallel Walking**: Files in workspace are walked in parallel
171//! 3. **Per-File Search**: Each file is searched individually using a sink
172//! pattern
173//! 4. **Result Aggregation**: Matches are collected in a shared thread-safe
174//! vector
175//!
176//! ## Search Features
177//!
178//! - **Case Sensitivity**: Controlled by `is_case_sensitive` option
179//! - **Word Matching**: Controlled by `is_word_match` option
180//! - **Regex Support**: Full regex pattern matching via `grep-regex`
181//! - **Ignore Files**: Respects `.gitignore`, `.ignore`, and other ignore files
182//! - **Parallel Search**: Uses `WalkBuilder::build_parallel()` for performance
183//! - **Memory Efficient**: Streams results to avoid loading entire files
184//!
185//! ## Search Result Format
186//!
187//! Each match includes:
188//! - **File URI**: Valid URL pointing to the file
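//! - **Line Number**: One-based line number of the match
//! - **Preview**: The matched text line (line terminator removed, capped at 512 bytes)
//! - **Columns**: Character ranges within the preview where the pattern matched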
191//!
192//! Results are grouped by file, with each file containing multiple matches.
193//
194
195use std::{
196 io,
197 path::PathBuf,
198 sync::{Arc, Mutex},
199};
200
201use CommonLibrary::{Error::CommonError::CommonError, Search::SearchProvider::SearchProvider};
202use async_trait::async_trait;
203use grep_matcher::Matcher;
204use grep_regex::{RegexMatcher, RegexMatcherBuilder};
205use grep_searcher::{Searcher, SearcherBuilder, Sink, SinkMatch};
206use ignore::WalkBuilder;
207use serde::{Deserialize, Serialize};
208use serde_json::{Value, json};
209
210use super::{MountainEnvironment::MountainEnvironment, Utility};
211use crate::dev_log;
212
/// Mirrors VS Code's `ITextSearchQuery` shape (`vs/workbench/services/
/// search/common/search.ts`). The workbench's Search view serialises
/// the user's input into this struct and the ProxyChannel sends it as
/// slot 0 of the `search:textSearch` call.
///
/// Field names are mapped to camelCase wire keys by
/// `#[serde(rename_all = "camelCase")]`. `pattern` is required; every flag
/// is optional and deserialises to `None` when its key is absent.
///
/// - `pattern`: the user's typed query
/// - `isRegExp` (default `false`): when `false`, the pattern is
///   `regex::escape`'d before compilation so a literal search for `obj.method(`
///   doesn't blow up the regex parser.
/// - `isCaseSensitive` (default `false`): controls the regex's case-insensitive
///   flag.
/// - `isWordMatch` (default `false`): forwarded to
///   `RegexMatcherBuilder::word`, which restricts matches to whole words.
/// - `isMultiline` (default `false`): the workbench's multi-line flag. It does
///   NOT make matches span lines: the searcher is built line-oriented
///   (`SearcherBuilder::new().line_number(true)`) and `.` is never configured
///   to match `\n`.
#[derive(Deserialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct TextSearchQuery {
	// Wire key `pattern`. Required: there is no serde default for it.
	pattern:String,

	// Wire key `isCaseSensitive`.
	#[serde(default)]
	is_case_sensitive:Option<bool>,

	// Wire key `isWordMatch`.
	#[serde(default)]
	is_word_match:Option<bool>,

	// Wire key `isRegExp`.
	#[serde(default)]
	is_reg_exp:Option<bool>,

	// Wire key `isMultiline`.
	#[serde(default)]
	is_multiline:Option<bool>,
}
244
/// Per-match column range within the preview line.
///
/// `start` and `end` are 0-based offsets counted in Unicode scalar values
/// (Rust `char`s) of the preview string, NOT byte offsets. `end` is
/// exclusive: it is the index of the first character after the match.
/// Pre-converting bytes→chars here keeps the workbench from
/// mis-highlighting multi-byte UTF-8 lines (the search panel underlines
/// the wrong substring otherwise).
///
/// NOTE(review): VS Code measures columns in UTF-16 code units. A `char`
/// count and a UTF-16 count differ for characters outside the Basic
/// Multilingual Plane (e.g. most emoji) - confirm the consumer accounts
/// for that.
///
/// NOTE(review): an earlier revision of this comment stated that the
/// SkyBridge consumer adds a +1 line offset. `TextMatch::line_number` is
/// already documented as 1-based, so verify the consumer does not shift
/// line numbers a second time.
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct ColumnRange {
	// Character index where the match begins (inclusive).
	start:u64,

	// Character index where the match ends (exclusive).
	end:u64,
}
262
/// One matched line of a file, serialised to the frontend with camelCase
/// keys (`preview`, `lineNumber`, `columns`).
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct TextMatch {
	/// Text of the matched line with its trailing `\n` / `\r\n` removed,
	/// cut to at most 512 bytes of the source line (on a character
	/// boundary) and decoded lossily, so invalid UTF-8 shows up as U+FFFD.
	preview:String,

	/// 1-based line number (grep-searcher emits 1-based when
	/// `line_number(true)` is configured on the SearcherBuilder).
	line_number:u64,

	/// Per-line ranges where the matcher actually matched. A single
	/// line can contain multiple matches (e.g. `test test test`); each
	/// gets its own range. At most 100 ranges are recorded per line, and
	/// only matches that start inside the preview are reported. Empty
	/// when match-position lookup failed - in that case the renderer
	/// falls back to highlighting the whole line.
	columns:Vec<ColumnRange>,
}
279
/// All matches found in a single file, serialised to the frontend with the
/// keys `resource` and `matches`.
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct FileMatch {
	// URI of the file, produced by `url::Url::from_file_path(..).to_string()`.
	// Also used as the lookup key when further matches for the same file
	// are appended.
	resource:String,

	// Matched lines, in the order the file's sink reported them.
	matches:Vec<TextMatch>,
}
288
// This Sink is designed to be created for each file. It holds a reference to
// the central results vector and the path of the file it's searching.
//
// A fresh instance is built for every file the walker yields and handed to
// `Searcher::search_path`, which calls `Sink::matched` once per matching line.
struct PerFileSink {
	// Path of the file being searched; converted to a URI for `FileMatch`.
	path:PathBuf,

	// Shared accumulator for the whole search. Every sink appends into it
	// under the mutex.
	results:Arc<Mutex<Vec<FileMatch>>>,

	/// Cloned for each sink so it can re-run the matcher against the
	/// raw line bytes to recover column ranges. `SinkMatch::bytes()`
	/// gives us the matched line but not where in the line the matcher
	/// hit; calling `Matcher::find_at(...)` ourselves is the documented
	/// pattern for recovering that information.
	matcher:RegexMatcher,
}
303
304impl Sink for PerFileSink {
305 type Error = io::Error;
306
307 fn matched(&mut self, _Searcher:&Searcher, Mat:&SinkMatch<'_>) -> Result<bool, Self::Error> {
308 let RawLine = Mat.bytes();
309 // Trim trailing newline so the preview text the renderer shows
310 // doesn't carry a stray empty line break.
311 let TrimmedLen = if RawLine.ends_with(b"\r\n") {
312 RawLine.len().saturating_sub(2)
313 } else if RawLine.last() == Some(&b'\n') {
314 RawLine.len().saturating_sub(1)
315 } else {
316 RawLine.len()
317 };
318 let LineBytes = &RawLine[..TrimmedLen];
319 // Cap preview length at 512 chars - super-long minified lines
320 // would otherwise force the renderer to layout massive rows
321 // AND make the byte→char map below grow proportionally.
322 const PREVIEW_BYTE_CAP:usize = 512;
323 let CapBytes = LineBytes.len().min(PREVIEW_BYTE_CAP);
324 // Round down to the nearest UTF-8 boundary so `from_utf8_lossy`
325 // doesn't replace half a multibyte char with U+FFFD.
326 let SafeCap = (0..=CapBytes)
327 .rev()
328 .find(|&I| I == 0 || I == LineBytes.len() || (LineBytes[I] & 0xC0) != 0x80)
329 .unwrap_or(0);
330 let Preview = String::from_utf8_lossy(&LineBytes[..SafeCap]).to_string();
331
332 // `line_number(true)` was set on the SearcherBuilder so this
333 // returns Some(n) (1-based). Default to 1 if we somehow lose
334 // it - rendering "line 0" looked wrong even when the rest of
335 // the data was correct.
336 let LineNumber = Mat.line_number().unwrap_or(1);
337
338 // Build a byte→char map ONCE per line so every column lookup
339 // is O(log n) (binary search) instead of O(n) (the previous
340 // `char_indices().position()` per call). On lines with many
341 // matches this collapses the per-line work from quadratic to
342 // linear, which is the difference between a 6 s search and a
343 // minutes-long hang on workspaces that contain match-dense
344 // minified bundles.
345 let mut CharBoundaries:Vec<usize> = Vec::with_capacity(Preview.len() / 2 + 1);
346 for (B, _) in Preview.char_indices() {
347 CharBoundaries.push(B);
348 }
349 CharBoundaries.push(Preview.len()); // Sentinel for end-of-string.
350 let ByteToChar = |Byte:usize| -> u64 {
351 match CharBoundaries.binary_search(&Byte) {
352 Ok(Index) => Index as u64,
353 Err(Index) => Index as u64,
354 }
355 };
356
357 // Walk the line bytes and collect every sub-line range the
358 // matcher hits. Multiple matches per line are common
359 // (e.g. searching for `test` in `test test`); each becomes its
360 // own ColumnRange so the renderer underlines them all. Cap at
361 // `MAX_COLUMNS_PER_LINE` to bound work on pathological lines
362 // where a regex matches every character (e.g. `.` or `\w`
363 // against a long minified line).
364 const MAX_COLUMNS_PER_LINE:usize = 100;
365 let mut Columns:Vec<ColumnRange> = Vec::new();
366 let mut StartByte = 0usize;
367 // Search within the truncated preview so columns line up with
368 // the preview text the renderer will display.
369 let SearchBytes = &LineBytes[..SafeCap];
370 while StartByte <= SearchBytes.len() && Columns.len() < MAX_COLUMNS_PER_LINE {
371 match self.matcher.find_at(SearchBytes, StartByte) {
372 Ok(Some(M)) => {
373 if M.start() >= SearchBytes.len() {
374 break;
375 }
376 Columns.push(ColumnRange { start:ByteToChar(M.start()), end:ByteToChar(M.end()) });
377 // `M.end() == M.start()` happens for zero-width
378 // matches (e.g. `\b`); advance by one byte to
379 // avoid an infinite loop.
380 StartByte = if M.end() == M.start() { M.end() + 1 } else { M.end() };
381 },
382 _ => break,
383 }
384 }
385
386 // Since this sink is per-file, we know `self.path` is correct.
387 let FileURI = url::Url::from_file_path(&self.path)
388 .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Could not convert path to URL"))?
389 .to_string();
390
391 let NewMatch = TextMatch { preview:Preview, line_number:LineNumber, columns:Columns };
392
393 // Mutex acquired AFTER the column-range scan so contention
394 // doesn't serialise the per-line regex work across the
395 // `WalkBuilder::build_parallel()` workers.
396 let mut ResultsGuard = self
397 .results
398 .lock()
399 .map_err(|Error| io::Error::new(io::ErrorKind::Other, Error.to_string()))?;
400
401 // Find the entry for our file, or create it if it's the first match.
402 if let Some(FileMatch) = ResultsGuard.iter_mut().find(|fm| fm.resource == FileURI) {
403 FileMatch.matches.push(NewMatch);
404 } else {
405 ResultsGuard.push(FileMatch { resource:FileURI, matches:vec![NewMatch] });
406 }
407
408 // Continue searching
409 Ok(true)
410 }
411}
412
413#[async_trait]
414impl SearchProvider for MountainEnvironment {
415 async fn TextSearch(&self, QueryValue:Value, _OptionsValue:Value) -> Result<Value, CommonError> {
416 let Query:TextSearchQuery = serde_json::from_value(QueryValue)?;
417
418 dev_log!("search", "[SearchProvider] Performing text search for: {:?}", Query);
419
420 let mut Builder = RegexMatcherBuilder::new();
421
422 Builder
423 .case_insensitive(!Query.is_case_sensitive.unwrap_or(false))
424 .word(Query.is_word_match.unwrap_or(false))
425 .multi_line(Query.is_multiline.unwrap_or(false));
426
427 // When `isRegExp` is false/missing (the default for the Search
428 // view's plain-text mode), escape the pattern so literal
429 // searches for strings containing regex metacharacters
430 // (`.`, `(`, `[`, `*`, `?`, etc.) don't crash the compiler
431 // or silently match the wrong thing.
432 let CompiledPattern = if Query.is_reg_exp.unwrap_or(false) {
433 Query.pattern.clone()
434 } else {
435 regex::escape(&Query.pattern)
436 };
437
438 let Matcher = Builder.build(&CompiledPattern).map_err(|Error| {
439 CommonError::InvalidArgument { ArgumentName:"pattern".into(), Reason:Error.to_string() }
440 })?;
441
442 let AllMatches = Arc::new(Mutex::new(Vec::<FileMatch>::new()));
443
444 let Folders = self
445 .ApplicationState
446 .Workspace
447 .WorkspaceFolders
448 .lock()
449 .map_err(Utility::ErrorMapping::MapApplicationStateLockErrorToCommonError)?
450 .clone();
451
452 if Folders.is_empty() {
453 dev_log!("search", "warn: [SearchProvider] No workspace folders to search in.");
454
455 return Ok(json!([]));
456 }
457
458 for Folder in Folders {
459 if let Ok(FolderPath) = Folder.URI.to_file_path() {
460 // Use a parallel walker for better performance.
461 let Walker = WalkBuilder::new(FolderPath).build_parallel();
462
463 // The `search_parallel` method is not available on `Searcher`. We must process
464 // entries from the walker and call `search_path` individually.
465 Walker.run(|| {
466 // `line_number(true)` is mandatory - without it,
467 // `SinkMatch::line_number()` returns None and every
468 // match lands at line 0, which the renderer treats
469 // as "no line info" and collapses into an
470 // uncategorised count-of-zero. The default
471 // `Searcher::new()` constructor disables line
472 // numbers for performance.
473 let mut Searcher = SearcherBuilder::new().line_number(true).build();
474
475 let Matcher = Matcher.clone();
476
477 let AllMatches = AllMatches.clone();
478
479 Box::new(move |EntryResult| {
480 if let Ok(Entry) = EntryResult {
481 if Entry.file_type().map_or(false, |ft| ft.is_file()) {
482 // For each file, create a new sink that knows its path.
483 let Sink = PerFileSink {
484 path:Entry.path().to_path_buf(),
485 results:AllMatches.clone(),
486 matcher:Matcher.clone(),
487 };
488
489 if let Err(Error) = Searcher.search_path(&Matcher, Entry.path(), Sink) {
490 dev_log!(
491 "search",
492 "warn: [SearchProvider] Error searching path {}: {}",
493 Entry.path().display(),
494 Error
495 );
496 }
497 }
498 }
499
500 ignore::WalkState::Continue
501 })
502 });
503 }
504 }
505
506 let FinalMatches = AllMatches
507 .lock()
508 .map_err(|Error| CommonError::StateLockPoisoned { Context:Error.to_string() })?
509 .clone();
510
511 let TotalLineMatches:usize = FinalMatches.iter().map(|F| F.matches.len()).sum();
512 dev_log!(
513 "search",
514 "[SearchProvider] returned {} files / {} line-matches for pattern={:?}",
515 FinalMatches.len(),
516 TotalLineMatches,
517 Query.pattern
518 );
519
520 Ok(json!(FinalMatches))
521 }
522}