Skip to main content

Mountain/Environment/
SearchProvider.rs

1//! # SearchProvider (Environment)
2//!
3//! Implements the `SearchProvider` trait for `MountainEnvironment`, providing
4//! text search capabilities across files and content within the workspace.
5//!
6//! ## RESPONSIBILITIES
7//!
8//! ### 1. Search Execution
9//! - Search for text patterns in files using glob patterns
10//! - Support regular expression search
11//! - Search file contents and/or file names
12//! - Handle large result sets efficiently
13//!
14//! ### 2. Search Results
15//! - Return structured search results with matches
16//! - Include file URI, line number, column, and matching text
17//! - Support paging and result limiting
18//! - Sort results by relevance or file order
19//!
20//! ### 3. Search Configuration
21//! - Respect workspace file exclusion patterns (.gitignore)
22//! - Honor file size limits for search
23//! - Support case-sensitive and whole-word matching
24//! - Handle symbolic links appropriately
25//!
26//! ### 4. Search Cancellation
27//! - Support cancellation of long-running searches
28//! - Clean up resources on cancellation
29//! - Provide progress feedback (optional)
30//!
31//! ## ARCHITECTURAL ROLE
32//!
33//! SearchProvider is the **workspace search engine**:
34//!
35//! ```text
36//! Search Request ──► SearchProvider ──► FileSystem Scan ──► Results
37//! ```
38//!
39//! ### Position in Mountain
40//! - `Environment` module: Search capability provider
41//! - Implements `CommonLibrary::Search::SearchProvider` trait
42//! - Accessible via `Environment.Require<dyn SearchProvider>()`
43//!
44//! ### Search Types Supported
45//! - **Text search**: Find files containing text pattern
46//! - **File search**: Find files by name/glob pattern
47//! - **Replace**: (Future) Search and replace operations
48//! - **Context search**: (Future) Search with surrounding context
49//!
50//! ### Dependencies
51//! - `FileSystemReader`: Read file contents for searching
52//! - `WorkspaceProvider`: Get workspace folders to search
53//! - `Log`: Search progress and errors
54//!
55//! ### Dependents
56//! - Search UI panel: User-initiated searches
57//! - Find/Replace dialogs: In-editor search
58//! - Grep-like command-line operations
59//! - Code navigation (symbol search)
60//!
61//! ## SEARCH PROCESS
62//!
63//! 1. **File Discovery**: Walk workspace directories, respecting exclusions
64//! 2. **File Filtering**: Match filenames against include/exclude patterns
65//! 3. **Content Search**: For each file, search for pattern in content
66//! 4. **Match Collection**: Record matches with position information
67//! 5. **Result Formatting**: Return structured search results
68//!
69//! ## PERFORMANCE CONSIDERATIONS
70//!
71//! - Search is I/O bound; consider async and parallel processing
72//! - Large workspaces may have thousands of files
73//! - Use file size limits to prevent memory exhaustion
74//! - Implement result paging for UI responsiveness
75//! - Consider background search indexing for faster repeated searches
76//!
77//! ## ERROR HANDLING
78//!
79//! - Permission denied: Skip file, log warning
80//! - File not found: Skip file (may have been deleted)
81//! - Encoding errors: Try default encoding, skip on failure
82//! - Search cancelled: Stop immediately, return partial results
83//!
84//! ## VS CODE REFERENCE
85//!
86//! Patterns from VS Code:
87//! - `vs/workbench/contrib/search/browser/searchWidget.ts` - Search UI
88//! - `vs/platform/search/common/search.ts` - Search service API
89//! - `vs/platform/search/common/fileSearch.ts` - File system search
90//!
91//! ## TODO
92//!
93//! - [ ] Implement file content indexing for faster searches
94//! - [ ] Add regular expression support with PCRE or regex engine
95//! - [ ] Support search result paging and streaming
96//! - [ ] Add search cancellation with proper cleanup
97//! - [ ] Implement search result highlighting in UI
98//! - [ ] Support search in compressed/archive files
99//! - [ ] Add search across multiple workspaces
100//! - [ ] Implement search history and persistence
101//! - [ ] Add search filters (by language, by file size, etc.)
102//! - [ ] Support search templates and saved searches
103//! - [ ] Implement search result grouping (by folder, by file)
104//! - [ ] Add search performance metrics and logging
105//! - [ ] Support search result export (to file, clipboard)
106//!
107//! ## MODULE CONTENTS
108//!
109//! - [`SearchProvider`]: Main struct implementing the trait
110//! - Search execution methods
111//! - File walking and filtering logic
112//! - Match extraction and formatting
113//! - Search cancellation support
114
115// Responsibilities:
116//   - Perform workspace-wide text searches using `grep-searcher` (the `ripgrep` library).
117//   - Respect workspace folders and standard ignore files (`.gitignore`).
118//   - Collect and format search results into a DTO suitable for the frontend.
119//   - Support regex patterns and case-sensitive/insensitive searches.
120//   - Implement word-boundary matching.
121//   - Optimize for performance with parallel file walking.
122//   - Handle large files efficiently with memory-efficient streaming.
123//   - Support incremental search with result pagination.
124//   - Provide search statistics (matches count, files searched).
125//   - Handle search cancellation gracefully.
126//
127// TODOs:
128//   - Implement result pagination for large result sets
129//   - Add search cancellation via CancellationToken
130//   - Support include/exclude file patterns
131//   - Implement context lines for matches (before/after)
132//   - Add file type filtering (e.g., search only in certain extensions)
133//   - Implement replacement/match highlighting in results
134//   - Add search progress reporting
135//   - Support search across multiple workspace folders independently
136//   - Implement search caching for repeated searches
137//   - Add regex capture groups support
138//   - Implement search history and recent searches
139//   - Support search result export
140//   - Add search performance metrics and optimization
141//   - Implement search result deduplication
142//   - Support glob patterns for file matching
143//   - Add search result ranking and sorting
144//   - Implement binary file handling (skip or search)
145//   - Support symbolic link following
146//   - Add max file size limit to avoid memory issues
147//   - Implement search timeout
148//   - Support search in hidden files
149//   - Add line and column number precision
150//   - Implement multi-line regex search
151//
152// Inspired by VSCode's search service which:
153// - Uses ripgrep for high-performance text search
154// - Supports complex regex patterns and modifiers
155// - Provides context lines for matches
156// - Handles large directories efficiently
157// - Supports file and directory exclusions
158// - Provides incremental search results
159// - Handles search cancellation gracefully
160//! # SearchProvider Implementation
161//!
162//! Implements the `SearchProvider` trait using the `grep-searcher` crate, which
163//! is a library for the `ripgrep` search tool.
164//!
165//! ## Search Architecture
166//!
167//! The search implementation uses a multi-threaded approach:
168//!
169//! 1. **Pattern Compilation**: Regex pattern is compiled with modifiers
170//! 2. **Parallel Walking**: Files in workspace are walked in parallel
171//! 3. **Per-File Search**: Each file is searched individually using a sink
172//!    pattern
173//! 4. **Result Aggregation**: Matches are collected in a shared thread-safe
174//!    vector
175//!
176//! ## Search Features
177//!
178//! - **Case Sensitivity**: Controlled by `is_case_sensitive` option
179//! - **Word Matching**: Controlled by `is_word_match` option
180//! - **Regex Support**: Full regex pattern matching via `grep-regex`
181//! - **Ignore Files**: Respects `.gitignore`, `.ignore`, and other ignore files
182//! - **Parallel Search**: Uses `WalkBuilder::build_parallel()` for performance
183//! - **Memory Efficient**: Streams results to avoid loading entire files
184//!
185//! ## Search Result Format
186//!
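//! Each match includes:
//! - **File URI**: Valid URL pointing to the file (stored once per file group)
//! - **Line Number**: One-based line number of the match
//! - **Preview**: The matched line without its trailing newline, truncated to
//!   at most 512 bytes
//! - **Columns**: Character ranges inside the preview where the pattern matched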
191//!
192//! Results are grouped by file, with each file containing multiple matches.
193//
194
195use std::{
196	io,
197	path::PathBuf,
198	sync::{Arc, Mutex},
199};
200
201use CommonLibrary::{Error::CommonError::CommonError, Search::SearchProvider::SearchProvider};
202use async_trait::async_trait;
203use grep_matcher::Matcher;
204use grep_regex::{RegexMatcher, RegexMatcherBuilder};
205use grep_searcher::{Searcher, SearcherBuilder, Sink, SinkMatch};
206use ignore::WalkBuilder;
207use serde::{Deserialize, Serialize};
208use serde_json::{Value, json};
209
210use super::{MountainEnvironment::MountainEnvironment, Utility};
211use crate::dev_log;
212
213/// Mirrors VS Code's `ITextSearchQuery` shape (`vs/workbench/services/
214/// search/common/search.ts`). The workbench's Search view serialises
215/// the user's input into this struct and the ProxyChannel sends it as
216/// slot 0 of the `search:textSearch` call.
217///
218/// - `pattern`: the user's typed query
219/// - `isRegExp` (default `false`): when `false`, the pattern is
220///   `regex::escape`'d before compilation so a literal search for `obj.method(`
221///   doesn't blow up the regex parser.
222/// - `isCaseSensitive` (default `false`): controls the regex's case-insensitive
223///   flag.
224/// - `isWordMatch` (default `false`): wraps the pattern in `\b…\b` via
225///   `RegexMatcherBuilder::word(true)`.
226/// - `isMultiline` (default `false`): toggles `.` matching `\n`.
227#[derive(Deserialize, Debug, Default)]
228#[serde(rename_all = "camelCase")]
229struct TextSearchQuery {
230	pattern:String,
231
232	#[serde(default)]
233	is_case_sensitive:Option<bool>,
234
235	#[serde(default)]
236	is_word_match:Option<bool>,
237
238	#[serde(default)]
239	is_reg_exp:Option<bool>,
240
241	#[serde(default)]
242	is_multiline:Option<bool>,
243}
244
245/// Per-match column range within the preview line.
246///
247/// `start` and `end` are 0-based UTF-8 character offsets, NOT byte
248/// offsets - VS Code's renderer measures columns in code units, so
249/// pre-converting bytes→chars here keeps the workbench from
250/// mis-highlighting multi-byte UTF-8 lines (the search panel underlines
251/// the wrong substring otherwise).
252///
253/// VS Code's `ISearchRange` is 1-based for line numbers but 0-based
254/// for columns; the SkyBridge consumer adds the +1 line offset there.
255#[derive(Serialize, Clone, Debug)]
256#[serde(rename_all = "camelCase")]
257struct ColumnRange {
258	start:u64,
259
260	end:u64,
261}
262
263#[derive(Serialize, Clone, Debug)]
264#[serde(rename_all = "camelCase")]
265struct TextMatch {
266	preview:String,
267
268	/// 1-based line number (grep-searcher emits 1-based when
269	/// `line_number(true)` is configured on the SearcherBuilder).
270	line_number:u64,
271
272	/// Per-line ranges where the matcher actually matched. A single
273	/// line can contain multiple matches (e.g. `test test test`); each
274	/// gets its own range. Empty when match-position lookup failed -
275	/// in that case the renderer falls back to highlighting the whole
276	/// line.
277	columns:Vec<ColumnRange>,
278}
279
280#[derive(Serialize, Clone, Debug)]
281#[serde(rename_all = "camelCase")]
282struct FileMatch {
283	// URI
284	resource:String,
285
286	matches:Vec<TextMatch>,
287}
288
289// This Sink is designed to be created for each file. It holds a reference to
290// the central results vector and the path of the file it's searching.
291struct PerFileSink {
292	path:PathBuf,
293
294	results:Arc<Mutex<Vec<FileMatch>>>,
295
296	/// Cloned per-thread so the sink can re-run the matcher against the
297	/// raw line bytes to recover column ranges. `SinkMatch::bytes()`
298	/// gives us the matched line but not where in the line the matcher
299	/// hit; calling `Matcher::find_at(...)` ourselves is the documented
300	/// pattern for recovering that information.
301	matcher:RegexMatcher,
302}
303
304impl Sink for PerFileSink {
305	type Error = io::Error;
306
307	fn matched(&mut self, _Searcher:&Searcher, Mat:&SinkMatch<'_>) -> Result<bool, Self::Error> {
308		let RawLine = Mat.bytes();
309		// Trim trailing newline so the preview text the renderer shows
310		// doesn't carry a stray empty line break.
311		let TrimmedLen = if RawLine.ends_with(b"\r\n") {
312			RawLine.len().saturating_sub(2)
313		} else if RawLine.last() == Some(&b'\n') {
314			RawLine.len().saturating_sub(1)
315		} else {
316			RawLine.len()
317		};
318		let LineBytes = &RawLine[..TrimmedLen];
319		// Cap preview length at 512 chars - super-long minified lines
320		// would otherwise force the renderer to layout massive rows
321		// AND make the byte→char map below grow proportionally.
322		const PREVIEW_BYTE_CAP:usize = 512;
323		let CapBytes = LineBytes.len().min(PREVIEW_BYTE_CAP);
324		// Round down to the nearest UTF-8 boundary so `from_utf8_lossy`
325		// doesn't replace half a multibyte char with U+FFFD.
326		let SafeCap = (0..=CapBytes)
327			.rev()
328			.find(|&I| I == 0 || I == LineBytes.len() || (LineBytes[I] & 0xC0) != 0x80)
329			.unwrap_or(0);
330		let Preview = String::from_utf8_lossy(&LineBytes[..SafeCap]).to_string();
331
332		// `line_number(true)` was set on the SearcherBuilder so this
333		// returns Some(n) (1-based). Default to 1 if we somehow lose
334		// it - rendering "line 0" looked wrong even when the rest of
335		// the data was correct.
336		let LineNumber = Mat.line_number().unwrap_or(1);
337
338		// Build a byte→char map ONCE per line so every column lookup
339		// is O(log n) (binary search) instead of O(n) (the previous
340		// `char_indices().position()` per call). On lines with many
341		// matches this collapses the per-line work from quadratic to
342		// linear, which is the difference between a 6 s search and a
343		// minutes-long hang on workspaces that contain match-dense
344		// minified bundles.
345		let mut CharBoundaries:Vec<usize> = Vec::with_capacity(Preview.len() / 2 + 1);
346		for (B, _) in Preview.char_indices() {
347			CharBoundaries.push(B);
348		}
349		CharBoundaries.push(Preview.len()); // Sentinel for end-of-string.
350		let ByteToChar = |Byte:usize| -> u64 {
351			match CharBoundaries.binary_search(&Byte) {
352				Ok(Index) => Index as u64,
353				Err(Index) => Index as u64,
354			}
355		};
356
357		// Walk the line bytes and collect every sub-line range the
358		// matcher hits. Multiple matches per line are common
359		// (e.g. searching for `test` in `test test`); each becomes its
360		// own ColumnRange so the renderer underlines them all. Cap at
361		// `MAX_COLUMNS_PER_LINE` to bound work on pathological lines
362		// where a regex matches every character (e.g. `.` or `\w`
363		// against a long minified line).
364		const MAX_COLUMNS_PER_LINE:usize = 100;
365		let mut Columns:Vec<ColumnRange> = Vec::new();
366		let mut StartByte = 0usize;
367		// Search within the truncated preview so columns line up with
368		// the preview text the renderer will display.
369		let SearchBytes = &LineBytes[..SafeCap];
370		while StartByte <= SearchBytes.len() && Columns.len() < MAX_COLUMNS_PER_LINE {
371			match self.matcher.find_at(SearchBytes, StartByte) {
372				Ok(Some(M)) => {
373					if M.start() >= SearchBytes.len() {
374						break;
375					}
376					Columns.push(ColumnRange { start:ByteToChar(M.start()), end:ByteToChar(M.end()) });
377					// `M.end() == M.start()` happens for zero-width
378					// matches (e.g. `\b`); advance by one byte to
379					// avoid an infinite loop.
380					StartByte = if M.end() == M.start() { M.end() + 1 } else { M.end() };
381				},
382				_ => break,
383			}
384		}
385
386		// Since this sink is per-file, we know `self.path` is correct.
387		let FileURI = url::Url::from_file_path(&self.path)
388			.map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Could not convert path to URL"))?
389			.to_string();
390
391		let NewMatch = TextMatch { preview:Preview, line_number:LineNumber, columns:Columns };
392
393		// Mutex acquired AFTER the column-range scan so contention
394		// doesn't serialise the per-line regex work across the
395		// `WalkBuilder::build_parallel()` workers.
396		let mut ResultsGuard = self
397			.results
398			.lock()
399			.map_err(|Error| io::Error::new(io::ErrorKind::Other, Error.to_string()))?;
400
401		// Find the entry for our file, or create it if it's the first match.
402		if let Some(FileMatch) = ResultsGuard.iter_mut().find(|fm| fm.resource == FileURI) {
403			FileMatch.matches.push(NewMatch);
404		} else {
405			ResultsGuard.push(FileMatch { resource:FileURI, matches:vec![NewMatch] });
406		}
407
408		// Continue searching
409		Ok(true)
410	}
411}
412
413#[async_trait]
414impl SearchProvider for MountainEnvironment {
415	async fn TextSearch(&self, QueryValue:Value, _OptionsValue:Value) -> Result<Value, CommonError> {
416		let Query:TextSearchQuery = serde_json::from_value(QueryValue)?;
417
418		dev_log!("search", "[SearchProvider] Performing text search for: {:?}", Query);
419
420		let mut Builder = RegexMatcherBuilder::new();
421
422		Builder
423			.case_insensitive(!Query.is_case_sensitive.unwrap_or(false))
424			.word(Query.is_word_match.unwrap_or(false))
425			.multi_line(Query.is_multiline.unwrap_or(false));
426
427		// When `isRegExp` is false/missing (the default for the Search
428		// view's plain-text mode), escape the pattern so literal
429		// searches for strings containing regex metacharacters
430		// (`.`, `(`, `[`, `*`, `?`, etc.) don't crash the compiler
431		// or silently match the wrong thing.
432		let CompiledPattern = if Query.is_reg_exp.unwrap_or(false) {
433			Query.pattern.clone()
434		} else {
435			regex::escape(&Query.pattern)
436		};
437
438		let Matcher = Builder.build(&CompiledPattern).map_err(|Error| {
439			CommonError::InvalidArgument { ArgumentName:"pattern".into(), Reason:Error.to_string() }
440		})?;
441
442		let AllMatches = Arc::new(Mutex::new(Vec::<FileMatch>::new()));
443
444		let Folders = self
445			.ApplicationState
446			.Workspace
447			.WorkspaceFolders
448			.lock()
449			.map_err(Utility::ErrorMapping::MapApplicationStateLockErrorToCommonError)?
450			.clone();
451
452		if Folders.is_empty() {
453			dev_log!("search", "warn: [SearchProvider] No workspace folders to search in.");
454
455			return Ok(json!([]));
456		}
457
458		for Folder in Folders {
459			if let Ok(FolderPath) = Folder.URI.to_file_path() {
460				// Use a parallel walker for better performance.
461				let Walker = WalkBuilder::new(FolderPath).build_parallel();
462
463				// The `search_parallel` method is not available on `Searcher`. We must process
464				// entries from the walker and call `search_path` individually.
465				Walker.run(|| {
466					// `line_number(true)` is mandatory - without it,
467					// `SinkMatch::line_number()` returns None and every
468					// match lands at line 0, which the renderer treats
469					// as "no line info" and collapses into an
470					// uncategorised count-of-zero. The default
471					// `Searcher::new()` constructor disables line
472					// numbers for performance.
473					let mut Searcher = SearcherBuilder::new().line_number(true).build();
474
475					let Matcher = Matcher.clone();
476
477					let AllMatches = AllMatches.clone();
478
479					Box::new(move |EntryResult| {
480						if let Ok(Entry) = EntryResult {
481							if Entry.file_type().map_or(false, |ft| ft.is_file()) {
482								// For each file, create a new sink that knows its path.
483								let Sink = PerFileSink {
484									path:Entry.path().to_path_buf(),
485									results:AllMatches.clone(),
486									matcher:Matcher.clone(),
487								};
488
489								if let Err(Error) = Searcher.search_path(&Matcher, Entry.path(), Sink) {
490									dev_log!(
491										"search",
492										"warn: [SearchProvider] Error searching path {}: {}",
493										Entry.path().display(),
494										Error
495									);
496								}
497							}
498						}
499
500						ignore::WalkState::Continue
501					})
502				});
503			}
504		}
505
506		let FinalMatches = AllMatches
507			.lock()
508			.map_err(|Error| CommonError::StateLockPoisoned { Context:Error.to_string() })?
509			.clone();
510
511		let TotalLineMatches:usize = FinalMatches.iter().map(|F| F.matches.len()).sum();
512		dev_log!(
513			"search",
514			"[SearchProvider] returned {} files / {} line-matches for pattern={:?}",
515			FinalMatches.len(),
516			TotalLineMatches,
517			Query.pattern
518		);
519
520		Ok(json!(FinalMatches))
521	}
522}