Coverage for debputy/path_matcher.py: 78%
362 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-22 14:29 +0100
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-22 14:29 +0100
1import fnmatch
2import glob
3import itertools
4import os
5import re
6from abc import abstractmethod
7from enum import Enum
8from typing import Callable, Optional, Generic, TypeVar, Dict, List, Tuple, Iterable, TYPE_CHECKING, Union, Sequence
10from debputy.intermediate_manifest import PathType
11from debputy.substitution import Substitution, NULL_SUBSTITUTION
12from debputy.util import _normalize_path
14if TYPE_CHECKING:
15 from debputy.filesystem_scan import FSPath
# Type variable for the value that a PathMatcher associates with each match rule.
MR = TypeVar('MR')
# Matches one glob construct at a time: a "*" or "?" wildcard, or a bracket
# expression such as "[abc]" (a "]" directly after the opening "[" is accepted
# as part of the expression, e.g. "[]a]").
_GLOB_PARTS = re.compile(r'[*?]|\[]?[^]]+]')
def _glob_match_weight(glob_pattern: str) -> Tuple[int, int]:
    """Compute how "specific" a glob pattern is.

    :param glob_pattern: The glob pattern to weigh.
    :return: A tuple of (number of characters that are not part of a glob
      construct, length of the literal prefix before the first glob construct).
      For a pattern without any glob construct both values are the length of
      the pattern.
    """
    spans = [found.span() for found in _GLOB_PARTS.finditer(glob_pattern)]
    pattern_length = len(glob_pattern)
    wildcard_chars = sum(stop - start for start, stop in spans)
    # finditer reports matches from left to right, so the first span (if any)
    # marks where the literal prefix ends.
    literal_prefix = spans[0][0] if spans else pattern_length
    return pattern_length - wildcard_chars, literal_prefix
31def _lookup_path(fs_root: 'FSPath', path: str) -> Optional['FSPath']:
32 if not path.startswith('./'): 32 ↛ 33line 32 didn't jump to line 33, because the condition on line 32 was never true
33 raise ValueError("Directory must be normalized (and not the root directory)")
34 if fs_root.basename != '.' or fs_root.parent_dir is not None: 34 ↛ 35line 34 didn't jump to line 35, because the condition on line 34 was never true
35 raise ValueError("Provided fs_root must be the root directory")
36 return fs_root.lookup(path[2:])
39def _compile_basename_glob(basename_glob: str) -> Callable[[str], bool]:
40 remainder = None
41 if basename_glob.startswith('*'):
42 if basename_glob.endswith('*'):
43 remainder = basename_glob[1:-1]
44 possible_quick_match = lambda x: remainder in x
45 else:
46 remainder = basename_glob[1:]
47 possible_quick_match = lambda x: x.endswith(remainder)
48 else:
49 remainder = basename_glob[:-1]
50 possible_quick_match = lambda x: x.startswith(remainder)
52 if possible_quick_match is not None and not glob.has_magic(remainder):
53 return possible_quick_match
54 slow_pattern = re.compile(fnmatch.translate(basename_glob))
55 return lambda x: bool(slow_pattern.match(x))
58def _apply_match(fs_path: 'FSPath', match_part: Union[Callable[[str], bool], str]) -> Iterable['FSPath']:
59 if isinstance(match_part, str):
60 m = fs_path.lookup(match_part)
61 if m:
62 yield m
63 else:
64 yield from (p for p in fs_path.children if match_part(p.basename))
class MatchRuleType(Enum):
    """The kinds of match rules.

    Each member value is a tuple of (key, sort key, tie-breaker); the
    properties below expose the individual fields.
    """

    EXACT_MATCH = ('exact', 999, 0)
    BASENAME_GLOB = ('basename-glob', 1, 0)
    DIRECT_CHILDREN_OF_DIR = ('direct-children-of-dir', 1, 1)
    ANYTHING_BENEATH_DIR = ('anything-beneath-dir', 1, 0)
    GENERIC_GLOB = ('generic-glob', 1, 0)
    MATCH_ANYTHING = ('match-anything', 0, 0)

    @property
    def key(self) -> str:
        """The textual name of the rule type."""
        label, _priority, _tie_breaker = self.value
        return label

    @property
    def sort_key(self) -> int:
        """The primary ordering weight of the rule type."""
        _label, priority, _tie_breaker = self.value
        return priority

    @property
    def rule_tiebreaker(self) -> int:
        """The weight used to order rule types that are otherwise equal."""
        _label, _priority, tie_breaker = self.value
        return tie_breaker
class MatchRule:
    """Base class for rules that select paths from a file system tree.

    Subclasses implement the actual matching (finditer), the pattern they
    represent (_full_pattern) and equality/hashing.  The class methods
    recursive_beneath_directory and from_path_or_glob pick the most specific
    rule implementation for a given input.
    """

    __slots__ = ('_rule_type',)

    def __init__(self, rule_type: MatchRuleType) -> None:
        self._rule_type = rule_type

    @property
    def rule_type(self) -> MatchRuleType:
        """The kind of rule this instance represents."""
        return self._rule_type

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Return the paths beneath fs_root selected by this rule (subclass hook)."""
        raise NotImplementedError

    def _full_pattern(self) -> str:
        """Return the pattern of this rule as a string (subclass hook)."""
        raise NotImplementedError

    def _match_weight(self) -> Tuple[int, int]:
        """Return the weight of the pattern as computed by _glob_match_weight."""
        return _glob_match_weight(self._full_pattern())

    @property
    def path_type(self) -> Optional[PathType]:
        """The path type the rule is restricted to; None means no restriction."""
        return None

    def describe_match_short(self) -> str:
        """Return a short description of what the rule matches."""
        return self._full_pattern()

    def describe_match_exact(self) -> str:
        """Return a detailed description of what the rule matches (subclass hook)."""
        raise NotImplementedError

    def lookup_key(self) -> Optional[str]:
        """Return the path or directory the rule is anchored to, if there is one."""
        return None

    def __hash__(self) -> int:
        raise NotImplementedError

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        return False

    def sort_key(self):
        """Return the tuple used to order match rules.

        PathMatcher.from_dict sorts the rules by this key.
        """
        kind = self._rule_type
        matched_chars, literal_prefix = self._match_weight()
        typed_bonus = 0 if self.path_type is None else 1
        return (
            kind.sort_key,
            matched_chars,
            literal_prefix,
            typed_bonus,
            # Used by directory matches to prefer "foo/*" over "foo/**/*".
            kind.rule_tiebreaker,
            # The _full_pattern is just there as a tie-breaker
            self._full_pattern(),
        )

    @property
    def is_path_type_part_of_match(self) -> bool:
        """Whether the rule only matches paths of one specific path type."""
        return self.path_type is not None

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return an equivalent rule that is not restricted to a path type.

        The base implementation returns the rule itself; rules that support a
        path type restriction override this.
        """
        assert not self.is_path_type_part_of_match
        return self

    @classmethod
    def recursive_beneath_directory(cls,
                                    directory: str,
                                    definition_source: str,
                                    path_type: Optional[PathType] = None,
                                    substitution: Substitution = NULL_SUBSTITUTION,
                                    ) -> 'MatchRule':
        """Create a rule that matches everything beneath a directory.

        :param directory: The directory; it must not contain glob characters.
        :param definition_source: Passed to the substitution together with the
          normalized directory.
        :param path_type: If given, only paths of this type are matched.
        :param substitution: Substitution applied to the normalized directory.
        :return: MATCH_ANYTHING when the directory is "." or "/"; otherwise a
          DirectoryBasedMatch of type ANYTHING_BENEATH_DIR.
        """
        if directory in ('.', '/'):
            return MATCH_ANYTHING
        assert not glob.has_magic(directory)
        resolved_directory = substitution.substitute(_normalize_path(directory), definition_source)
        return DirectoryBasedMatch(
            MatchRuleType.ANYTHING_BENEATH_DIR,
            resolved_directory,
            path_type=path_type,
        )

    @classmethod
    def from_path_or_glob(cls,
                          path_or_glob: str,
                          definition_source: str,
                          path_type: Optional[PathType] = None,
                          substitution: Substitution = NULL_SUBSTITUTION,
                          ) -> 'MatchRule':
        """Create the most specific rule for a path or glob pattern.

        :param path_or_glob: A path or a glob pattern.
        :param definition_source: Where the pattern was defined; passed to the
          substitution and used in the error message.
        :param path_type: If given, only paths of this type are matched.  It
          must be None when the input results in MATCH_ANYTHING or an exact
          path match.
        :param substitution: Substitution applied to the pattern parts.
        :return: MATCH_ANYTHING, an ExactFileSystemPath, a DirectoryBasedMatch,
          a BasenameGlobMatch or a GenericGlobImplementation.
        :raises ValueError: If "**" is used in any other way than as the whole
          directory part ("**/<basename-glob>").
        """
        # TODO: Handle '{a,b,c}' patterns too
        normalized_no_prefix = _normalize_path(path_or_glob, with_prefix=False)
        if path_or_glob in ('*', '**/*', '.', '/'):
            assert path_type is None
            return MATCH_ANYTHING

        # We do not support {a,b} at the moment.  This check is not perfect, but it should catch the most obvious
        # unsupported usage.
        assert '{' not in path_or_glob or '${' in path_or_glob

        normalized_with_prefix = './' + normalized_no_prefix
        # TODO: Check for escapes here "foo[?]/bar" can be written as an exact match for foo?/bar
        #  - similar holds for "foo[?]/*" being a directory match (etc.).
        if not glob.has_magic(normalized_with_prefix):
            assert path_type is None
            exact_path = substitution.substitute(normalized_with_prefix, definition_source)
            return ExactFileSystemPath(exact_path)

        directory, basename = os.path.split(normalized_with_prefix)
        directory_is_double_star = directory == './**'

        if ('**' in directory and not directory_is_double_star) or '**' in basename:
            raise ValueError(f'Cannot process pattern "{path_or_glob}" from {definition_source}: The double-star'
                             ' glob ("**") is not supported in general. Only "**/<basename-glob>" supported.')

        directory_is_literal = not glob.has_magic(directory)

        if basename == '*' and directory_is_literal:
            return DirectoryBasedMatch(
                MatchRuleType.DIRECT_CHILDREN_OF_DIR,
                substitution.substitute(directory, definition_source),
                path_type=path_type,
            )

        if (directory_is_double_star or directory_is_literal) and glob.has_magic(basename):
            basename_glob = substitution.substitute(basename, definition_source, escape_glob_characters=True)
            if directory in ('.', './**'):
                return BasenameGlobMatch(
                    basename_glob,
                    path_type=path_type,
                    recursive_match=True,
                )
            return BasenameGlobMatch(
                basename_glob,
                only_when_in_directory=substitution.substitute(directory, definition_source),
                path_type=path_type,
                recursive_match=False,
            )

        return GenericGlobImplementation(normalized_with_prefix, path_type=path_type)
def _match_file_type(path_type, path: 'FSPath') -> bool:
    """Check whether a path is of the requested path type.

    :param path_type: One of PathType.FILE, PathType.DIRECTORY or
      PathType.SYMLINK.
    :param path: The path to check.
    :return: True if the attribute of path corresponding to path_type
      (is_file / is_dir / is_symlink) is true, otherwise False.
    """
    probes = (
        (PathType.FILE, 'is_file'),
        (PathType.DIRECTORY, 'is_dir'),
        (PathType.SYMLINK, 'is_symlink'),
    )
    for expected_type, attribute in probes:
        if path_type == expected_type and getattr(path, attribute):
            return True
    # Only reached when the path did not match; reject unsupported path types.
    assert path_type in (PathType.FILE, PathType.DIRECTORY, PathType.SYMLINK)
    return False
class MatchAnything(MatchRule):
    """Rule that matches every path; only available as the MATCH_ANYTHING singleton."""

    def __init__(self):
        super().__init__(MatchRuleType.MATCH_ANYTHING)

    def _full_pattern(self) -> str:
        return '**/*'

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield everything returned by ``fs_root.all_paths()``."""
        for entry in fs_root.all_paths():
            yield entry

    def _match_weight(self) -> Tuple[int, int]:
        # Matching anything is the least specific rule possible.
        return 0, 0

    def describe_match_exact(self) -> str:
        return f'{self._full_pattern()} (Match anything)'

    def __hash__(self) -> int:
        return hash((self._rule_type, '**/*'))

    def __eq__(self, other: object) -> bool:
        # There is exactly one instance, so identity is equality.
        return other is self


MATCH_ANYTHING: MatchRule = MatchAnything()

# The class name is removed so that no further instances can be created by name.
del MatchAnything
class ExactFileSystemPath(MatchRule):
    """Rule that matches one exact path (no globbing)."""

    __slots__ = '_path'

    def __init__(self, path: str) -> None:
        super().__init__(MatchRuleType.EXACT_MATCH)
        self._path = path

    def _full_pattern(self) -> str:
        return self._path

    def _match_weight(self) -> Tuple[int, int]:
        # Every character of an exact path is a literal.
        length = len(self._path)
        return length, length

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the path if it can be looked up beneath fs_root."""
        found = _lookup_path(fs_root, self._path)
        if found is None:
            return
        yield found

    def describe_match_exact(self) -> str:
        return f'{self._path} (the exact path / no globbing)'

    @property
    def path(self) -> str:
        """The path matched by this rule."""
        return self._path

    def lookup_key(self) -> str:
        return self._path

    def __hash__(self) -> int:
        return hash((self.rule_type, self._path))

    def __eq__(self, other: object) -> bool:
        if isinstance(other, ExactFileSystemPath):
            return self._path == other._path
        return NotImplemented
class DirectoryBasedMatch(MatchRule):
    """Rule that matches the content of a directory.

    Depending on the rule type, either the direct children of the directory
    (DIRECT_CHILDREN_OF_DIR) or everything beneath it (ANYTHING_BENEATH_DIR)
    is matched, optionally restricted to a path type.
    """

    __slots__ = ('_directory', '_path_type')

    def __init__(self, rule_type: MatchRuleType, directory: str, path_type: Optional[PathType] = None) -> None:
        super().__init__(rule_type)
        self._directory = directory
        self._path_type = path_type
        supported_types = (MatchRuleType.DIRECT_CHILDREN_OF_DIR, MatchRuleType.ANYTHING_BENEATH_DIR)
        assert rule_type in supported_types
        assert not self._directory.endswith('/')

    def _full_pattern(self) -> str:
        return self._directory

    def _match_weight(self) -> Tuple[int, int]:
        # directory + "/"
        weight = 1 + len(self._directory)
        return weight, weight

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the matching entries of the directory.

        Nothing is yielded when the directory cannot be looked up or is not a
        directory.
        """
        base_dir = _lookup_path(fs_root, self._directory)
        if base_dir is None or not base_dir.is_dir:
            return
        recursive = self._rule_type == MatchRuleType.ANYTHING_BENEATH_DIR
        candidates = base_dir.all_paths() if recursive else base_dir.children
        wanted_type = self._path_type
        if wanted_type is None:
            yield from candidates
            return
        for candidate in candidates:
            if _match_file_type(wanted_type, candidate):
                yield candidate

    def describe_match_short(self) -> str:
        path_type_match = '' if self._path_type is None else f' <only for path type {self._path_type.manifest_key}>'
        glob_suffix = '/**/*' if self._rule_type == MatchRuleType.ANYTHING_BENEATH_DIR else '/*'
        return f'{self._directory}{glob_suffix}{path_type_match}'

    def describe_match_exact(self) -> str:
        if self._rule_type != MatchRuleType.ANYTHING_BENEATH_DIR:
            return f'{self.describe_match_short()} (anything directly in the directory)'
        return f'{self._directory}/**/* (anything below the directory)'

    def lookup_key(self) -> str:
        return self._directory

    def __hash__(self) -> int:
        return hash((self.rule_type, self._directory, self._path_type))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DirectoryBasedMatch):
            return NotImplemented
        mine = (self._rule_type, self._directory, self._path_type)
        theirs = (other._rule_type, other._directory, other._path_type)
        return mine == theirs

    @property
    def path_type(self) -> Optional[PathType]:
        """The path type the rule is restricted to; None means no restriction."""
        return self._path_type

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return the same directory rule without the path type restriction."""
        if not self.is_path_type_part_of_match:
            return self
        return DirectoryBasedMatch(self._rule_type, self._directory, path_type=None)
class BasenameGlobMatch(MatchRule):
    """Rule that matches paths whose basename satisfies a glob.

    Without a directory the glob is applied to every path beneath the root.
    With a directory it is applied to the direct children of that directory,
    or to everything beneath it when recursive_match is true.  The match can
    optionally be restricted to a path type.
    """

    __slots__ = '_basename_glob', '_directory', '_matcher', '_path_type', '_recursive_match'

    def __init__(self,
                 basename_glob: str,
                 only_when_in_directory: Optional[str] = None,
                 path_type: Optional[PathType] = None,
                 recursive_match: Optional[bool] = None
                 ) -> None:
        """Create the rule.

        :param basename_glob: Glob for the basename; it must contain glob
          characters and neither "/" nor "**".
        :param only_when_in_directory: If given, only paths in (or, with
          recursive_match, beneath) this directory are considered.  It must
          not end with "/".
        :param path_type: If given, only paths of this type are matched.
        :param recursive_match: Whether to search recursively.  Without a
          directory the match is always recursive; with a directory the
          default (None) means non-recursive.
        """
        super().__init__(MatchRuleType.BASENAME_GLOB)
        self._basename_glob = basename_glob
        self._directory = only_when_in_directory
        self._path_type = path_type
        # Always store a real bool.  Keeping the raw None default would make
        # two rules that behave identically (None vs. False with a directory)
        # compare unequal in __eq__.
        self._recursive_match = True if only_when_in_directory is None else bool(recursive_match)
        assert self._directory is None or not self._directory.endswith('/')
        assert '/' not in basename_glob  # Not a basename if it contains /
        assert '**' not in basename_glob  # Also not a (true) basename if it has **
        assert glob.has_magic(basename_glob)
        self._matcher = _compile_basename_glob(basename_glob)

    def _full_pattern(self) -> str:
        if self._directory is not None:
            maybe_recursive = "**/" if self._recursive_match else ""
            return f'{self._directory}/{maybe_recursive}{self._basename_glob}'
        return self._basename_glob

    def _match_weight(self) -> Tuple[int, int]:
        ml, pl = _glob_match_weight(self._basename_glob)

        dl = len(self._directory) if self._directory is not None else 0
        # +1 for the trailing slash
        # (note this holds for the "**/<foo>" case as well)
        dl += 1
        return ml + dl, pl + dl

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the paths whose basename matches the glob.

        Nothing is yielded when a directory was given but it cannot be looked
        up or is not a directory.
        """
        search_root = fs_root
        if self._directory is not None:
            p = _lookup_path(fs_root, self._directory)
            if p is None or not p.is_dir:
                return
            search_root = p
        path_iter = search_root.all_paths() if self._recursive_match else search_root.children
        if self._path_type is None:
            yield from (m for m in path_iter if self._matcher(m.basename))
        else:
            yield from (m for m in path_iter if self._matcher(m.basename) and _match_file_type(self._path_type, m))

    def describe_match_short(self) -> str:
        path_type_match = '' if self._path_type is None else f' <only for path type {self._path_type.manifest_key}>'
        return self._full_pattern() if path_type_match == '' else f'{self._full_pattern()}{path_type_match}'

    def describe_match_exact(self) -> str:
        if self._directory is not None:
            return f'{self.describe_match_short()} (glob / directly in the directory)'
        return f'{self.describe_match_short()} (basename match)'

    def lookup_key(self) -> Optional[str]:
        return self._directory

    def __hash__(self) -> int:
        return hash((self.rule_type, self._basename_glob, self._directory, self._path_type))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, BasenameGlobMatch):
            return NotImplemented
        return (self._basename_glob == other._basename_glob
                and self._directory == other._directory
                and self._path_type == other._path_type
                and self._recursive_match == other._recursive_match
                )

    @property
    def path_type(self) -> Optional[PathType]:
        """The path type the rule is restricted to; None means no restriction."""
        return self._path_type

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return the same glob rule without the path type restriction."""
        if self.is_path_type_part_of_match:
            return BasenameGlobMatch(self._basename_glob,
                                     self._directory,
                                     recursive_match=self._recursive_match,
                                     path_type=None,)
        return self
class GenericGlobImplementation(MatchRule):
    """Rule for globs that need matching on more than the last path segment.

    The pattern is split on "/" and every segment is either a literal name or
    a compiled basename glob; the segments are applied one directory level at
    a time.
    """

    __slots__ = ('_glob_pattern', '_path_type', '_match_parts')

    def __init__(self,
                 glob_pattern: str,
                 path_type: Optional[PathType] = None,
                 ) -> None:
        """Create the rule.

        :param glob_pattern: The glob pattern; a leading "./" is removed.  It
          must contain glob characters and a "/" but no "**".
        :param path_type: If given, only paths of this type are matched.
        """
        super().__init__(MatchRuleType.GENERIC_GLOB)
        pattern = glob_pattern[2:] if glob_pattern.startswith('./') else glob_pattern
        self._glob_pattern = pattern
        self._path_type = path_type
        assert '**' not in pattern  # No recursive globs
        assert glob.has_magic(pattern)  # If it has no glob, then it could have been an exact match
        assert '/' in pattern  # If it does not have a / then a BasenameGlob could have been used instead
        self._match_parts = self._compile_glob()

    def _full_pattern(self) -> str:
        return self._glob_pattern

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the paths matching the pattern, walking one segment per level."""
        current_layer = [fs_root]
        for part in self._match_parts:
            # Each layer is built eagerly.  A lazily chained generator would
            # only look up the loop variable `part` when it is consumed, i.e.
            # after the loop has already moved on to a later segment.
            current_layer = [
                hit
                for node in current_layer
                for hit in _apply_match(node, part)
            ]
            if not current_layer:
                # Nothing matched at this level, so nothing can match below it.
                return

        wanted_type = self._path_type
        if wanted_type is None:
            yield from current_layer
            return
        for candidate in current_layer:
            if _match_file_type(wanted_type, candidate):
                yield candidate

    def describe_match_short(self) -> str:
        if self._path_type is None:
            return self._full_pattern()
        path_type_match = f' <only for path type {self._path_type.manifest_key}>'
        return f'{self._full_pattern()}{path_type_match}'

    def describe_match_exact(self) -> str:
        return f'{self.describe_match_short()} (glob)'

    def _compile_glob(self) -> Sequence[Union[Callable[[str], bool], str]]:
        """Split the pattern on "/" and compile the segments that contain globs."""
        assert self._glob_pattern.strip('/') == self._glob_pattern
        compiled = []
        for segment in self._glob_pattern.split('/'):
            if glob.has_magic(segment):
                compiled.append(_compile_basename_glob(segment))
            else:
                compiled.append(segment)
        return compiled

    def __hash__(self) -> int:
        return hash((self.rule_type, self._glob_pattern, self._path_type))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, GenericGlobImplementation):
            return NotImplemented
        return self._glob_pattern == other._glob_pattern and self._path_type == other._path_type

    @property
    def path_type(self) -> Optional[PathType]:
        """The path type the rule is restricted to; None means no restriction."""
        return self._path_type

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return the same glob rule without the path type restriction."""
        if not self.is_path_type_part_of_match:
            return self
        return GenericGlobImplementation(self._glob_pattern, path_type=None)
class PathMatcher(Generic[MR]):
    """Resolve which value (of type MR) applies to each path of a file system.

    Glob rules are applied in the order given, so a later rule overrides an
    earlier one for the paths both match.  Exact path rules are applied last
    and paths matched by no rule get the global default.
    """

    def __init__(self,
                 exact_match_rules: Dict[str, Tuple[MatchRule, MR]],
                 global_globs: List[Tuple[MatchRule, MR]],
                 global_default_match: MR,
                 ) -> None:
        """Create the matcher.

        :param exact_match_rules: Exact path rules and their values, keyed by
          the path they match.
        :param global_globs: The remaining rules with their values, in the
          order they should be applied.
        :param global_default_match: The value for paths that no rule matches.
        """
        self._exact_match_rules = exact_match_rules
        self._globs = global_globs
        self._global_default_match = global_default_match

    def resolve_all(self, fs_root: 'FSPath') -> Iterable[Tuple['FSPath', MR]]:
        """Yield (path, value) for every path returned by ``fs_root.all_paths()``."""
        fallback = MATCH_ANYTHING, self._global_default_match
        match_state: Dict[str, Tuple[MatchRule, MR]] = dict.fromkeys(
            (entry.path for entry in fs_root.all_paths()),
            fallback,
        )
        for rule_def in self._globs:
            rule, _ = rule_def
            for hit in rule.finditer(fs_root):
                match_state[hit.path] = rule_def

        # Exact matches always win over glob based matches.
        match_state.update(self._exact_match_rules)

        for entry in fs_root.all_paths():
            yield entry, match_state[entry.path][1]

    @classmethod
    def from_dict(cls, path_specs: Dict[MatchRule, MR], builtin_default_match: MR):
        """Build a PathMatcher from a mapping of match rules to values.

        The rules are processed in the order of MatchRule.sort_key().

        :param path_specs: The match rules and the value for each of them.
        :param builtin_default_match: The default value used when path_specs
          does not contain MATCH_ANYTHING.
        :return: The resulting PathMatcher.
        :raises ValueError: If an ANYTHING_BENEATH_DIR rule is used for "."
          (MATCH_ANYTHING must be used instead).
        """
        exact_match_rules = {}
        global_globs = []
        global_default_match = None
        ordered_specs = sorted(path_specs.items(), key=lambda item: item[0].sort_key())
        for rule, info in ordered_specs:
            lookup_key = rule.lookup_key()
            if rule.rule_type == MatchRuleType.EXACT_MATCH:
                assert lookup_key is not None
                exact_match_rules[lookup_key] = rule, info
                continue
            if rule.rule_type == MatchRuleType.ANYTHING_BENEATH_DIR and lookup_key == '.':
                raise ValueError("Please use MATCH_ANYTHING instead of a ANYTHING_BENEATH_DIR rule for '.'")
            if rule is MATCH_ANYTHING:
                assert global_default_match is None
                global_default_match = info
            else:
                global_globs.append((rule, info))

        if global_default_match is None:
            global_default_match = builtin_default_match

        return PathMatcher(exact_match_rules, global_globs, global_default_match)