Coverage for debputy/path_matcher.py: 78%

362 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-01-22 14:29 +0100

1import fnmatch 

2import glob 

3import itertools 

4import os 

5import re 

6from abc import abstractmethod 

7from enum import Enum 

8from typing import Callable, Optional, Generic, TypeVar, Dict, List, Tuple, Iterable, TYPE_CHECKING, Union, Sequence 

9 

10from debputy.intermediate_manifest import PathType 

11from debputy.substitution import Substitution, NULL_SUBSTITUTION 

12from debputy.util import _normalize_path 

13 

14if TYPE_CHECKING: 

15 from debputy.filesystem_scan import FSPath 

16 

# Type variable for the payload that PathMatcher associates with a rule.
MR = TypeVar('MR')
# Matches one glob construct: a single "*" or "?", or a bracket expression
# such as "[abc]" (a "]" placed directly after the opening "[" is accepted
# as part of the bracket expression).
_GLOB_PARTS = re.compile(r'[*?]|\[]?[^]]+]')


def _glob_match_weight(glob_pattern: str) -> Tuple[int, int]:
    """Weigh how specific a glob pattern is.

    :param glob_pattern: The glob pattern to weigh.
    :return: A tuple ``(literal_length, literal_prefix_length)``. The first
      item is the number of characters that are not part of any glob
      construct; the second is the offset of the first glob construct (or
      the full length when the pattern contains none).
    """
    total_len = len(glob_pattern)
    non_glob_prefix_length = total_len
    glob_part_len = 0
    for m in _GLOB_PARTS.finditer(glob_pattern):
        # The earliest match start is where the literal prefix ends.
        non_glob_prefix_length = min(non_glob_prefix_length, m.start())
        glob_part_len += m.end() - m.start()
    return total_len - glob_part_len, non_glob_prefix_length

29 

30 

def _lookup_path(fs_root: 'FSPath', path: str) -> Optional['FSPath']:
    """Resolve a "./"-prefixed path via the root directory's ``lookup``.

    :param fs_root: The root directory; it must have the basename "." and
      no parent directory.
    :param path: The path to resolve. It must start with "./".
    :return: The result of ``fs_root.lookup`` for the path with the leading
      "./" removed.
    :raises ValueError: If ``path`` does not start with "./" or ``fs_root``
      is not the root directory.
    """
    if not path.startswith('./'):
        raise ValueError("Directory must be normalized (and not the root directory)")
    if fs_root.basename != '.' or fs_root.parent_dir is not None:
        raise ValueError("Provided fs_root must be the root directory")
    return fs_root.lookup(path[2:])

37 

38 

def _compile_basename_glob(basename_glob: str) -> Callable[[str], bool]:
    """Compile a basename glob into a predicate over basenames.

    Globs of the form "*literal*", "*literal" and "literal*" (where the
    literal part has no further glob characters) are answered with plain
    string operations. Everything else is translated with
    ``fnmatch.translate`` and matched as a regular expression.

    :param basename_glob: The glob to compile.
    :return: A callable returning True when the given basename matches.
    """
    remainder = None
    possible_quick_match = None
    if basename_glob.startswith('*'):
        if basename_glob.endswith('*'):
            remainder = basename_glob[1:-1]
            possible_quick_match = lambda x: remainder in x
        else:
            remainder = basename_glob[1:]
            possible_quick_match = lambda x: x.endswith(remainder)
    elif basename_glob.endswith('*'):
        # Only a trailing "*" may be dropped to form a prefix test. Without
        # this check a glob such as "foo?" would lose its "?" and wrongly
        # match every name starting with "foo".
        remainder = basename_glob[:-1]
        possible_quick_match = lambda x: x.startswith(remainder)

    # The shortcut is only valid when the literal part is truly literal.
    if possible_quick_match is not None and not glob.has_magic(remainder):
        return possible_quick_match
    slow_pattern = re.compile(fnmatch.translate(basename_glob))
    return lambda x: bool(slow_pattern.match(x))

56 

57 

def _apply_match(fs_path: 'FSPath', match_part: Union[Callable[[str], bool], str]) -> Iterable['FSPath']:
    """Apply one segment of a compiled glob to a path.

    :param fs_path: The path whose entries are examined.
    :param match_part: Either a literal name, which is resolved with
      ``fs_path.lookup``, or a predicate that is applied to the basename of
      each entry in ``fs_path.children``.
    :return: An iterable of the matching paths (empty when nothing matches).
    """
    if isinstance(match_part, str):
        m = fs_path.lookup(match_part)
        if m:
            yield m
    else:
        yield from (p for p in fs_path.children if match_part(p.basename))

65 

66 

class MatchRuleType(Enum):
    """The kinds of match rules.

    Each value is a tuple of ``(key, sort_key, rule_tiebreaker)``.
    """

    EXACT_MATCH = ('exact', 999, 0)
    BASENAME_GLOB = ('basename-glob', 1, 0)
    DIRECT_CHILDREN_OF_DIR = ('direct-children-of-dir', 1, 1)
    ANYTHING_BENEATH_DIR = ('anything-beneath-dir', 1, 0)
    GENERIC_GLOB = ('generic-glob', 1, 0)
    MATCH_ANYTHING = ('match-anything', 0, 0)

    @property
    def key(self) -> str:
        """The textual name of the rule type."""
        return self.value[0]

    @property
    def sort_key(self) -> int:
        """The primary ordering weight of the rule type."""
        return self.value[1]

    @property
    def rule_tiebreaker(self) -> int:
        """The weight used to order rule types that share a sort key."""
        return self.value[2]

86 

87 

class MatchRule:
    """Base class for rules that select paths from a file system tree.

    Subclasses provide `finditer`, `_full_pattern`, `describe_match_exact`,
    `__hash__` and `__eq__`. The remaining methods have defaults suited to
    rules that do not restrict the path type.
    """

    __slots__ = '_rule_type',

    def __init__(self, rule_type: MatchRuleType) -> None:
        self._rule_type = rule_type

    @property
    def rule_type(self) -> MatchRuleType:
        """The kind of rule this instance represents."""
        return self._rule_type

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the paths beneath `fs_root` that this rule matches."""
        raise NotImplementedError

    def _full_pattern(self) -> str:
        """Return the pattern text used for weighing and describing the rule."""
        raise NotImplementedError

    def _match_weight(self) -> Tuple[int, int]:
        """Return the specificity weight of the rule (see `_glob_match_weight`)."""
        return _glob_match_weight(self._full_pattern())

    @property
    def path_type(self) -> Optional[PathType]:
        """The path type the rule is restricted to, or None when unrestricted."""
        return None

    def describe_match_short(self) -> str:
        """Return a short description of what the rule matches."""
        return self._full_pattern()

    def describe_match_exact(self) -> str:
        """Return a description that also states how the rule matches."""
        raise NotImplementedError

    def lookup_key(self) -> Optional[str]:
        """Return the path/directory the rule is anchored to, if any."""
        return None

    def __hash__(self) -> int:
        raise NotImplementedError

    # NOTE: MatchRule does not use ABCMeta, so this decorator is not enforced
    # at instantiation time; it only marks the method as one to override.
    @abstractmethod
    def __eq__(self, other: object) -> bool:
        return False

    def sort_key(self):
        """Return the tuple used to order rules relative to each other.

        The items are, in order: the rule type's sort key, the two match
        weights, whether the rule is restricted to a path type, the rule
        type's tie-breaker and finally the pattern text.
        """
        rule_type = self._rule_type
        ml, pl = self._match_weight()
        path_type_specific_weight = 1 if self.path_type is not None else 0
        # Used by directory matches to prefer "foo/*" over "foo/**/*".
        tie_breaker = rule_type.rule_tiebreaker
        # The _full_pattern is just there as a tie-breaker
        return rule_type.sort_key, ml, pl, path_type_specific_weight, tie_breaker, self._full_pattern()

    @property
    def is_path_type_part_of_match(self) -> bool:
        """Whether the rule only matches paths of a specific type."""
        return self.path_type is not None

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return an equivalent rule without the path type restriction."""
        assert not self.is_path_type_part_of_match
        return self

    @classmethod
    def recursive_beneath_directory(cls,
                                    directory: str,
                                    definition_source: str,
                                    path_type: Optional[PathType] = None,
                                    substitution: Substitution = NULL_SUBSTITUTION,
                                    ) -> 'MatchRule':
        """Create a rule matching everything beneath `directory`.

        :param directory: The directory (must not contain glob characters).
        :param definition_source: Passed to the substitution together with
          the normalized directory.
        :param path_type: Optional restriction on the type of matched paths.
        :param substitution: Substitution applied to the normalized directory.
        :return: MATCH_ANYTHING for "." and "/"; otherwise a directory rule.
        """
        if directory in ('.', '/'):
            return MATCH_ANYTHING
        assert not glob.has_magic(directory)
        return DirectoryBasedMatch(MatchRuleType.ANYTHING_BENEATH_DIR,
                                   substitution.substitute(_normalize_path(directory), definition_source),
                                   path_type=path_type,
                                   )

    @classmethod
    def from_path_or_glob(cls,
                          path_or_glob: str,
                          definition_source: str,
                          path_type: Optional[PathType] = None,
                          substitution: Substitution = NULL_SUBSTITUTION,
                          ) -> 'MatchRule':
        """Create the most specific rule type able to represent a path or glob.

        :param path_or_glob: The path or glob pattern.
        :param definition_source: Used in error messages and passed to the
          substitution.
        :param path_type: Optional restriction on the type of matched paths.
        :param substitution: Substitution applied to the parts of the pattern.
        :return: The rule for the pattern.
        :raises ValueError: If the pattern uses "{...}" (other than "${...}")
          or uses "**" anywhere except as "**/<basename-glob>".
        """
        # TODO: Handle '{a,b,c}' patterns too
        normalized_no_prefix = _normalize_path(path_or_glob, with_prefix=False)
        if path_or_glob in ('*', '**/*', '.', '/'):
            assert path_type is None
            return MATCH_ANYTHING

        # We do not support {a,b} at the moment. This check is not perfect, but it should catch the most obvious
        # unsupported usage. It raises (rather than asserts) because the pattern is input data and asserts are
        # stripped when Python runs with -O.
        if '{' in path_or_glob and '${' not in path_or_glob:
            raise ValueError(f'Cannot process pattern "{path_or_glob}" from {definition_source}: Brace patterns'
                             ' ("{a,b}") are not supported.')

        normalized_with_prefix = './' + normalized_no_prefix
        # TODO: Check for escapes here "foo[?]/bar" can be written as an exact match for foo?/bar
        # - similar holds for "foo[?]/*" being a directory match (etc.).
        if not glob.has_magic(normalized_with_prefix):
            assert path_type is None
            return ExactFileSystemPath(substitution.substitute(normalized_with_prefix, definition_source))

        directory = os.path.dirname(normalized_with_prefix)
        basename = os.path.basename(normalized_with_prefix)

        if ('**' in directory and directory != './**') or '**' in basename:
            raise ValueError(f'Cannot process pattern "{path_or_glob}" from {definition_source}: The double-star'
                             ' glob ("**") is not supported in general. Only "**/<basename-glob>" supported.')

        if basename == '*' and not glob.has_magic(directory):
            return DirectoryBasedMatch(MatchRuleType.DIRECT_CHILDREN_OF_DIR,
                                       substitution.substitute(directory, definition_source),
                                       path_type=path_type)
        elif (directory == './**' or not glob.has_magic(directory)) and glob.has_magic(basename):
            basename_glob = substitution.substitute(basename, definition_source, escape_glob_characters=True)
            if directory in ('.', './**'):
                return BasenameGlobMatch(basename_glob,
                                         path_type=path_type,
                                         recursive_match=True,
                                         )
            return BasenameGlobMatch(basename_glob,
                                     only_when_in_directory=substitution.substitute(directory, definition_source),
                                     path_type=path_type,
                                     recursive_match=False,
                                     )

        return GenericGlobImplementation(normalized_with_prefix,
                                         path_type=path_type)

210 

211 

def _match_file_type(path_type, path: 'FSPath') -> bool:
    """Check whether `path` is of the requested path type.

    :param path_type: One of PathType.FILE, PathType.DIRECTORY or
      PathType.SYMLINK.
    :param path: The path to test; its `is_file`, `is_dir` and `is_symlink`
      attributes are consulted.
    :return: True if the path is of the requested type, otherwise False.
    """
    if path_type == PathType.FILE and path.is_file:
        return True
    if path_type == PathType.DIRECTORY and path.is_dir:
        return True
    if path_type == PathType.SYMLINK and path.is_symlink:
        return True
    # Reaching this point means "no match"; make sure that is not caused by
    # a path type this function does not know how to test.
    assert path_type in (PathType.FILE, PathType.DIRECTORY, PathType.SYMLINK)
    return False

221 

222 

class MatchAnything(MatchRule):
    """Rule that matches every path reported by the root ("**/*")."""

    def __init__(self):
        super().__init__(MatchRuleType.MATCH_ANYTHING)

    def _full_pattern(self) -> str:
        return '**/*'

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        yield from fs_root.all_paths()

    def _match_weight(self) -> Tuple[int, int]:
        # The least specific rule possible.
        return 0, 0

    def describe_match_exact(self) -> str:
        return '**/* (Match anything)'

    def __hash__(self) -> int:
        return hash((self.rule_type, '**/*'))

    def __eq__(self, other: object) -> bool:
        # Identity comparison; MATCH_ANYTHING is the instance created below.
        return self is other


# The shared "match anything" rule; other code compares against it by identity.
MATCH_ANYTHING: MatchRule = MatchAnything()

# Drop the module-level class name so that no further instances are created through it.
del MatchAnything

250 

251 

class ExactFileSystemPath(MatchRule):
    """Rule that matches exactly one path (no globbing)."""

    __slots__ = '_path'

    def __init__(self, path: str) -> None:
        super().__init__(MatchRuleType.EXACT_MATCH)
        self._path = path

    def _full_pattern(self) -> str:
        return self._path

    def _match_weight(self) -> Tuple[int, int]:
        # Without glob characters the whole path is the literal part.
        w = len(self._path)
        return w, w

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the path if the lookup from `fs_root` finds it."""
        p = _lookup_path(fs_root, self._path)
        if p is not None:
            yield p

    def describe_match_exact(self) -> str:
        return f'{self._path} (the exact path / no globbing)'

    @property
    def path(self) -> str:
        """The path this rule matches."""
        return self._path

    def lookup_key(self) -> str:
        return self._path

    def __hash__(self) -> int:
        return hash((self.rule_type, self._path))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ExactFileSystemPath):
            return NotImplemented
        return self._path == other._path

289 

290 

class DirectoryBasedMatch(MatchRule):
    """Rule matching the contents of a directory ("dir/*" or "dir/**/*").

    The rule type decides whether only the direct children
    (DIRECT_CHILDREN_OF_DIR) or everything beneath the directory
    (ANYTHING_BENEATH_DIR) is matched.
    """

    __slots__ = '_directory', '_path_type'

    def __init__(self, rule_type: MatchRuleType, directory: str, path_type: Optional[PathType] = None) -> None:
        super().__init__(rule_type)
        self._directory = directory
        self._path_type = path_type
        assert rule_type in (MatchRuleType.DIRECT_CHILDREN_OF_DIR, MatchRuleType.ANYTHING_BENEATH_DIR)
        assert not self._directory.endswith('/')

    def _full_pattern(self) -> str:
        return self._directory

    def _match_weight(self) -> Tuple[int, int]:
        # directory + "/"
        w = len(self._directory) + 1
        return w, w

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the matched paths; nothing if the directory is absent or not a directory."""
        p = _lookup_path(fs_root, self._directory)
        if p is None or not p.is_dir:
            return
        if self._rule_type == MatchRuleType.ANYTHING_BENEATH_DIR:
            path_iter = p.all_paths()
        else:
            path_iter = p.children
        if self._path_type is None:
            yield from path_iter
        else:
            yield from (m for m in path_iter if _match_file_type(self._path_type, m))

    def describe_match_short(self) -> str:
        path_type_match = '' if self._path_type is None else f' <only for path type {self._path_type.manifest_key}>'
        if self._rule_type == MatchRuleType.ANYTHING_BENEATH_DIR:
            return f'{self._directory}/**/*{path_type_match}'
        return f'{self._directory}/*{path_type_match}'

    def describe_match_exact(self) -> str:
        # Both variants build on describe_match_short() so that a path type
        # restriction is included in the description for either rule type.
        if self._rule_type == MatchRuleType.ANYTHING_BENEATH_DIR:
            return f'{self.describe_match_short()} (anything below the directory)'
        return f'{self.describe_match_short()} (anything directly in the directory)'

    def lookup_key(self) -> str:
        return self._directory

    def __hash__(self) -> int:
        return hash((self.rule_type, self._directory, self._path_type))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DirectoryBasedMatch):
            return NotImplemented
        return (self._rule_type == other._rule_type
                and self._directory == other._directory
                and self._path_type == other._path_type
                )

    @property
    def path_type(self) -> Optional[PathType]:
        return self._path_type

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return this rule, or a copy without the path type restriction if it has one."""
        if self.is_path_type_part_of_match:
            return DirectoryBasedMatch(self._rule_type, self._directory, path_type=None)
        return self

356 

357 

class BasenameGlobMatch(MatchRule):
    """Rule matching paths by a glob on their basename.

    The search can be limited to a directory (`only_when_in_directory`) and
    is either recursive or restricted to the direct children of the search
    root.
    """

    __slots__ = '_basename_glob', '_directory', '_matcher', '_path_type', '_recursive_match'

    def __init__(self,
                 basename_glob: str,
                 only_when_in_directory: Optional[str] = None,
                 path_type: Optional[PathType] = None,
                 recursive_match: Optional[bool] = None
                 ) -> None:
        super().__init__(MatchRuleType.BASENAME_GLOB)
        self._basename_glob = basename_glob
        self._directory = only_when_in_directory
        self._path_type = path_type
        # Without a directory the match is always recursive. The value is
        # stored as a real bool so that None and False compare equal in __eq__.
        self._recursive_match = bool(recursive_match) or only_when_in_directory is None
        assert self._directory is None or not self._directory.endswith('/')
        assert '/' not in basename_glob  # Not a basename if it contains /
        assert '**' not in basename_glob  # Also not a (true) basename if it has **
        assert glob.has_magic(basename_glob)
        self._matcher = _compile_basename_glob(basename_glob)

    def _full_pattern(self) -> str:
        if self._directory is not None:
            maybe_recursive = "**/" if self._recursive_match else ""
            return f'{self._directory}/{maybe_recursive}{self._basename_glob}'
        return self._basename_glob

    def _match_weight(self) -> Tuple[int, int]:
        ml, pl = _glob_match_weight(self._basename_glob)

        dl = len(self._directory) if self._directory is not None else 0
        # +1 for the trailing slash
        # (note this holds for the "**/<foo>" case as well)
        dl += 1
        return ml + dl, pl + dl

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the matched paths; nothing if the directory is absent or not a directory."""
        search_root = fs_root
        if self._directory is not None:
            p = _lookup_path(fs_root, self._directory)
            if p is None or not p.is_dir:
                return
            search_root = p
        path_iter = search_root.all_paths() if self._recursive_match else search_root.children
        if self._path_type is None:
            yield from (m for m in path_iter if self._matcher(m.basename))
        else:
            yield from (m for m in path_iter if self._matcher(m.basename) and _match_file_type(self._path_type, m))

    def describe_match_short(self) -> str:
        path_type_match = '' if self._path_type is None else f' <only for path type {self._path_type.manifest_key}>'
        return self._full_pattern() if path_type_match == '' else f'{self._full_pattern()}{path_type_match}'

    def describe_match_exact(self) -> str:
        if self._directory is not None:
            return f'{self.describe_match_short()} (glob / directly in the directory)'
        return f'{self.describe_match_short()} (basename match)'

    def lookup_key(self) -> Optional[str]:
        return self._directory

    def __hash__(self) -> int:
        return hash((self.rule_type, self._basename_glob, self._directory, self._path_type))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, BasenameGlobMatch):
            return NotImplemented
        return (self._basename_glob == other._basename_glob
                and self._directory == other._directory
                and self._path_type == other._path_type
                and self._recursive_match == other._recursive_match
                )

    @property
    def path_type(self) -> Optional[PathType]:
        return self._path_type

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return this rule, or a copy without the path type restriction if it has one."""
        if self.is_path_type_part_of_match:
            return BasenameGlobMatch(self._basename_glob,
                                     self._directory,
                                     recursive_match=self._recursive_match,
                                     path_type=None,)
        return self

444 

445 

class GenericGlobImplementation(MatchRule):
    """Rule for multi-segment globs that no specialised rule can express.

    The pattern is split on "/" and matched one segment at a time, starting
    from the root.
    """

    __slots__ = '_glob_pattern', '_path_type', '_match_parts'

    def __init__(self,
                 glob_pattern: str,
                 path_type: Optional[PathType] = None,
                 ) -> None:
        super().__init__(MatchRuleType.GENERIC_GLOB)
        # The pattern is stored without the leading "./".
        if glob_pattern.startswith('./'):
            glob_pattern = glob_pattern[2:]
        self._glob_pattern = glob_pattern
        self._path_type = path_type
        assert '**' not in glob_pattern  # No recursive globs
        assert glob.has_magic(glob_pattern)  # If it has no glob, then it could have been an exact match
        assert '/' in glob_pattern  # If it does not have a / then a BasenameGlob could have been used instead
        self._match_parts = self._compile_glob()

    def _full_pattern(self) -> str:
        return self._glob_pattern

    def finditer(self, fs_root: 'FSPath') -> Iterable['FSPath']:
        """Yield the paths matching the pattern, optionally filtered by path type."""
        search_history = [fs_root]
        for part in self._match_parts:
            next_layer = itertools.chain.from_iterable(_apply_match(m, part) for m in search_history)
            # NOTE: The layer must be materialized before the next iteration. The generator expression
            # above looks up `part` only when it is consumed; if it were left lazy, every layer would
            # be evaluated with the value `part` has at that later point rather than its own segment.
            search_history = list(next_layer)
            if not search_history:
                # While we have it as a list, we might as well have an "early exit".
                return

        if self._path_type is None:
            yield from search_history
        else:
            yield from (m for m in search_history if _match_file_type(self._path_type, m))

    def describe_match_short(self) -> str:
        path_type_match = '' if self._path_type is None else f' <only for path type {self._path_type.manifest_key}>'
        return self._full_pattern() if path_type_match == '' else f'{self._full_pattern()}{path_type_match}'

    def describe_match_exact(self) -> str:
        return f'{self.describe_match_short()} (glob)'

    def _compile_glob(self) -> Sequence[Union[Callable[[str], bool], str]]:
        """Split the pattern into segments: literal names stay strings, globs become predicates."""
        assert self._glob_pattern.strip('/') == self._glob_pattern
        return [
            _compile_basename_glob(part) if glob.has_magic(part) else part
            for part in self._glob_pattern.split('/')
        ]

    def __hash__(self) -> int:
        return hash((self.rule_type, self._glob_pattern, self._path_type))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, GenericGlobImplementation):
            return NotImplemented
        return (self._glob_pattern == other._glob_pattern
                and self._path_type == other._path_type
                )

    @property
    def path_type(self) -> Optional[PathType]:
        return self._path_type

    def match_rule_without_path_type_match(self) -> 'MatchRule':
        """Return this rule, or a copy without the path type restriction if it has one."""
        if self.is_path_type_part_of_match:
            return GenericGlobImplementation(self._glob_pattern,
                                             path_type=None,
                                             )
        return self

516 

517 

class PathMatcher(Generic[MR]):
    """Resolve which rule result applies to each path of a file system tree."""

    def __init__(self,
                 exact_match_rules: Dict[str, Tuple[MatchRule, MR]],
                 global_globs: List[Tuple[MatchRule, MR]],
                 global_default_match: MR,
                 ) -> None:
        """
        :param exact_match_rules: Exact-match rules with their results, keyed
          by the rule's lookup key.
        :param global_globs: The remaining rules with their results; rules
          later in the list take precedence over earlier ones.
        :param global_default_match: The result for paths no rule matches.
        """
        self._exact_match_rules = exact_match_rules
        self._globs = global_globs
        self._global_default_match = global_default_match

    def resolve_all(self, fs_root: 'FSPath') -> Iterable[Tuple['FSPath', MR]]:
        """Yield every path from ``fs_root.all_paths()`` paired with its result."""
        default_match_def = MATCH_ANYTHING, self._global_default_match
        match_state: Dict[str, Tuple[MatchRule, MR]] = {p.path: default_match_def for p in fs_root.all_paths()}
        for match_rule_def in self._globs:
            match_rule = match_rule_def[0]
            # Later rules overwrite the entries written by earlier ones.
            for match in match_rule.finditer(fs_root):
                match_state[match.path] = match_rule_def

        # Exact matches are applied last and therefore win over all globs.
        match_state.update(self._exact_match_rules)

        yield from ((p, match_state[p.path][1]) for p in fs_root.all_paths())

    @classmethod
    def from_dict(cls, path_specs: Dict[MatchRule, MR], builtin_default_match: MR) -> 'PathMatcher[MR]':
        """Build a matcher from a mapping of rules to results.

        The rules are processed in ascending `MatchRule.sort_key()` order.

        :param path_specs: Mapping from rule to the result for that rule.
        :param builtin_default_match: The default result used when
          `path_specs` has no MATCH_ANYTHING rule.
        :return: The matcher.
        :raises ValueError: If an ANYTHING_BENEATH_DIR rule is used for '.'.
        """
        exact_match_rules = {}
        global_globs = []
        global_default_match = None
        match_rule: MatchRule
        manifest_path_info: MR
        for match_rule, manifest_path_info in sorted(path_specs.items(), key=lambda x: x[0].sort_key()):
            lookup_key = match_rule.lookup_key()
            if match_rule.rule_type == MatchRuleType.EXACT_MATCH:
                assert lookup_key is not None
                exact_match_rules[lookup_key] = match_rule, manifest_path_info
                continue
            if match_rule.rule_type == MatchRuleType.ANYTHING_BENEATH_DIR and lookup_key == '.':
                raise ValueError("Please use MATCH_ANYTHING instead of a ANYTHING_BENEATH_DIR rule for '.'")
            if match_rule is MATCH_ANYTHING:
                assert global_default_match is None
                global_default_match = manifest_path_info
                continue

            global_globs.append((match_rule, manifest_path_info))
        if global_default_match is None:
            global_default_match = builtin_default_match

        # Construct via cls so that subclasses get an instance of their own type.
        return cls(
            exact_match_rules,
            global_globs,
            global_default_match,
        )

569 )