Mirror of https://github.com/Aider-AI/aider.git, synced 2025-05-25 14:55:00 +00:00
Implemented a more efficient file filtering mechanism for the website content indexing process.
commit e3877b9855
parent e951974c43
1 changed file with 16 additions and 14 deletions
@@ -12,23 +12,25 @@ from aider.dump import dump # noqa: F401
 
 warnings.simplefilter("ignore", category=FutureWarning)
 
+exclude_website_pats = [
+    "examples/**",
+    "_posts/**",
+    "HISTORY.md",
+    "docs/benchmarks*md",
+    "docs/ctags.md",
+    "docs/unified-diffs.md",
+    "docs/leaderboards/index.md",
+    "assets/**",
+]
+
 
 def get_package_files():
     for path in importlib_resources.files("website").iterdir():
-        dump(path)
-        if path.is_file() and path.name.endswith(".md"):
-            if not any(
-                part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
-                for part in path.parts
-            ):
-                yield str(path)
+        if path.is_file():
+            yield path
         elif path.is_dir():
             for subpath in path.rglob("*.md"):
-                if not any(
-                    part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
-                    for part in subpath.parts
-                ):
-                    yield str(subpath)
+                yield subpath
 
 
 def fname_to_url(filepath):
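Two things change in this hunk: the hard-coded OLD/tmp/examples/_posts checks give way to the declarative exclude_website_pats list above, and the generator now yields path objects (yield path, yield subpath) instead of strings. Note also that path.rglob() is a pathlib method, not part of the importlib Traversable protocol, so the code relies on files("website") resolving to an on-disk directory. A minimal standalone restatement of the new traversal, under that same assumption:

    import importlib.resources as importlib_resources  # the diffed module may use the importlib_resources backport instead

    # Restatement of the new get_package_files() walk, assuming "website"
    # is an importable on-disk package so files() returns a pathlib.Path.
    root = importlib_resources.files("website")
    for path in root.iterdir():
        if path.is_file():
            print(path)  # every top-level file is yielded as-is
        elif path.is_dir():
            for subpath in path.rglob("*.md"):  # only markdown below top level
                print(subpath)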
@@ -74,7 +76,7 @@ def get_index():
     nodes = []
     for fname in tqdm(list(get_package_files())):
         fname = Path(fname)
-        dump(fname)
+        # todo: skip if matches exclude website pats
         doc = Document(
             text=importlib_resources.files("website").joinpath(fname).read_text(),
             metadata=dict(
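The dropped dump(fname) debug call becomes a todo: files matching exclude_website_pats should eventually be skipped at this point in the loop. The commit leaves that unimplemented. A minimal sketch of one way to honor the patterns, assuming POSIX-style paths relative to the website root and plain fnmatch semantics (where both * and ** match across directory separators):

    from fnmatch import fnmatch

    # Hypothetical helper for the todo above; not part of this commit.
    # relpath is assumed POSIX-style and relative to the website root,
    # e.g. "docs/ctags.md" or "examples/2024/chat.md".
    def is_excluded(relpath, patterns=exclude_website_pats):
        return any(fnmatch(relpath, pat) for pat in patterns)

    # is_excluded("examples/chat.md") -> True   (matches "examples/**")
    # is_excluded("HISTORY.md")       -> True   (exact entry)
    # is_excluded("docs/usage.md")    -> False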
@@ -85,7 +87,7 @@ def get_index():
         )
         nodes += parser.get_nodes_from_documents([doc])
 
-    index = VectorStoreIndex(nodes)
+    index = VectorStoreIndex(nodes, show_progress=True)
     dname.parent.mkdir(exist_ok=True)
     index.storage_context.persist(dname)
 
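The last hunk only adds show_progress=True, so llama_index displays a progress bar while it embeds the nodes before the index is persisted. For completeness, a hedged sketch of loading the persisted index back, assuming the llama_index >= 0.10 import layout (older releases export the same names from the top-level llama_index package) and that dname is the persist directory used above:

    from llama_index.core import StorageContext, load_index_from_storage

    # Hypothetical round trip for the persisted index; not part of this commit.
    storage_context = StorageContext.from_defaults(persist_dir=str(dname))
    index = load_index_from_storage(storage_context)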