From e3877b9855b0f475a348a6827a3627b012aaa3f6 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Fri, 5 Jul 2024 09:21:24 -0300
Subject: [PATCH] Implemented a more efficient file filtering mechanism for the website content indexing process.

---
 aider/help.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/aider/help.py b/aider/help.py
index e07ead148..871e915e8 100755
--- a/aider/help.py
+++ b/aider/help.py
@@ -12,23 +12,25 @@ from aider.dump import dump  # noqa: F401
 
 warnings.simplefilter("ignore", category=FutureWarning)
 
+exclude_website_pats = [
+    "examples/**",
+    "_posts/**",
+    "HISTORY.md",
+    "docs/benchmarks*md",
+    "docs/ctags.md",
+    "docs/unified-diffs.md",
+    "docs/leaderboards/index.md",
+    "assets/**",
+]
+
 
 def get_package_files():
     for path in importlib_resources.files("website").iterdir():
-        dump(path)
-        if path.is_file() and path.name.endswith(".md"):
-            if not any(
-                part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
-                for part in path.parts
-            ):
-                yield str(path)
+        if path.is_file():
+            yield path
         elif path.is_dir():
             for subpath in path.rglob("*.md"):
-                if not any(
-                    part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
-                    for part in subpath.parts
-                ):
-                    yield str(subpath)
+                yield subpath
 
 
 def fname_to_url(filepath):
@@ -74,7 +76,7 @@ def get_index():
     nodes = []
     for fname in tqdm(list(get_package_files())):
         fname = Path(fname)
-        dump(fname)
+        # todo: skip if matches exclude website pats
         doc = Document(
             text=importlib_resources.files("website").joinpath(fname).read_text(),
             metadata=dict(
@@ -85,7 +87,7 @@ def get_index():
         )
         nodes += parser.get_nodes_from_documents([doc])
 
-    index = VectorStoreIndex(nodes)
+    index = VectorStoreIndex(nodes, show_progress=True)
     dname.parent.mkdir(exist_ok=True)
     index.storage_context.persist(dname)
 