From e3877b9855b0f475a348a6827a3627b012aaa3f6 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Fri, 5 Jul 2024 09:21:24 -0300
Subject: [PATCH] Add website exclude patterns and move file filtering out of
 get_package_files (applying the patterns in get_index is still a TODO).

---
 aider/help.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/aider/help.py b/aider/help.py
index e07ead148..871e915e8 100755
--- a/aider/help.py
+++ b/aider/help.py
@@ -12,23 +12,25 @@ from aider.dump import dump  # noqa: F401
 
 warnings.simplefilter("ignore", category=FutureWarning)
 
+exclude_website_pats = [
+    "examples/**",
+    "_posts/**",
+    "HISTORY.md",
+    "docs/benchmarks*md",
+    "docs/ctags.md",
+    "docs/unified-diffs.md",
+    "docs/leaderboards/index.md",
+    "assets/**",
+]
+
 
 def get_package_files():
     for path in importlib_resources.files("website").iterdir():
-        dump(path)
-        if path.is_file() and path.name.endswith(".md"):
-            if not any(
-                part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
-                for part in path.parts
-            ):
-                yield str(path)
+        if path.is_file():
+            yield path
         elif path.is_dir():
             for subpath in path.rglob("*.md"):
-                if not any(
-                    part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
-                    for part in subpath.parts
-                ):
-                    yield str(subpath)
+                yield subpath
 
 
 def fname_to_url(filepath):
@@ -74,7 +76,7 @@ def get_index():
         nodes = []
         for fname in tqdm(list(get_package_files())):
             fname = Path(fname)
-            dump(fname)
+            # TODO: skip fname if it matches any pattern in exclude_website_pats
             doc = Document(
                 text=importlib_resources.files("website").joinpath(fname).read_text(),
                 metadata=dict(
@@ -85,7 +87,7 @@ def get_index():
             )
             nodes += parser.get_nodes_from_documents([doc])
 
-        index = VectorStoreIndex(nodes)
+        index = VectorStoreIndex(nodes, show_progress=True)
         dname.parent.mkdir(exist_ok=True)
         index.storage_context.persist(dname)