Implemented a more efficient file filtering mechanism for the website content indexing process.

This commit is contained in:
Paul Gauthier 2024-07-05 09:21:24 -03:00 committed by Paul Gauthier (aider)
parent e951974c43
commit e3877b9855

View file

@ -12,23 +12,25 @@ from aider.dump import dump # noqa: F401
warnings.simplefilter("ignore", category=FutureWarning) warnings.simplefilter("ignore", category=FutureWarning)
exclude_website_pats = [
"examples/**",
"_posts/**",
"HISTORY.md",
"docs/benchmarks*md",
"docs/ctags.md",
"docs/unified-diffs.md",
"docs/leaderboards/index.md",
"assets/**",
]
def get_package_files(): def get_package_files():
for path in importlib_resources.files("website").iterdir(): for path in importlib_resources.files("website").iterdir():
dump(path) if path.is_file():
if path.is_file() and path.name.endswith(".md"): yield path
if not any(
part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
for part in path.parts
):
yield str(path)
elif path.is_dir(): elif path.is_dir():
for subpath in path.rglob("*.md"): for subpath in path.rglob("*.md"):
if not any( yield subpath
part.startswith(("OLD", "tmp")) or part in ("examples", "_posts")
for part in subpath.parts
):
yield str(subpath)
def fname_to_url(filepath): def fname_to_url(filepath):
@ -74,7 +76,7 @@ def get_index():
nodes = [] nodes = []
for fname in tqdm(list(get_package_files())): for fname in tqdm(list(get_package_files())):
fname = Path(fname) fname = Path(fname)
dump(fname) # todo: skip if matches exclude website pats
doc = Document( doc = Document(
text=importlib_resources.files("website").joinpath(fname).read_text(), text=importlib_resources.files("website").joinpath(fname).read_text(),
metadata=dict( metadata=dict(
@ -85,7 +87,7 @@ def get_index():
) )
nodes += parser.get_nodes_from_documents([doc]) nodes += parser.get_nodes_from_documents([doc])
index = VectorStoreIndex(nodes) index = VectorStoreIndex(nodes, show_progress=True)
dname.parent.mkdir(exist_ok=True) dname.parent.mkdir(exist_ok=True)
index.storage_context.persist(dname) index.storage_context.persist(dname)