feat: Improve language tag download script with multi-branch support

This commit is contained in:
Paul Gauthier (aider) 2025-03-12 13:54:31 -07:00
parent 849e02cbfb
commit a6ebed8d16

View file

@ -3,10 +3,42 @@
import json import json
import os import os
import sys import sys
import time
import requests import requests
def get_default_branch(owner, repo, timeout=10):
    """Get the default branch of a GitHub repository using the API.

    Args:
        owner: Repository owner segment of the GitHub URL.
        repo: Repository name segment of the GitHub URL.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The ``default_branch`` value from the API response, or ``None`` when
        the request fails, the body is not a JSON object, or the field is
        absent.
    """
    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        return None

    # A non-object body (list, string, null) has no default_branch to offer.
    if not isinstance(payload, dict):
        return None
    return payload.get("default_branch")
def try_download_tags(owner, repo, branch, directory, output_path, timeout=10):
    """Try to download tags.scm from a specific branch.

    Args:
        owner: Repository owner segment of the GitHub URL.
        repo: Repository name segment of the GitHub URL.
        branch: Branch name to fetch the file from.
        directory: Optional sub-directory inside the repository that holds
            the ``queries`` folder; falsy means the repository root.
        output_path: Local path the downloaded file is written to.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        True if the file was fetched and written to ``output_path``, False if
        the HTTP request failed (including a non-2xx status).
    """
    base_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}"
    if directory:
        tags_url = f"{base_url}/{directory}/queries/tags.scm"
    else:
        tags_url = f"{base_url}/queries/tags.scm"

    try:
        response = requests.get(tags_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return False

    # Write with an explicit encoding so the result does not depend on the
    # platform's locale encoding (which can fail on non-ASCII query text).
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    return True
def main(): def main():
# Path to the language definitions file # Path to the language definitions file
lang_def_path = "../../tmp/tree-sitter-language-pack/sources/language_definitions.json" lang_def_path = "../../tmp/tree-sitter-language-pack/sources/language_definitions.json"
@ -17,6 +49,9 @@ def main():
# Create the output directory if it doesn't exist # Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
# Common branch names to try if API fails and config branch doesn't work
common_branches = ["main", "master", "dev", "develop"]
try: try:
# Load the language definitions # Load the language definitions
with open(lang_def_path, "r") as f: with open(lang_def_path, "r") as f:
@ -28,16 +63,18 @@ def main():
print(f"Found {len(lang_defs)} language definitions") print(f"Found {len(lang_defs)} language definitions")
# Process each language # Process each language
successes = 0
total = len(lang_defs)
for lang, config in lang_defs.items(): for lang, config in lang_defs.items():
print(f"Processing {lang}...") print(f"Processing {lang}...")
# Extract repo URL and branch from the config # Extract repo URL from the config
repo_url = config.get("repo") repo_url = config.get("repo")
if not repo_url: if not repo_url:
print(f"Skipping {lang}: No repository URL found") print(f"Skipping {lang}: No repository URL found")
continue continue
branch = config.get("branch", "master")
directory = config.get("directory", "") directory = config.get("directory", "")
# Parse the GitHub repository URL # Parse the GitHub repository URL
@ -46,35 +83,58 @@ def main():
continue continue
# Extract the owner and repo name from the URL # Extract the owner and repo name from the URL
_, _, _, owner, repo = repo_url.rstrip("/").split("/") parts = repo_url.rstrip("/").split("/")
if len(parts) < 5:
print(f"Skipping {lang}: Invalid GitHub URL format")
continue
# Construct the raw file URL owner = parts[-2]
# Build the GitHub raw content path repo = parts[-1]
base_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}"
if directory:
tags_url = f"{base_url}/{directory}/queries/tags.scm"
else:
tags_url = f"{base_url}/queries/tags.scm"
# Create the language directory in the output path # Create the language directory in the output path
lang_dir = os.path.join(output_dir, lang) lang_dir = os.path.join(output_dir, lang)
os.makedirs(os.path.join(lang_dir, "queries"), exist_ok=True) queries_dir = os.path.join(lang_dir, "queries")
os.makedirs(queries_dir, exist_ok=True)
output_file = os.path.join(queries_dir, "tags.scm")
# Fetch the tags.scm file # Try branches in this order:
try: # 1. Branch specified in the config
response = requests.get(tags_url) # 2. Default branch from GitHub API
response.raise_for_status() # Raise an exception for HTTP errors # 3. Common branch names (main, master, etc.)
# Save the file branches_to_try = []
output_file = os.path.join(lang_dir, "queries", "tags.scm")
with open(output_file, "w") as f:
f.write(response.text)
print(f"Successfully downloaded tags for {lang}") # 1. Branch from config (if specified)
except requests.exceptions.RequestException as e: config_branch = config.get("branch")
print(f"Error fetching tags for {lang}: {e}") if config_branch:
branches_to_try.append(config_branch)
print("All language tags processed") # 2. Default branch from GitHub API
default_branch = get_default_branch(owner, repo)
if default_branch and default_branch not in branches_to_try:
branches_to_try.append(default_branch)
# 3. Add common branch names
for branch in common_branches:
if branch not in branches_to_try:
branches_to_try.append(branch)
# Try each branch
success = False
for branch in branches_to_try:
if try_download_tags(owner, repo, branch, directory, output_file):
print(f"Successfully downloaded tags for {lang} (branch: {branch})")
success = True
successes += 1
break
if not success:
print(f"Failed to download tags for {lang} after trying all branches")
# Be nice to GitHub's API
time.sleep(0.1)
print(f"All language tags processed. Downloaded {successes}/{total} successfully.")
if __name__ == "__main__": if __name__ == "__main__":