aider/local_analytics/clean_query_field.py
flanker e8bee42d76 feat: Local Analytics Dashboard for Aider
feat: Initialize LocalAnalyticsCollector in main.py

feat: Display session data in local analytics dashboard

fix: Use cumulative data from last interaction for dashboard stats

fix: Extract initial query from diffs in local analytics collector.
2025-05-14 20:39:13 +05:30

97 lines
3.7 KiB
Python

import json
from pathlib import Path
import re
# Define file paths (assuming they are in the same directory as the script)
BASE_DIR = Path(__file__).resolve().parent
# JSONL file of per-interaction session records; this script rewrites its
# "query" field in place.
SESSION_DATA_FILE = BASE_DIR / "session.jsonl"
# Regex to identify common code/diff starting lines: fenced code blocks,
# "diff --git" headers, "---"/"+++" file markers, and "@@" hunk headers.
# It is matched against the *first line* of a query only.
CODE_DIFF_MARKERS_REGEX = re.compile(
    r"^(```|diff --git|--- |\+\+\+ |@@ )"
)


def clean_query(query_text):
    """Reduce a raw query to its first human-written line.

    Everything from the first ```diff fence onward is discarded, then the
    first line of the remainder is taken. If that line itself looks like
    code or a diff marker, the query is judged to be predominantly code
    and an empty string is returned.

    Args:
        query_text: The raw "query" value from a session record.

    Returns:
        The cleaned first line; "" when the query is (or starts with)
        code/diff content; the input unchanged when it is not a
        non-empty string.
    """
    if not isinstance(query_text, str) or not query_text.strip():
        # Non-strings and blank/whitespace-only strings pass through untouched.
        return query_text

    # Keep only the text before the first ```diff fence. The original
    # re.split(r"```diff", query_text, 1) used a literal pattern with a
    # positional maxsplit, which is deprecated since Python 3.13;
    # str.split is equivalent and unambiguous.
    query_before_diff = query_text.split("```diff", 1)[0]

    # If nothing meaningful precedes the diff, the query is all code.
    if not query_before_diff.strip():
        return ""

    lines_before_diff = query_before_diff.splitlines()
    if not lines_before_diff:  # Defensive; the strip() check above covers this.
        return ""

    first_line = lines_before_diff[0]
    # A first line that is itself a code/diff marker means the query is
    # predominantly code, so drop it entirely.
    if CODE_DIFF_MARKERS_REGEX.match(first_line):
        return ""
    return first_line
def main():
    """Clean the "query" field of every record in session.jsonl, in place.

    Reads the file line by line; records whose query actually changes are
    re-serialized, all other lines (including unparseable ones) keep their
    original bytes. Progress and warnings are printed to stdout.
    """
    if not SESSION_DATA_FILE.exists():
        print(f"Error: Session data file not found at {SESSION_DATA_FILE}")
        return

    updated_lines = []
    modified_count = 0
    processed_lines = 0
    print(f"Starting cleaning process for {SESSION_DATA_FILE}...")

    with open(SESSION_DATA_FILE, "r", encoding="utf-8") as f:
        for line_num, line_content in enumerate(f, 1):
            processed_lines += 1
            try:
                data = json.loads(line_content)
                # .get() covers the missing-key case, so no separate
                # `"query" in data` membership test is needed.
                original_query = data.get("query")
                if isinstance(original_query, str):
                    cleaned_query = clean_query(original_query)
                    if cleaned_query != original_query:
                        data["query"] = cleaned_query
                        modified_count += 1
                        # Re-serialize only the records we changed;
                        # ensure_ascii=False keeps non-ASCII query text
                        # readable in this UTF-8 file.
                        updated_lines.append(
                            json.dumps(data, ensure_ascii=False) + "\n"
                        )
                        continue
                # Unchanged records keep their original bytes so formatting
                # and escaping are not silently rewritten.
                updated_lines.append(line_content)
            except json.JSONDecodeError as e:
                print(f"Warning: Error decoding JSON from line {line_num}: {e}. Keeping original line.")
                updated_lines.append(line_content)  # Keep original line if JSON error
            except Exception as e:
                print(f"Warning: Error processing line {line_num}: {e}. Keeping original line.")
                updated_lines.append(line_content)  # Keep original line if other error

    # Write back to the original file.
    try:
        with open(SESSION_DATA_FILE, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)
        print("\nProcessing complete.")
        print(f"Processed {processed_lines} lines.")
        print(f"{modified_count} queries were cleaned.")
        print(f"Cleaned data saved to {SESSION_DATA_FILE.resolve()}")
    except IOError as e:
        print(f"Error writing cleaned data to {SESSION_DATA_FILE}: {e}")
if __name__ == "__main__":
    # Run only when executed directly as a one-off maintenance script.
    main()