aider/local_analytics/clean_query_field.py
flanker e8bee42d76 feat: Local Analytics Dashboard for Aider
feat: Initialize LocalAnalyticsCollector in main.py

feat: Display session data in local analytics dashboard

fix: Use cumulative data from last interaction for dashboard stats

fix: Extract initial query from diffs in local analytics collector.
2025-05-14 20:39:13 +05:30

97 lines
3.7 KiB
Python

import json
from pathlib import Path
import re
# Define file paths (assuming they are in the same directory as the script)
BASE_DIR = Path(__file__).resolve().parent
# JSONL file of per-interaction session records; this script rewrites its
# "query" field in place.
SESSION_DATA_FILE = BASE_DIR / "session.jsonl"
# Regex to identify common code/diff starting lines: fenced code blocks,
# "diff --git" headers, "---"/"+++" file markers, and "@@" hunk headers.
# It is matched against the *first line* of a query only.
CODE_DIFF_MARKERS_REGEX = re.compile(
    r"^(```|diff --git|--- |\+\+\+ |@@ )"
)


def clean_query(query_text):
    """Reduce a raw query to its first human-written line.

    Everything from the first ```diff fence onward is discarded, then the
    first line of the remainder is taken. If that line itself looks like
    code or a diff marker, the query is judged to be predominantly code
    and an empty string is returned.

    Args:
        query_text: The raw "query" value from a session record.

    Returns:
        The cleaned first line; "" when the query is (or starts with)
        code/diff content; the input unchanged when it is not a
        non-empty string.
    """
    if not isinstance(query_text, str) or not query_text.strip():
        # Non-strings and blank/whitespace-only strings pass through untouched.
        return query_text

    # Keep only the text before the first ```diff fence. The original
    # re.split(r"```diff", query_text, 1) used a literal pattern with a
    # positional maxsplit, which is deprecated since Python 3.13;
    # str.split is equivalent and unambiguous.
    query_before_diff = query_text.split("```diff", 1)[0]

    # If nothing meaningful precedes the diff, the query is all code.
    if not query_before_diff.strip():
        return ""

    lines_before_diff = query_before_diff.splitlines()
    if not lines_before_diff:  # Defensive; the strip() check above covers this.
        return ""

    first_line = lines_before_diff[0]
    # A first line that is itself a code/diff marker means the query is
    # predominantly code, so drop it entirely.
    if CODE_DIFF_MARKERS_REGEX.match(first_line):
        return ""
    return first_line
def main():
    """Clean the "query" field of every record in session.jsonl, in place.

    Reads the file line by line; records whose query actually changes are
    re-serialized, all other lines (including unparseable ones) keep their
    original bytes. Progress and warnings are printed to stdout.
    """
    if not SESSION_DATA_FILE.exists():
        print(f"Error: Session data file not found at {SESSION_DATA_FILE}")
        return

    updated_lines = []
    modified_count = 0
    processed_lines = 0
    print(f"Starting cleaning process for {SESSION_DATA_FILE}...")

    with open(SESSION_DATA_FILE, "r", encoding="utf-8") as f:
        for line_num, line_content in enumerate(f, 1):
            processed_lines += 1
            try:
                data = json.loads(line_content)
                # .get() covers the missing-key case, so no separate
                # `"query" in data` membership test is needed.
                original_query = data.get("query")
                if isinstance(original_query, str):
                    cleaned_query = clean_query(original_query)
                    if cleaned_query != original_query:
                        data["query"] = cleaned_query
                        modified_count += 1
                        # Re-serialize only the records we changed;
                        # ensure_ascii=False keeps non-ASCII query text
                        # readable in this UTF-8 file.
                        updated_lines.append(
                            json.dumps(data, ensure_ascii=False) + "\n"
                        )
                        continue
                # Unchanged records keep their original bytes so formatting
                # and escaping are not silently rewritten.
                updated_lines.append(line_content)
            except json.JSONDecodeError as e:
                print(f"Warning: Error decoding JSON from line {line_num}: {e}. Keeping original line.")
                updated_lines.append(line_content)  # Keep original line if JSON error
            except Exception as e:
                print(f"Warning: Error processing line {line_num}: {e}. Keeping original line.")
                updated_lines.append(line_content)  # Keep original line if other error

    # Write back to the original file.
    try:
        with open(SESSION_DATA_FILE, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)
        print("\nProcessing complete.")
        print(f"Processed {processed_lines} lines.")
        print(f"{modified_count} queries were cleaned.")
        print(f"Cleaned data saved to {SESSION_DATA_FILE.resolve()}")
    except IOError as e:
        print(f"Error writing cleaned data to {SESSION_DATA_FILE}: {e}")
if __name__ == "__main__":
    # Run only when executed directly as a one-off maintenance script.
    main()