# aider/local_analytics/local_analytics_collector.py

# aider/local_analytics_collector.py
import atexit
import datetime
import logging
import os
import platform
import shelve
import sys
import time
import uuid
import json # Import json module
import re # Import re module

import litellm

# Import from the local_analytics package (assuming project_root/local_analytics/dashboard_generator.py)
from local_analytics.dashboard_generator import main

# Resolve the running Aider version; fall back to a placeholder when
# `__version__` cannot be imported from the `aider` package.
try:
    from aider import __version__ as aider_version_val
except ImportError:
    aider_version_val = "unknown"

# Storage paths. LocalAnalyticsCollector joins each of them onto its base
# directory (the git root when one is given, otherwise the current working
# directory).
# Shelve database holding the list of recorded interactions.
DATA_SHELVE_FILE = "local_analytics/aider_analytics_data.shelve"
# Log file for the collector's own diagnostic messages.
LOG_FILE = "local_analytics/local_analytics_collector.logs"
SESSION_JSONL_FILE = "local_analytics/session.jsonl" # JSONL export of the shelve contents, written at session end

class LocalAnalyticsCollector:
    """
    Collects local analytics data for Aider sessions and interactions.

    An *interaction* starts with `start_interaction()` and ends with
    `end_interaction()`. While an interaction is open, every successful
    LiteLLM call (reported through a LiteLLM success callback) and every
    commit logged with `log_commit()` is attached to it. Finished
    interactions are appended to a `shelve` database; `end_session()`
    exports that database to a JSONL file and triggers dashboard generation.

    When constructed with ``enabled=False`` every public method is a no-op.
    """

    def __init__(self, io, git_root=None, enabled=True):
        """
        Initializes the LocalAnalyticsCollector.

        Args:
            io: An InputOutput object for user interaction (currently unused
                beyond holding a reference).
            git_root (str, optional): The root directory of the git project.
                Defaults to None, in which case the current working directory
                is used.
            enabled (bool, optional): Whether analytics collection is enabled.
                Defaults to True.
        """
        self.io = io
        self.enabled = enabled

        # Interaction state is set unconditionally so that a disabled
        # collector still exposes a consistent set of attributes.
        self._current_interaction_data = None
        self._interaction_start_time_monotonic = None

        if not self.enabled:
            return

        base_path = git_root if git_root else os.getcwd()
        self.project_name = os.path.basename(os.path.abspath(base_path))

        self.data_file = os.path.join(base_path, DATA_SHELVE_FILE)
        self.log_file = os.path.join(base_path, LOG_FILE)
        self.session_jsonl_file = os.path.join(base_path, SESSION_JSONL_FILE)

        self.session_id = str(uuid.uuid4())
        self.aider_version = aider_version_val
        self.platform_info = platform.platform()
        self.python_version = (
            f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
        )

        # `exist_ok=True` already tolerates an existing directory, so no
        # separate (race-prone) existence check is needed.
        for path in (self.log_file, self.data_file, self.session_jsonl_file):
            directory = os.path.dirname(path)
            if directory:
                os.makedirs(directory, exist_ok=True)

        self._setup_logger()

        atexit.register(self.end_session)

        # Snapshots of the callback lists as they were before registration.
        # They are kept for reference only: end_session() removes just this
        # collector's callback instead of restoring these lists, so callbacks
        # that other code registers later are not lost.
        self._original_success_callbacks = litellm.success_callback[:]
        self._original_failure_callbacks = litellm.failure_callback[:]
        if self._litellm_success_callback not in litellm.success_callback:
            litellm.success_callback.append(self._litellm_success_callback)

    def _setup_logger(self):
        """Configures the file logger and the session-aware logger adapter."""
        self.logger = logging.getLogger(__name__ + ".LocalAnalyticsCollector")
        self.logger.setLevel(logging.DEBUG)
        self.logger.propagate = False  # Keep analytics logs off the console

        # The logger object is shared by name, so drop handlers left behind
        # by an earlier initialization to avoid duplicated log lines.
        for handler in self.logger.handlers[:]:
            self.logger.removeHandler(handler)
            handler.close()

        file_handler = logging.FileHandler(self.log_file, encoding="utf-8")
        file_handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(session_id)s - %(message)s")
        )
        self.logger.addHandler(file_handler)

        # The adapter supplies the `session_id` field the formatter expects.
        self._log_adapter = logging.LoggerAdapter(self.logger, {"session_id": self.session_id})

        self._log_adapter.debug("--- LocalAnalyticsCollector Initialized ---")
        self._log_adapter.debug("Project: %s", self.project_name)
        self._log_adapter.debug("Data file: %s", self.data_file)
        self._log_adapter.debug("Log file: %s", self.log_file)
        self._log_adapter.debug("Session JSONL file: %s", self.session_jsonl_file)

    def start_interaction(self, query, modified_files_in_chat=None):
        """
        Starts tracking a new interaction.

        If a previous interaction was in progress, it will be ended first.

        Args:
            query (str): The user's query for this interaction. Only the text
                before the first "```diff" marker is recorded; None is
                recorded as an empty string.
            modified_files_in_chat (iterable, optional): Files modified in the
                chat context. Stored as a new list. Defaults to None.
        """
        if not self.enabled:
            return
        if self._current_interaction_data:
            self.end_interaction()  # End previous interaction if any

        query_text = "" if query is None else str(query)
        # Drop everything from the first diff fence onward.
        query_text = query_text.partition("```diff")[0].strip()

        self._interaction_start_time_monotonic = time.monotonic()
        self._current_interaction_data = {
            "session_id": self.session_id,
            "project_name": self.project_name,
            "interaction_timestamp": datetime.datetime.now().isoformat(),
            "interaction_duration_seconds": 0,
            "query": query_text,
            "aider_version": self.aider_version,
            "platform_info": self.platform_info,
            "python_version": self.python_version,
            "token_summary": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
                "estimated_cost": 0.0,
            },
            "models_used_summary": [],
            "llm_calls_details": [],
            # Stored as a list so that a set or tuple passed by the caller
            # does not break the JSON export in end_session().
            "modified_files_in_chat": list(modified_files_in_chat or []),
            "commits_made_this_interaction": [],
        }

    def end_interaction(self):
        """
        Ends the current interaction and saves its data.

        Calculates the interaction duration, builds the per-model usage
        summary from the recorded LLM calls, and appends the interaction to
        the "interactions" list in the shelve database. A failure to save is
        logged, not raised. The in-memory interaction state is cleared
        afterwards in either case.
        """
        if not self.enabled or not self._current_interaction_data:
            return

        interaction = self._current_interaction_data

        # `is not None` rather than truthiness: 0.0 is a valid clock reading.
        if self._interaction_start_time_monotonic is not None:
            interaction["interaction_duration_seconds"] = (
                time.monotonic() - self._interaction_start_time_monotonic
            )

        # Summarize model usage from detailed calls
        per_model = {}
        for call in interaction.get("llm_calls_details", []):
            model_name = call.get("model", "unknown_model")
            entry = per_model.setdefault(
                model_name,
                {
                    "name": model_name,
                    "calls": 0,
                    "cost": 0.0,
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                },
            )
            entry["calls"] += 1
            # `or 0` treats a stored None the same as a missing value.
            entry["cost"] += call.get("cost") or 0.0
            entry["prompt_tokens"] += call.get("prompt_tokens") or 0
            entry["completion_tokens"] += call.get("completion_tokens") or 0
        interaction["models_used_summary"] = list(per_model.values())

        try:
            with shelve.open(self.data_file) as db:
                interactions = db.get("interactions", [])
                interactions.append(interaction)
                # Reassign: without writeback, shelve only persists on assignment.
                db["interactions"] = interactions
        except Exception as e:
            self._log_adapter.error("Error saving interaction to shelve: %s", e)

        self._current_interaction_data = None
        self._interaction_start_time_monotonic = None

    def _litellm_success_callback(self, kwargs, completion_response, start_time, end_time):
        """
        Callback for successful LiteLLM calls.

        This method is registered with LiteLLM to capture details of each
        successful LLM API call, including token usage and cost. Calls that
        arrive while no interaction is open are ignored.

        Args:
            kwargs: Keyword arguments passed to the LiteLLM completion call.
            completion_response: The response object from LiteLLM.
            start_time: Timestamp when the LLM call started.
            end_time: Timestamp when the LLM call ended (unused).
        """
        if not self.enabled or not self._current_interaction_data:
            return

        model_name = kwargs.get("model", "unknown_model")

        usage = getattr(completion_response, "usage", None)
        # A present-but-None attribute is counted as zero tokens.
        prompt_tokens = (getattr(usage, "prompt_tokens", 0) or 0) if usage else 0
        completion_tokens = (getattr(usage, "completion_tokens", 0) or 0) if usage else 0

        cost = 0.0
        try:
            calculated_cost = litellm.completion_cost(completion_response=completion_response)
            if calculated_cost is not None:
                cost = float(calculated_cost)
        except Exception as e:  # Cost lookup is best-effort; never fail the callback
            self._log_adapter.warning(
                "Analytics: Could not calculate cost for LLM call. Error: %s", e
            )
            cost = 0.0

        choices = getattr(completion_response, "choices", None)
        finish_reason = getattr(choices[0], "finish_reason", None) if choices else None

        # Fall back to the current time if start_time is not datetime-like.
        if hasattr(start_time, "isoformat"):
            timestamp = start_time.isoformat()
        else:
            timestamp = datetime.datetime.now().isoformat()

        self._current_interaction_data["llm_calls_details"].append(
            {
                "model": model_name,
                "id": getattr(completion_response, "id", None),
                "finish_reason": finish_reason,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "cost": cost,
                "timestamp": timestamp,
            }
        )

        totals = self._current_interaction_data["token_summary"]
        totals["prompt_tokens"] += prompt_tokens
        totals["completion_tokens"] += completion_tokens
        totals["total_tokens"] += prompt_tokens + completion_tokens
        totals["estimated_cost"] += cost

    def log_commit(self, commit_hash, commit_message):
        """
        Logs a git commit made during the current interaction.

        Does nothing when no interaction is open.

        Args:
            commit_hash (str): The hash of the commit.
            commit_message (str): The commit message.
        """
        if not self.enabled or not self._current_interaction_data:
            return
        self._current_interaction_data["commits_made_this_interaction"].append(
            {"hash": commit_hash, "message": commit_message}
        )

    def end_session(self):
        """
        Ends the analytics collection session.

        Ends any ongoing interaction, exports the shelve data to the session
        JSONL file, generates the dashboard, and then always performs cleanup:
        unregisters the atexit handler, removes this collector's LiteLLM
        callback, closes the log handlers and disables the collector so that
        later calls are no-ops.
        """
        if not self.enabled:  # Never enabled, or the session already ended
            self._unregister_atexit()
            return

        try:
            if self._current_interaction_data:
                self.end_interaction()

            if hasattr(self, "data_file") and hasattr(self, "session_jsonl_file"):
                self._export_and_render()
        finally:
            # Cleanup must run even if the steps above raise.
            self._teardown()

    def _export_and_render(self):
        """Exports interactions to JSONL, then generates the dashboard."""
        try:
            self._export_interactions_to_jsonl()
        except Exception as e:
            self._report_error(f"Error writing shelve data to JSONL: {e}")
            return  # The dashboard is only generated after a successful export

        if hasattr(self, "_log_adapter"):
            self._log_adapter.info("Shelve data written to %s", self.session_jsonl_file)

        try:
            main()  # generate dashboard
        except Exception as e:
            self._report_error(f"Error generating dashboard: {e}")

    def _export_interactions_to_jsonl(self):
        """Writes every interaction stored in the shelve as one JSON line."""
        with shelve.open(self.data_file, "r") as db:
            interactions = db.get("interactions", [])

        # Serialize everything before opening the output file so that a
        # serialization error cannot leave a truncated, half-written export.
        lines = [
            json.dumps(interaction, default=self._json_default) + "\n"
            for interaction in interactions
        ]
        with open(self.session_jsonl_file, "w", encoding="utf-8") as f:
            f.writelines(lines)

    @staticmethod
    def _json_default(value):
        """Best-effort JSON fallback: sets become sorted lists, the rest strings."""
        if isinstance(value, (set, frozenset)):
            return sorted(value, key=str)
        return str(value)

    def _report_error(self, message):
        """Logs an error, or prints it when the logger was never set up."""
        adapter = getattr(self, "_log_adapter", None)
        if adapter is not None:
            adapter.error(message)
        else:  # pragma: no cover
            print(message)

    def _unregister_atexit(self):
        """Removes end_session from the atexit handlers."""
        try:
            atexit.unregister(self.end_session)
        except TypeError:  # pragma: no cover
            pass

    def _teardown(self):
        """Releases everything the collector registered and disables it."""
        self._unregister_atexit()

        # Remove only our own callback, in place, so that callbacks registered
        # by other code after this collector was created stay active.
        try:
            litellm.success_callback.remove(self._litellm_success_callback)
        except (ValueError, AttributeError):
            pass  # Not registered, or the callback list was replaced

        if hasattr(self, "_log_adapter"):
            self._log_adapter.info("LocalAnalyticsCollector session ended.")

        # Close handlers to release file locks, especially on Windows
        if hasattr(self, "logger"):
            for handler in self.logger.handlers[:]:
                handler.close()
                self.logger.removeHandler(handler)

        # Prevent re-entry or further use
        self.enabled = False