"""
IMDb SQLite Cleanup Script
--------------------------

This script performs cleanup operations on an IMDb SQLite database created with imdb-sqlite.
It executes the following steps in batch mode (500,000 rows per batch):

1. Delete all titles where is_adult = 1 and remove the column is_adult
   -> If the column is already missing, this step is skipped
2. Delete all titles with premiered NULL or premiered < 2000
3. Delete orphaned entries from related tables (crew, akas, episodes, ratings, people)
4. Run VACUUM at the end to shrink the database file

The script logs progress with timestamps, shows percentage progress with thousand separators,
and measures execution time for each step.

Created with the assistance of Microsoft Copilot.
"""

import sqlite3
import time
from datetime import datetime

# Number of rows handled per batch; main() interpolates this value into the
# LIMIT clause of every batch query it builds.
BATCH_SIZE = 500_000
DB_PATH = "imdb.db"   # Path to the imdb-sqlite database file that main() opens

def log(msg):
    """Write *msg* to stdout, prefixed with the current local time in brackets."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] {msg}")

def delete_in_batches(conn, select_sql, target_table, target_column, label):
    """
    Delete the rows identified by select_sql from target_table in batches.

    Args:
        conn: Open sqlite3 connection; every batch is committed on it.
        select_sql: Query returning one column whose values match
            target_column. A trailing ``LIMIT <n>`` clause sets the batch size.
        target_table: Table to delete from.
        target_column: Column of target_table compared against the query result.
        label: Prefix used for all log messages of this run.

    Each batch is removed with a single
    ``DELETE ... WHERE target_column IN (select_sql)`` statement, so the ids
    never travel through Python and no bound parameters (and therefore no
    parameter limit) are involved. Progress is logged with percentage,
    thousand separators, and remaining rows.

    The table/column names and select_sql are interpolated into SQL text and
    must come from trusted code, not from user input.
    """
    cur = conn.cursor()
    total_deleted = 0
    start = time.time()

    # A trailing semicolon would be a syntax error once the query is nested.
    batch_sql = select_sql.strip().rstrip(";").rstrip()

    # Determine total rows to delete: count over the query without its final
    # LIMIT clause. Only a plain trailing "LIMIT <n>" is stripped, so a LIMIT
    # inside a nested subquery is left untouched.
    head, limit_kw, tail = batch_sql.rpartition("LIMIT")
    if limit_kw and tail.strip().isdigit():
        unlimited_sql = head
    else:
        unlimited_sql = batch_sql
    cur.execute(f"SELECT COUNT(*) FROM ({unlimited_sql})")
    total_rows = cur.fetchone()[0]

    if total_rows == 0:
        log(f"{label}: nothing to delete")
        return

    log(f"{label}: total rows to delete = {total_rows:,}")

    delete_sql = (
        f"DELETE FROM {target_table} "
        f"WHERE {target_column} IN ({batch_sql})"
    )

    while True:
        cur.execute(delete_sql)
        batch_deleted = cur.rowcount
        conn.commit()

        # Stop as soon as a pass removes nothing. This is the normal exit and
        # also prevents an endless loop if the DELETE cannot match the rows
        # that the query keeps returning.
        if batch_deleted <= 0:
            break

        total_deleted += batch_deleted
        percent = (total_deleted / total_rows) * 100
        remaining = total_rows - total_deleted
        log(f"{label}: deleted {batch_deleted:,} rows "
            f"(total {total_deleted:,}, {percent:.2f}% done, {remaining:,} remaining)")

    elapsed = time.time() - start
    log(f"{label}: finished in {elapsed:.2f} seconds")

def column_exists(conn, table, column):
    """Return True when *table* has a column named exactly *column*.

    The lookup uses ``PRAGMA table_info``; a table that does not exist yields
    no rows, so the result is False in that case.
    """
    info_rows = conn.cursor().execute(f"PRAGMA table_info({table})").fetchall()
    # Index 1 of every table_info row holds the column name.
    return any(info[1] == column for info in info_rows)

def drop_is_adult_column(conn):
    """
    Remove the is_adult column from the titles table if it is still present.

    On SQLite 3.35.0 or newer the column is dropped directly with
    ``ALTER TABLE ... DROP COLUMN``. On older versions the table is rebuilt:
    the remaining columns are copied into titles_new, titles is dropped, and
    titles_new is renamed to titles. Indexes that were defined on titles are
    recreated afterwards from their stored DDL.

    NOTE: the rebuild uses ``CREATE TABLE ... AS SELECT``, which does not carry
    over column constraints (such as a PRIMARY KEY) of the original table.
    """
    cur = conn.cursor()
    if not column_exists(conn, "titles", "is_adult"):
        log("Column is_adult already removed, skipping...")
        return

    cur.execute("SELECT sqlite_version()")
    version = cur.fetchone()[0]
    log(f"SQLite version: {version}")

    # Some (older) releases report four components, e.g. "3.8.10.2"; only the
    # first three are needed for the feature check.
    version_info = tuple(int(part) for part in version.split(".")[:3])
    start = time.time()
    if version_info >= (3, 35, 0):
        log("Dropping column is_adult directly...")
        cur.execute("ALTER TABLE titles DROP COLUMN is_adult")
    else:
        log("Using workaround with new table (no DROP COLUMN support)...")
        # DROP TABLE also removes the table's indexes, so remember their DDL
        # first. Automatically created indexes have no stored SQL (NULL).
        index_sqls = [
            row[0]
            for row in cur.execute(
                "SELECT sql FROM sqlite_master "
                "WHERE type = 'index' AND tbl_name = 'titles' AND sql IS NOT NULL"
            )
        ]
        cur.execute("""
            CREATE TABLE titles_new AS
            SELECT title_id, type, primary_title, original_title,
                   premiered, ended, runtime_minutes, genres
            FROM titles
        """)
        cur.execute("DROP TABLE titles")
        cur.execute("ALTER TABLE titles_new RENAME TO titles")
        for index_sql in index_sqls:
            try:
                cur.execute(index_sql)
            except sqlite3.OperationalError as exc:
                # E.g. an index that referenced the removed column.
                log(f"Could not recreate index, skipping: {exc}")
    conn.commit()
    elapsed = time.time() - start
    log(f"Column removal finished in {elapsed:.2f} seconds")

def main():
    """
    Run the complete cleanup against the database at DB_PATH.

    Steps, in order:
      1. Delete titles with is_adult = 1, then remove the is_adult column
         (both skipped when the column is already gone).
      2. Delete titles whose premiered value is NULL or below 2000.
      3. Delete rows in crew, akas, episodes and ratings that reference a
         title no longer present, then people not referenced by crew.
      4. VACUUM the database.

    Raises:
        FileNotFoundError: if DB_PATH does not point to an existing file.

    The connection is closed even when one of the steps raises.
    """
    import os  # local import: only needed for the existence check below

    # sqlite3.connect() silently creates an empty database for a missing path;
    # fail early instead of erroring later with "no such table".
    if not os.path.isfile(DB_PATH):
        raise FileNotFoundError(f"Database file not found: {DB_PATH}")

    conn = sqlite3.connect(DB_PATH)
    try:
        log("Starting cleanup process...")

        # Step 1
        if column_exists(conn, "titles", "is_adult"):
            delete_in_batches(
                conn,
                f"SELECT rowid FROM titles WHERE is_adult = 1 LIMIT {BATCH_SIZE}",
                "titles", "rowid",
                "Step 1 (adult titles)"
            )
        else:
            log("Column is_adult not found, skipping deletion of adult titles...")
        drop_is_adult_column(conn)

        # Steps 2 and 3 as (select_sql, table, label). Order matters: titles
        # are pruned first, then the tables referencing titles, and people
        # last because that query looks at the already cleaned crew table.
        batch_deletes = [
            (
                f"SELECT rowid FROM titles WHERE premiered IS NULL OR premiered < 2000 LIMIT {BATCH_SIZE}",
                "titles",
                "Step 2 (premiered < 2000)",
            ),
            (
                f"SELECT rowid FROM crew WHERE title_id NOT IN (SELECT title_id FROM titles) LIMIT {BATCH_SIZE}",
                "crew",
                "Step 3a (crew)",
            ),
            (
                f"SELECT rowid FROM akas WHERE title_id NOT IN (SELECT title_id FROM titles) LIMIT {BATCH_SIZE}",
                "akas",
                "Step 3b (akas)",
            ),
            (
                f"""SELECT rowid FROM episodes
            WHERE episode_title_id NOT IN (SELECT title_id FROM titles)
               OR show_title_id NOT IN (SELECT title_id FROM titles)
            LIMIT {BATCH_SIZE}""",
                "episodes",
                "Step 3c (episodes)",
            ),
            (
                f"SELECT rowid FROM ratings WHERE title_id NOT IN (SELECT title_id FROM titles) LIMIT {BATCH_SIZE}",
                "ratings",
                "Step 3d (ratings)",
            ),
            (
                f"SELECT rowid FROM people "
                f"WHERE person_id NOT IN (SELECT person_id FROM crew) LIMIT {BATCH_SIZE}",
                "people",
                "Step 3e (people)",
            ),
        ]
        for select_sql, table, label in batch_deletes:
            delete_in_batches(conn, select_sql, table, "rowid", label)

        # Step 4
        log("Step 4: Running VACUUM to shrink the database file...")
        start = time.time()
        conn.execute("VACUUM")
        conn.commit()
        elapsed = time.time() - start
        log(f"Step 4: VACUUM finished in {elapsed:.2f} seconds")
    finally:
        conn.close()

    log("Cleanup process completed!")

# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
