Add ISO-8601 dates from filenames in Paperless-ngx post-consume script

Paperless-ngx is a simply amazing document management tool. It has made managing the thousands of documents that I have in my collection an absolute breeze.
I have a lot of documents which have ISO-8601 dates in the filename. When I ingest them into Paperless-ngx, I want those dates to be assigned as the “Date created” for that document. In order to get Paperless-ngx to do this, I needed to create a post-consume script.
After finding this link to a Python script that more or less did what I wanted, I tweaked it with Claude 4 Sonnet’s help and it works well. It looks at the original filename of the ingested document and parses the date from it, if it exists, then makes that date the “Date created” for the document. Additionally, it will tag the document with the year that the document was created. It requires the PAPERLESS_API_TOKEN environment variable to be populated; PAPERLESS_URL (default http://localhost:8000) and PAPERLESS_TIMEOUT (default 10 seconds) are optional. Add the path to the script as the PAPERLESS_POST_CONSUME_SCRIPT environment variable.
#!/usr/bin/env python3
"""
Paperless-ngx Post-Process Script with Configurable Regex Patterns
Transforms document titles and dates based on original filename patterns
Uses environment variables for configuration

Inspired by https://github.com/paperless-ngx/paperless-ngx/discussions/7580
and tweaked with Claude 4 Sonnet
"""

import json
import os
import re
from datetime import date, datetime
from typing import Dict, List, Optional, Tuple

import requests


# Configuration: filename recognisers, tried in list order (first match wins).
# Every entry provides:
#   name            - label printed when the pattern matches
#   pattern         - regex applied to the original filename with re.match
#   title_transform - callable(match) returning the new document title
#   date_transform  - callable(match) returning the date as a string
#   date_format     - strptime format used to parse that date string
FILENAME_PATTERNS = [
    {
        # e.g. "Statement_2024-03-31_12345.pdf"
        "name": "Brokerage Statement Pattern",
        "pattern": r"^([^_]+)_(\d{4})-(\d{2})-(\d{2})_(\d+)\.(.+)$",
        "title_transform": lambda found: found.group(1),
        "date_transform": lambda found: "-".join(found.group(2, 3, 4)),
        "date_format": "%Y-%m-%d",
    },
    {
        # e.g. "2023-07-04 - Tax Return.pdf"
        "name": "Standard Date-Title Pattern",
        "pattern": r"^(\d{4}-\d{2}-\d{2}) - (.+)\.(.+)$",
        "title_transform": lambda found: found.group(2),
        "date_transform": lambda found: found.group(1),
        "date_format": "%Y-%m-%d",
    },
    {
        # e.g. "Invoice_20240115_Acme_42.pdf"
        "name": "Invoice Pattern",
        "pattern": r"^Invoice_(\d{4})(\d{2})(\d{2})_(.+)_(\d+)\.(.+)$",
        "title_transform": lambda found: "Invoice - " + found.group(4),
        "date_transform": lambda found: "-".join(found.group(1, 2, 3)),
        "date_format": "%Y-%m-%d",
    },
    {
        # e.g. "Chase_Statement_1-5-2024.pdf"; the two leading numeric fields
        # are zero-padded to two digits and emitted in their original order
        # after the year.
        "name": "Bank Statement Pattern",
        "pattern": r"^(.+)_Statement_(\d{1,2})-(\d{1,2})-(\d{4})\.(.+)$",
        "title_transform": lambda found: found.group(1) + " Statement",
        "date_transform": lambda found: "-".join(
            (found.group(4), found.group(2).zfill(2), found.group(3).zfill(2))
        ),
        "date_format": "%Y-%m-%d",
    },
]


def get_config_from_env():
    """Read the script's settings from environment variables.

    Variables read:
        PAPERLESS_URL: base URL of the Paperless-ngx instance; defaults to
            "http://localhost:8000". Trailing slashes are stripped because
            the rest of the script builds endpoints by appending paths that
            start with "/" (e.g. "/api/tags/").
        PAPERLESS_API_TOKEN: API token; None when unset.
        PAPERLESS_TIMEOUT: request timeout in seconds; defaults to 10.0.
            A value that cannot be parsed as a float falls back to the
            default after printing a warning.

    Returns:
        Tuple of (paperless_url, api_token, timeout).
    """
    default_timeout = 10.0

    paperless_url = os.getenv("PAPERLESS_URL", "http://localhost:8000").rstrip("/")
    api_token = os.getenv("PAPERLESS_API_TOKEN")

    raw_timeout = os.getenv("PAPERLESS_TIMEOUT", str(default_timeout))
    try:
        timeout = float(raw_timeout)
    except ValueError:
        # A typo in the environment should not abort with a traceback.
        print(
            f"Warning: invalid PAPERLESS_TIMEOUT '{raw_timeout}', "
            f"using default {default_timeout}s"
        )
        timeout = default_timeout

    return paperless_url, api_token, timeout


def _set_auth_headers(session: requests.Session, api_token: str):
    """Install token authentication and a JSON content type on the session.

    The headers are stored on the session itself, so they accompany every
    request subsequently made through it.
    """
    default_headers = {
        "Authorization": f"Token {api_token}",
        "Content-Type": "application/json",
    }
    session.headers.update(default_headers)


def match_filename_pattern(filename: str) -> Optional[Tuple[Dict, re.Match]]:
    """
    Find the first configured pattern whose regex matches the filename.

    Patterns are tried in the order they appear in FILENAME_PATTERNS.
    Returns (pattern_config, match_object), or None when nothing matches.
    """
    # Lazy generator: regexes after the first successful one are never run.
    attempts = (
        (config, re.match(config["pattern"], filename))
        for config in FILENAME_PATTERNS
    )
    return next(((config, hit) for config, hit in attempts if hit), None)


def extract_title_and_date(filename: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Derive a title and a date string from the filename.

    Returns (new_title, date_string, pattern_name) for the first matching
    pattern, or (None, None, None) when no pattern matches or a transform
    raises.
    """
    matched = match_filename_pattern(filename)
    if matched is None:
        return None, None, None

    config, hit = matched

    try:
        # Both transforms are user-supplied callables from the configuration.
        title = config["title_transform"](hit)
        date_text = config["date_transform"](hit)
    except Exception as e:
        print(f"Error applying pattern '{config['name']}': {e}")
        return None, None, None
    else:
        return title, date_text, config["name"]


def parse_date(date_string: str, date_format: str) -> Optional[date]:
    """Parse a date string with the given strptime format.

    Args:
        date_string: text to parse, e.g. "2024-01-15".
        date_format: strptime format string, e.g. "%Y-%m-%d".

    Returns:
        A ``datetime.date`` (the time component is dropped), or None when the
        string does not match the format or names an impossible date; the
        failure reason is printed.
    """
    try:
        return datetime.strptime(date_string, date_format).date()
    except ValueError as e:
        print(f"Failed to parse date '{date_string}' with format '{date_format}': {e}")
        return None


def get_or_create_year_tag(year: str, paperless_url: str, timeout: float, session: requests.Session) -> Optional[int]:
    """
    Get the ID of the tag named exactly `year`, creating the tag if needed.

    Args:
        year: tag name to look up or create (e.g. "2024").
        paperless_url: base URL of the Paperless-ngx instance.
        timeout: per-request timeout in seconds.
        session: authenticated requests session.

    Returns:
        The tag ID, or None if any request failed (details are printed).
    """
    tags_url = paperless_url + "/api/tags/"

    try:
        # Filter server-side with the `name__iexact` lookup and let requests
        # URL-encode the value. A bare `name=` query parameter does not appear
        # to be a filter the tags endpoint recognises, so it would return an
        # unfiltered, paginated listing and an existing tag could be missed.
        tags_resp = session.get(
            tags_url,
            params={"name__iexact": year},
            timeout=timeout
        )
        tags_resp.raise_for_status()
        tags_data = tags_resp.json()

        # Still insist on an exact, case-sensitive name before reusing a tag.
        for tag in tags_data.get("results", []):
            if tag['name'] == year:
                print(f"Found existing year tag '{year}' with ID: {tag['id']}")
                return tag['id']

        # Tag doesn't exist, create it
        print(f"No existing tag found, creating new tag '{year}'")
        create_resp = session.post(
            tags_url,
            data=json.dumps({
                "name": year,
                "color": "#007acc",  # Blue color for year tags
                "is_inbox_tag": False
            }),
            timeout=timeout
        )
        create_resp.raise_for_status()
        tag_id = create_resp.json()["id"]
        print(f"Created new year tag '{year}' with ID: {tag_id}")
        return tag_id

    except requests.exceptions.RequestException as e:
        print(f"Failed to get/create year tag '{year}': {e}")
        # HTTP errors carry the server's response; show it to aid debugging.
        response = getattr(e, 'response', None)
        if response is not None:
            print(f"Response status: {response.status_code}")
            print(f"Response text: {response.text}")
        return None


def add_year_tag_to_document(doc_pk: int, year: str, paperless_url: str, timeout: float, session: requests.Session) -> bool:
    """
    Ensure the document carries the tag for the given year.

    The tag is looked up (or created) first; the document's existing tags are
    then fetched and the year tag is appended unless it is already present.
    Returns True on success or when nothing had to change, False otherwise.
    """
    year_tag_id = get_or_create_year_tag(year, paperless_url, timeout, session)
    if not year_tag_id:
        return False

    document_url = paperless_url + f"/api/documents/{doc_pk}/"

    try:
        # Read the current tag list so the PATCH below extends it.
        fetched = session.get(document_url, timeout=timeout)
        fetched.raise_for_status()
        existing_tags = fetched.json().get("tags", [])

        if year_tag_id in existing_tags:
            print(f"Document {doc_pk} already has year tag '{year}'")
            return True

        # Send the full list: existing tags plus the year tag.
        payload = json.dumps({"tags": existing_tags + [year_tag_id]})
        patched = session.patch(document_url, data=payload, timeout=timeout)
        patched.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to add year tag '{year}' to document {doc_pk}: {e}")
        return False

    print(f"Document {doc_pk} - Added year tag '{year}'")
    return True


def test_api_connection(paperless_url: str, timeout: float, session: requests.Session) -> bool:
    """Check that the API is reachable and accepts the session's credentials.

    Requests a single-item page of documents; any request error or non-2xx
    status counts as failure. Returns True on success, False otherwise.
    """
    probe_url = paperless_url + "/api/documents/?page_size=1"
    try:
        session.get(probe_url, timeout=timeout).raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API connection failed: {e}")
        return False
    else:
        print("API connection successful")
        return True


def update_document(doc_pk: int, paperless_url: str, timeout: float, session: requests.Session):
    """Update a document's title, created date and year tag from its filename.

    Fetches the document, matches its original filename against
    FILENAME_PATTERNS, PATCHes any changed title / parsed date, and finally
    adds a tag named after the parsed year. All failures are printed rather
    than raised.

    Args:
        doc_pk: ID of the document to update.
        paperless_url: base URL of the Paperless-ngx instance.
        timeout: per-request timeout in seconds.
        session: authenticated requests session.
    """
    document_url = paperless_url + f"/api/documents/{doc_pk}/"

    # Get document info
    try:
        doc_info_resp = session.get(document_url, timeout=timeout)
        doc_info_resp.raise_for_status()
        doc_info = doc_info_resp.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch document {doc_pk}: {e}")
        return

    # The filename may be absent or null in the response; without it there is
    # nothing to match, and re.match would raise on a non-string.
    original_filename = doc_info.get("original_file_name")
    current_title = doc_info.get("title")

    if not original_filename:
        print(f"Document {doc_pk} - No original filename available, skipping")
        return

    print(f"Processing document {doc_pk}: {original_filename}")

    # Try to extract title and date from filename
    new_title, date_string, pattern_name = extract_title_and_date(original_filename)

    if not new_title and not date_string:
        print(f"Document {doc_pk} - No matching pattern found for: {original_filename}")
        return

    print(f"Document {doc_pk} - Matched pattern: {pattern_name}")

    # Prepare update data
    update_data = {}
    parsed_date = None

    # Update title if extracted
    if new_title and new_title != current_title:
        update_data["title"] = new_title
        print(f"Document {doc_pk} - Title will be updated to: {new_title}")

    # Update date if extracted and valid
    if date_string:
        # extract_title_and_date() does not return the date format, so look
        # the matching pattern up again to read it from its configuration.
        pattern_result = match_filename_pattern(original_filename)
        if pattern_result:
            pattern_config, _ = pattern_result
            parsed_date = parse_date(date_string, pattern_config["date_format"])

            if parsed_date:
                update_data["created"] = parsed_date.isoformat()
                print(f"Document {doc_pk} - Date will be updated to: {parsed_date}")
            else:
                print(f"Document {doc_pk} - Invalid date format: {date_string}")

    # Apply updates if any
    if update_data:
        try:
            resp = session.patch(
                document_url,
                data=json.dumps(update_data),
                timeout=timeout,
            )
            resp.raise_for_status()
            print(f"Document {doc_pk} - Successfully updated: {update_data}")

        except requests.exceptions.RequestException as e:
            # Stop here: the year tag is not added when the update failed.
            print(f"Document {doc_pk} - Failed to update: {e}")
            return
    else:
        print(f"Document {doc_pk} - No updates needed")

    # Add year tag if we have a valid date
    if parsed_date:
        year = str(parsed_date.year)
        add_year_tag_to_document(doc_pk, year, paperless_url, timeout, session)


if __name__ == "__main__":
    # Script entry point: reads settings and DOCUMENT_ID from the environment,
    # verifies API access, then processes that one document. Exits with
    # status 1 on any configuration, connection or unexpected error.
    # SystemExit is raised directly because the `exit()` helper is injected
    # by the `site` module and is not guaranteed to exist.

    # Get configuration from environment variables
    paperless_url, api_token, timeout = get_config_from_env()

    # Validate required environment variables
    if not api_token:
        print("Error: PAPERLESS_API_TOKEN environment variable is required")
        print("Set it with: export PAPERLESS_API_TOKEN=your_token_here")
        raise SystemExit(1)

    # Parse the document ID in its own try block so that a KeyError or
    # ValueError raised later, while processing the document, is not
    # misreported as a DOCUMENT_ID problem.
    try:
        doc_pk = int(os.environ["DOCUMENT_ID"])
    except KeyError:
        print("Error: DOCUMENT_ID environment variable not found")
        raise SystemExit(1)
    except ValueError:
        print("Error: DOCUMENT_ID is not a valid integer")
        raise SystemExit(1)

    print(f"Using Paperless URL: {paperless_url}")
    print(f"Using timeout: {timeout}s")

    try:
        with requests.Session() as sess:
            # Set authentication headers
            _set_auth_headers(sess, api_token)

            # Test API connection
            if not test_api_connection(paperless_url, timeout, sess):
                print("Exiting due to API connection failure")
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept it.
                raise SystemExit(1)

            update_document(doc_pk, paperless_url, timeout, sess)

    except Exception as e:
        print(f"Unexpected error: {e}")
        raise SystemExit(1)
Technicus

Add ISO-8601 dates from filenames in Paperless-ngx post-consume script

Leave a Reply Cancel reply

Nick's technical musings