Paperless-ngx is a simply amazing document management tool. It has made managing the thousands of documents that I have in my collection an absolute breeze.
I have a lot of documents with ISO 8601 dates in their filenames. When I ingest them into Paperless-ngx, I want those dates to be assigned as the “Date created” for each document. To get Paperless-ngx to do this, I needed to create a post-consume script.
After finding this link to a Python script that more or less did what I wanted, I tweaked it with Claude 4 Sonnet’s help, and it works well. It looks at the original filename of the ingested document and, if the filename contains a date, parses it and makes that date the “Date created” for the document. Additionally, it tags the document with the year in which the document was created. It requires the PAPERLESS_API_TOKEN environment variable to be populated. Add the path to the script as the PAPERLESS_POST_CONSUME_SCRIPT environment variable.
#!/usr/bin/env python3
"""
Paperless-ngx Post-Process Script with Configurable Regex Patterns
Transforms document titles and dates based on original filename patterns
Uses environment variables for configuration
Inspired by https://github.com/paperless-ngx/paperless-ngx/discussions/7580
and tweaked with Claude 4 Sonnet
"""
import json
import os
import re
from datetime import date, datetime
from typing import Dict, List, Optional, Tuple

import requests
# Configuration: filename patterns, tried in list order (first match wins).
# Each entry provides:
#   name            - label printed in the log output
#   pattern         - regex applied to the original filename with re.match
#   title_transform - callable(match) -> new document title
#   date_transform  - callable(match) -> date string
#   date_format     - strptime format used to parse that date string
FILENAME_PATTERNS = [
    {
        # <type>_<YYYY>-<MM>-<DD>_<digits>.<ext>
        "name": "Brokerage Statement Pattern",
        "pattern": r"^([^_]+)_(\d{4})-(\d{2})-(\d{2})_(\d+)\.(.+)$",
        "title_transform": lambda m: m[1],  # Document type
        "date_transform": lambda m: "-".join((m[2], m[3], m[4])),  # YYYY-MM-DD
        "date_format": "%Y-%m-%d",
    },
    {
        # <YYYY-MM-DD> - <title>.<ext>
        "name": "Standard Date-Title Pattern",
        "pattern": r"^(\d{4}-\d{2}-\d{2}) - (.+)\.(.+)$",
        "title_transform": lambda m: m[2],  # Title part
        "date_transform": lambda m: m[1],  # Date part
        "date_format": "%Y-%m-%d",
    },
    {
        # Invoice_<YYYYMMDD>_<name>_<digits>.<ext>
        "name": "Invoice Pattern",
        "pattern": r"^Invoice_(\d{4})(\d{2})(\d{2})_(.+)_(\d+)\.(.+)$",
        "title_transform": lambda m: "Invoice - " + m[4],
        "date_transform": lambda m: "-".join((m[1], m[2], m[3])),
        "date_format": "%Y-%m-%d",
    },
    {
        # <name>_Statement_<M>-<D>-<YYYY>.<ext>; the second and third groups
        # are zero-padded and placed in the month and day positions.
        "name": "Bank Statement Pattern",
        "pattern": r"^(.+)_Statement_(\d{1,2})-(\d{1,2})-(\d{4})\.(.+)$",
        "title_transform": lambda m: m[1] + " Statement",
        "date_transform": lambda m: "-".join((m[4], m[2].zfill(2), m[3].zfill(2))),
        "date_format": "%Y-%m-%d",
    },
]
def get_config_from_env():
    """Read the script configuration from environment variables.

    Variables read:
        PAPERLESS_URL: base URL of the server (default
            ``http://localhost:8000``). Trailing slashes are removed so that
            appending ``/api/...`` never produces a double slash.
        PAPERLESS_API_TOKEN: API token; ``None`` when unset.
        PAPERLESS_TIMEOUT: request timeout in seconds (default ``10.0``).
            An unset, empty, non-numeric, or non-positive value falls back
            to the default, with a warning printed for invalid values.

    Returns:
        Tuple ``(paperless_url, api_token, timeout)``.
    """
    default_timeout = 10.0

    paperless_url = os.getenv("PAPERLESS_URL", "http://localhost:8000").rstrip("/")
    api_token = os.getenv("PAPERLESS_API_TOKEN")

    timeout = default_timeout
    raw_timeout = os.getenv("PAPERLESS_TIMEOUT")
    if raw_timeout is not None and raw_timeout.strip():
        try:
            candidate = float(raw_timeout)
        except ValueError:
            candidate = None
        # `not candidate > 0` also rejects NaN, which compares false to everything.
        if candidate is None or not candidate > 0:
            print(
                f"Warning: invalid PAPERLESS_TIMEOUT '{raw_timeout}', "
                f"using default {default_timeout}s"
            )
        else:
            timeout = candidate

    return paperless_url, api_token, timeout
def _set_auth_headers(session: requests.Session, api_token: str):
    """Install the default request headers on *session*.

    Sets ``Authorization`` to ``Token <api_token>`` and ``Content-Type`` to
    ``application/json`` in the session's header mapping, so every request
    made through the session carries them.
    """
    default_headers = session.headers
    default_headers["Authorization"] = f"Token {api_token}"
    default_headers["Content-Type"] = "application/json"
def match_filename_pattern(filename: str) -> Optional[Tuple[Dict, re.Match]]:
    """Find the first configured pattern that matches *filename*.

    The entries of ``FILENAME_PATTERNS`` are tried in list order with
    ``re.match``; evaluation stops at the first hit.

    Returns:
        ``(pattern_config, match_object)`` for the first matching entry, or
        ``None`` when no pattern matches.
    """
    # Lazy generator: each regex is only evaluated until one of them matches.
    attempts = (
        (config, re.match(config["pattern"], filename))
        for config in FILENAME_PATTERNS
    )
    return next(((config, hit) for config, hit in attempts if hit), None)
def extract_title_and_date(filename: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Derive a title and a date string from *filename*.

    The filename is matched against the configured patterns and the matching
    entry's ``title_transform`` and ``date_transform`` callables are applied
    to the match object.

    Returns:
        ``(new_title, date_string, pattern_name)``, or
        ``(None, None, None)`` when no pattern matches or a transform raises
        (the error is printed).
    """
    found = match_filename_pattern(filename)
    if not found:
        return None, None, None

    pattern_config, hit = found
    try:
        # Tuple elements are evaluated left to right: title, date, then name.
        return (
            pattern_config["title_transform"](hit),
            pattern_config["date_transform"](hit),
            pattern_config["name"],
        )
    except Exception as e:
        print(f"Error applying pattern '{pattern_config['name']}': {e}")
        return None, None, None
def parse_date(date_string: str, date_format: str) -> Optional[date]:
    """Parse *date_string* with the ``strptime`` format *date_format*.

    Returns:
        A ``datetime.date`` (the time component is discarded), or ``None``
        when the string does not match the format, names an impossible
        calendar date, or is not a string at all. The failure reason is
        printed.
    """
    try:
        return datetime.strptime(date_string, date_format).date()
    except (TypeError, ValueError) as e:
        # ValueError: bad format or impossible date; TypeError: non-string input.
        print(f"Failed to parse date '{date_string}' with format '{date_format}': {e}")
        return None
def get_or_create_year_tag(
    year: str,
    paperless_url: str,
    timeout: float,
    session: requests.Session,
) -> Optional[int]:
    """Return the ID of the tag named *year*, creating the tag if needed.

    Args:
        year: Tag name to look up or create (e.g. ``"2024"``).
        paperless_url: Base URL of the server, without a trailing slash.
        timeout: Per-request timeout in seconds.
        session: Authenticated session used for the API calls.

    Returns:
        The tag ID, or ``None`` if a request failed or the server returned
        an unexpected payload (details are printed).
    """
    tags_url = paperless_url + "/api/tags/"
    try:
        # Ask the server for an exact (case-insensitive) name match. A bare
        # `name=` query is not among the documented tag filters; if the
        # server ignores it, the response is the first page of *all* tags
        # and an existing year tag further down would be missed.
        tags_resp = session.get(
            tags_url,
            params={"name__iexact": year},
            timeout=timeout,
        )
        tags_resp.raise_for_status()
        results = tags_resp.json().get("results") or []

        # Still compare client side so a looser server-side match can never
        # hand back the wrong tag.
        for tag in results:
            if tag.get("name") == year:
                print(f"Found existing year tag '{year}' with ID: {tag['id']}")
                return tag["id"]

        # Tag doesn't exist, create it
        print(f"No existing tag found, creating new tag '{year}'")
        create_resp = session.post(
            tags_url,
            data=json.dumps({
                "name": year,
                "color": "#007acc",  # Blue color for year tags
                "is_inbox_tag": False,
            }),
            timeout=timeout,
        )
        create_resp.raise_for_status()
        tag_id = create_resp.json()["id"]
        print(f"Created new year tag '{year}' with ID: {tag_id}")
        return tag_id
    except requests.exceptions.RequestException as e:
        print(f"Failed to get/create year tag '{year}': {e}")
        response = getattr(e, "response", None)
        if response is not None:
            print(f"Response status: {response.status_code}")
            print(f"Response text: {response.text}")
        return None
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Malformed JSON or a payload without the expected keys/shape.
        print(f"Unexpected response while getting/creating year tag '{year}': {e!r}")
        return None
def add_year_tag_to_document(
    doc_pk: int,
    year: str,
    paperless_url: str,
    timeout: float,
    session: requests.Session,
) -> bool:
    """Make sure document *doc_pk* carries the tag named *year*.

    The tag is looked up (or created) first; the document's current tag list
    is then fetched and, unless the tag is already present, patched with the
    tag appended.

    Returns:
        ``True`` if the document has the tag afterwards, ``False`` if the
        tag could not be obtained or a request failed.
    """
    tag_id = get_or_create_year_tag(year, paperless_url, timeout, session)
    if not tag_id:
        return False

    document_url = paperless_url + f"/api/documents/{doc_pk}/"
    try:
        # Read the tags currently assigned to the document.
        doc_resp = session.get(document_url, timeout=timeout)
        doc_resp.raise_for_status()
        current_tags = doc_resp.json().get("tags", [])

        # Nothing to do when the tag is already assigned.
        if tag_id in current_tags:
            print(f"Document {doc_pk} already has year tag '{year}'")
            return True

        # Send the full list: existing tags plus the year tag.
        payload = json.dumps({"tags": current_tags + [tag_id]})
        update_resp = session.patch(document_url, data=payload, timeout=timeout)
        update_resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to add year tag '{year}' to document {doc_pk}: {e}")
        return False

    print(f"Document {doc_pk} - Added year tag '{year}'")
    return True
def test_api_connection(paperless_url: str, timeout: float, session: requests.Session) -> bool:
    """Check that the API is reachable and accepts the session's credentials.

    Requests a single-item page of the documents endpoint and treats any
    request error or error status as a failure.

    Returns:
        ``True`` on success, ``False`` otherwise (the error is printed).
    """
    probe_url = paperless_url + "/api/documents/?page_size=1"
    try:
        session.get(probe_url, timeout=timeout).raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API connection failed: {e}")
        return False

    print("API connection successful")
    return True
def update_document(doc_pk: int, paperless_url: str, timeout: float, session: requests.Session):
    """Update a document's title, creation date and year tag from its filename.

    Steps:
        1. Fetch the document and read its original filename and title.
        2. Match the filename against the configured patterns.
        3. PATCH the title and/or ``created`` date when they differ or parse.
        4. Add a tag named after the parsed year.

    Failures are printed and end processing for this document. If the
    title/date PATCH fails, the year tag is not added.

    Args:
        doc_pk: Primary key of the document to update.
        paperless_url: Base URL of the server, without a trailing slash.
        timeout: Per-request timeout in seconds.
        session: Authenticated session used for the API calls.
    """
    document_url = paperless_url + f"/api/documents/{doc_pk}/"

    # Get document info
    try:
        doc_info_resp = session.get(document_url, timeout=timeout)
        doc_info_resp.raise_for_status()
        doc_info = doc_info_resp.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch document {doc_pk}: {e}")
        return

    # Use .get() so an absent field cannot raise a KeyError that callers
    # might misattribute, and bail out early on a missing/null filename,
    # which would otherwise raise TypeError inside the regex matching.
    original_filename = doc_info.get("original_file_name")
    current_title = doc_info.get("title")
    if not original_filename:
        print(f"Document {doc_pk} - No original filename available, skipping")
        return

    print(f"Processing document {doc_pk}: {original_filename}")

    # Try to extract title and date from filename
    new_title, date_string, pattern_name = extract_title_and_date(original_filename)
    if not new_title and not date_string:
        print(f"Document {doc_pk} - No matching pattern found for: {original_filename}")
        return
    print(f"Document {doc_pk} - Matched pattern: {pattern_name}")

    # Prepare update data
    update_data = {}
    parsed_date = None

    # Update title if extracted
    if new_title and new_title != current_title:
        update_data["title"] = new_title
        print(f"Document {doc_pk} - Title will be updated to: {new_title}")

    # Update date if extracted and valid
    if date_string:
        # extract_title_and_date() does not return the date format, so the
        # matching pattern config is looked up again to obtain it.
        pattern_result = match_filename_pattern(original_filename)
        if pattern_result:
            pattern_config, _ = pattern_result
            parsed_date = parse_date(date_string, pattern_config["date_format"])
        if parsed_date:
            update_data["created"] = parsed_date.isoformat()
            print(f"Document {doc_pk} - Date will be updated to: {parsed_date}")
        else:
            print(f"Document {doc_pk} - Invalid date format: {date_string}")

    # Apply updates if any
    if update_data:
        try:
            resp = session.patch(
                document_url,
                data=json.dumps(update_data),
                timeout=timeout,
            )
            resp.raise_for_status()
            print(f"Document {doc_pk} - Successfully updated: {update_data}")
        except requests.exceptions.RequestException as e:
            print(f"Document {doc_pk} - Failed to update: {e}")
            return
    else:
        print(f"Document {doc_pk} - No updates needed")

    # Add year tag if we have a valid date
    if parsed_date:
        year = str(parsed_date.year)
        add_year_tag_to_document(doc_pk, year, paperless_url, timeout, session)
def main() -> int:
    """Run the post-consume update for the document named by DOCUMENT_ID.

    Configuration and DOCUMENT_ID are validated before any network traffic,
    so a configuration mistake is reported as such instead of being confused
    with an error raised while processing the document.

    Returns:
        Process exit code: ``0`` on completion, ``1`` on a configuration
        error, API connection failure, or unexpected exception.
    """
    # Get configuration from environment variables
    try:
        paperless_url, api_token, timeout = get_config_from_env()
    except ValueError as e:
        print(f"Error: invalid configuration value: {e}")
        return 1

    # Validate required environment variables
    if not api_token:
        print("Error: PAPERLESS_API_TOKEN environment variable is required")
        print("Set it with: export PAPERLESS_API_TOKEN=your_token_here")
        return 1

    print(f"Using Paperless URL: {paperless_url}")
    print(f"Using timeout: {timeout}s")

    # Get document ID from environment. Only this lookup/conversion is
    # guarded, so KeyError/ValueError raised later are never mislabelled
    # as DOCUMENT_ID problems.
    raw_doc_pk = os.environ.get("DOCUMENT_ID")
    if raw_doc_pk is None:
        print("Error: DOCUMENT_ID environment variable not found")
        return 1
    try:
        doc_pk = int(raw_doc_pk)
    except ValueError:
        print("Error: DOCUMENT_ID is not a valid integer")
        return 1

    try:
        with requests.Session() as sess:
            # Set authentication headers
            _set_auth_headers(sess, api_token)

            # Test API connection
            if not test_api_connection(paperless_url, timeout, sess):
                print("Exiting due to API connection failure")
                return 1

            update_document(doc_pk, paperless_url, timeout, sess)
    except Exception as e:
        # Top-level boundary: report anything unforeseen and signal failure.
        print(f"Unexpected error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # SystemExit instead of exit(): exit() is injected by the `site` module
    # and is not guaranteed to be available.
    raise SystemExit(main())