Initial commit of document-service

This commit is contained in:
2026-04-23 16:20:58 -05:00
commit 51d60f0032
30 changed files with 4357 additions and 0 deletions

0
app/__init__.py Normal file
View File

31
app/config.py Normal file
View File

@@ -0,0 +1,31 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Service configuration, loaded from environment variables / .env.

    Field names map to environment variables case-insensitively
    (pydantic BaseSettings behavior); defaults target a local MinIO
    development setup.
    """
    # S3 settings
    # NOTE(review): "minioadmin" is MinIO's well-known dev default —
    # these credentials must be overridden in any real deployment.
    s3_endpoint: str = "http://localhost:9000"
    s3_access_key: str = "minioadmin"
    s3_secret_key: str = "minioadmin"
    s3_bucket: str = "document-bucket"
    s3_region: str = "us-east-1"
    # Service settings
    host: str = "0.0.0.0"
    port: int = 8082
    # File size limits (bytes), keyed per document type in utils.get_file_size_limit
    max_file_size_pdf: int = 50 * 1024 * 1024  # 50MB
    max_file_size_docx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_xlsx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_jpg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_jpeg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_png: int = 10 * 1024 * 1024  # 10MB
    max_file_size_gif: int = 10 * 1024 * 1024  # 10MB
    max_file_size_default: int = 10 * 1024 * 1024  # 10MB
    # Logging
    log_level: str = "INFO"

    class Config:
        # v1-style pydantic config: read overrides from a local .env file
        env_file = ".env"

# Module-level singleton imported throughout the app
settings = Settings()

38
app/enums.py Normal file
View File

@@ -0,0 +1,38 @@
from enum import Enum
class DocumentType(str, Enum):
    """Supported document types, valued by canonical file extension."""
    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    JPG = "jpg"
    JPEG = "jpeg"
    PNG = "png"
    GIF = "gif"

    @classmethod
    def from_mime_type(cls, mime_type: str) -> "DocumentType | None":
        """Map a MIME type to a DocumentType, or None if unsupported.

        MIME parameters (e.g. ``; charset=utf-8``) are stripped before the
        lookup.  The return is Optional — the previous annotation claimed a
        non-optional DocumentType, but ``dict.get`` yields None on a miss.
        Note "image/jpeg" deliberately maps to JPG (not JPEG).
        """
        # "type/subtype; key=value" -> "type/subtype"
        base = mime_type.split(";", 1)[0].strip().lower()
        mapping = {
            "application/pdf": cls.PDF,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": cls.DOCX,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": cls.XLSX,
            "image/jpeg": cls.JPG,
            "image/png": cls.PNG,
            "image/gif": cls.GIF,
        }
        return mapping.get(base)

    @classmethod
    def from_extension(cls, filename: str) -> "DocumentType | None":
        """Map a filename's extension (case-insensitive) to a DocumentType.

        Returns None for unsupported extensions.
        """
        ext = filename.rsplit(".", 1)[-1].lower()
        mapping = {
            "pdf": cls.PDF,
            "docx": cls.DOCX,
            "xlsx": cls.XLSX,
            "jpg": cls.JPG,
            "jpeg": cls.JPEG,
            "png": cls.PNG,
            "gif": cls.GIF,
        }
        return mapping.get(ext)

13
app/logger.py Normal file
View File

@@ -0,0 +1,13 @@
import logging
from app.config import settings
def setup_logging():
    """Configure the root logger once at process start.

    The level name comes from settings.log_level; an invalid name raises
    AttributeError from the getattr lookup on the logging module.
    """
    logging.basicConfig(
        level=getattr(logging, settings.log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under *name* in the standard hierarchy."""
    named_logger = logging.getLogger(name)
    return named_logger

82
app/main.py Normal file
View File

@@ -0,0 +1,82 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.openapi.utils import get_openapi
from app.routers import documents
from app.config import settings
from app.logger import setup_logging
from app.middleware.auth import AuthMiddleware
# Setup logging
setup_logging()

# FastAPI application; OpenAPI JSON is served at a non-default path.
app = FastAPI(
    title="Document Service",
    version="1.0.0",
    description="Generic document management service with S3 storage and PDF field discovery",
    openapi_url="/openapi3.json",
    docs_url="/docs",
    redoc_url="/redoc"
)
# Add auth middleware
# NOTE(review): in Starlette the middleware added last runs outermost, so
# CORSMiddleware (below) wraps AuthMiddleware — confirm this ordering is intended.
app.add_middleware(AuthMiddleware)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_methods=["*"],
    allow_headers=["*"]
)
app.include_router(documents.router)
@app.on_event("startup")
async def startup_event():
    """Initialize external dependencies at application startup.

    Ensures the S3 bucket exists; any failure is re-raised so the service
    refuses to come up in a broken state.
    """
    from app import s3
    from app.logger import get_logger
    log = get_logger(__name__)
    log.info("Starting up document service...")
    try:
        s3.ensure_bucket_exists()
    except Exception as e:
        log.error(f"Failed to initialize S3 bucket: {e}")
        # Re-raise so startup aborts instead of running without storage.
        raise
    else:
        log.info("S3 bucket initialization complete")
@app.get("/health", tags=["health"])
def health():
    """Liveness probe endpoint."""
    return dict(status="ok")
@app.get("/health/ready", tags=["health"])
def health_ready():
    """Readiness probe endpoint for Kubernetes."""
    return dict(status="ready")
def custom_openapi():
    """Build the OpenAPI schema once, cache it on the app, and return it.

    Adds an explicit ``servers`` entry pointing at the local dev port.
    """
    if not app.openapi_schema:
        schema = get_openapi(
            title="Document Service",
            version="1.0.0",
            openapi_version="3.1.0",
            description="Generic document management service with S3 storage and PDF field discovery",
            routes=app.routes
        )
        schema["servers"] = [
            {"url": "http://localhost:8082", "description": "Local dev"}
        ]
        app.openapi_schema = schema
    return app.openapi_schema

# Replace FastAPI's default schema builder with the cached custom one.
app.openapi = custom_openapi

View File

16
app/middleware/auth.py Normal file
View File

@@ -0,0 +1,16 @@
from fastapi import Request
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse
from app.logger import get_logger
logger = get_logger(__name__)
class AuthMiddleware(BaseHTTPMiddleware):
    """Attaches the caller's organization id to ``request.state.org_id``.

    NOTE(review): this is a stub — org_id is hard-coded to "test". Real
    credential validation must replace this before production use.
    """

    # Paths that must stay reachable without auth; main.py exposes both
    # /health and /health/ready (Kubernetes probes send no credentials).
    _EXEMPT_PATHS = ("/health", "/health/ready")

    async def dispatch(self, request: Request, call_next):
        # Skip auth for health endpoints. Previously only /health was
        # exempted even though /health/ready exists for readiness probes.
        if request.url.path in self._EXEMPT_PATHS:
            return await call_next(request)
        request.state.org_id = "test"
        response = await call_next(request)
        return response

30
app/models.py Normal file
View File

@@ -0,0 +1,30 @@
from pydantic import BaseModel, Field
from datetime import datetime
from typing import Optional
from app.enums import DocumentType
class DocumentMetadata(BaseModel):
    """Metadata describing a stored document, mirrored from S3 object metadata."""
    document_id: str = Field(..., description="UUID of the document")
    org_id: str = Field(..., description="Organization ID")
    document_type: DocumentType = Field(..., description="Type of document")
    filename: str = Field(..., description="Original filename")
    content_type: str = Field(..., description="MIME type")
    file_size: int = Field(..., description="File size in bytes")
    s3_key: str = Field(..., description="S3 key for the document")
    # NOTE(review): datetime.utcnow() yields naive timestamps and is deprecated
    # since Python 3.12 — consider datetime.now(timezone.utc); verify callers.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
class UploadResponse(BaseModel):
    """Response for upload/rewrite endpoints: metadata plus a presigned URL."""
    document_id: str
    metadata: DocumentMetadata
    download_url: str
class DownloadUrlResponse(BaseModel):
    """Response carrying a presigned S3 download URL and its expiry in seconds."""
    download_url: str
    s3_key: str
    expires_in: int
class FieldsResponse(BaseModel):
    """Response for PDF form-field discovery (see app.pdf.discover_fields)."""
    document_id: str
    document_type: DocumentType
    # Each dict has keys: field, label, type, required, options
    fields: list[dict]

105
app/pdf.py Normal file
View File

@@ -0,0 +1,105 @@
import os
from pypdf import PdfReader
from typing import Any
def discover_fields(pdf_path: str) -> list[dict]:
    """Introspect a PDF and return all fillable AcroForm fields.

    Tries three strategies in order: pypdf's ``get_fields()``, walking the
    document-level ``/AcroForm`` dictionary, then scanning page annotations
    for widget entries.  Returns an empty list when no fields are found.

    Each entry is a dict with keys: field, label, type, required, options.

    Fixed: diagnostics previously went to stdout via print(); library code
    now reports through the standard logging module instead.
    """
    import logging
    logger = logging.getLogger(__name__)
    reader = PdfReader(pdf_path)
    fields = None
    # Method 1: pypdf's high-level accessor.
    try:
        fields = reader.get_fields()
    except Exception as e:
        logger.warning("get_fields() failed: %s", e)
        fields = None
    # Method 2: walk the /AcroForm dictionary directly.
    if not fields:
        try:
            if "/AcroForm" in reader.trailer["/Root"]:
                acroform = reader.trailer["/Root"]["/AcroForm"]
                if "/Fields" in acroform:
                    fields = {}
                    for field_ref in acroform["/Fields"]:
                        try:
                            field_obj = field_ref.get_object()
                            field_name = field_obj.get("/T", "")
                            if field_name:
                                fields[field_name] = field_obj
                        except Exception as e:
                            logger.warning("Error processing field: %s", e)
                            continue
        except Exception as e:
            logger.warning("Direct AcroForm access failed: %s", e)
            fields = None
    # Method 3: scan page annotations for form widgets.
    if not fields:
        try:
            fields = {}
            for page in reader.pages:
                if "/Annots" in page:
                    for annot in page["/Annots"]:
                        try:
                            annot_obj = annot.get_object()
                            if "/Subtype" in annot_obj and annot_obj["/Subtype"] == "/Widget":
                                field_name = annot_obj.get("/T", "")
                                if field_name and field_name not in fields:
                                    fields[field_name] = annot_obj
                        except Exception as e:
                            logger.warning("Error processing annotation: %s", e)
                            continue
        except Exception as e:
            logger.warning("Page annotation access failed: %s", e)
            fields = None
    if not fields:
        return []
    result = []
    for field_name, field_obj in fields.items():
        try:
            field_type = field_obj.get("/FT", "")
            options = []
            # /Ch = choice field (select/dropdown); /Opt entries are either a
            # plain string or an [export_value, display_value] pair.
            if field_type == "/Ch":
                opt = field_obj.get("/Opt", [])
                if opt:
                    options = [o if isinstance(o, str) else o[1] for o in opt]
            result.append({
                "field": field_name,
                "label": field_name.replace("_", " ").title(),
                "type": _map_field_type(field_type, field_obj),
                "required": False,
                "options": options if options else None
            })
        except Exception as e:
            logger.warning("Error processing field %s: %s", field_name, e)
            continue
    return result
def _map_field_type(ft: str, field_obj: dict) -> str:
mapping = {
"/Tx": "string",
"/Btn": "boolean",
"/Ch": "select",
"/Sig": "string"
}
base = mapping.get(ft, "string")
# Check if it's a date field by name hint
field_name = field_obj.get("/T", "").lower()
if any(hint in field_name for hint in ["date", "fecha", "birth", "nacimiento"]):
return "date"
return base

1
app/routers/__init__.py Normal file
View File

@@ -0,0 +1 @@
from app.routers import documents

355
app/routers/documents.py Normal file
View File

@@ -0,0 +1,355 @@
import os
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
from typing import Optional
from datetime import datetime
from app import s3, pdf, utils
from app.config import settings
from app.enums import DocumentType
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
from app.logger import get_logger
router = APIRouter(prefix="/api/documents", tags=["documents"])
logger = get_logger(__name__)
@router.post("/upload", response_model=UploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Upload a new document.

    Sniffs the MIME/document type from the file content, enforces the
    per-type size limit, stores the bytes in S3 under a fresh UUID key,
    and returns metadata plus a presigned download URL.

    Raises:
        HTTPException: 415 unsupported type, 413 via validate_file_size,
            500 on S3 upload failure.
    """
    # org_id is attached by AuthMiddleware; user_id is only logged.
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")
    # Detect content type (from file bytes, not the client-sent header)
    detected_content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {detected_content_type}")
    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")
    # Get file size (seek to end, tell, rewind for the later upload)
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)
    # Validate file size
    utils.validate_file_size(file_size, document_type)
    # Generate document ID and S3 key
    document_id = utils.generate_document_id()
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
    # Prepare metadata (stored as S3 user metadata; values must be strings)
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "created_at": datetime.utcnow().isoformat()
    }
    # Upload to S3
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")
    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)
    # Create metadata response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow()
    )
    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
@router.put("/{document_id}", response_model=UploadResponse)
async def rewrite_document(
    request: Request,
    document_id: str,
    file: UploadFile = File(...)
):
    """Rewrite/replace an existing document in place.

    Validates type/size like upload, verifies the object exists and belongs
    to the caller's organization, then overwrites it in S3.

    Raises:
        HTTPException: 415 unsupported type, 404 missing, 403 org mismatch,
            500 on S3 failure (413 via validate_file_size).
    """
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")
    # Detect content type
    detected_content_type = utils.detect_content_type(file)
    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        raise HTTPException(status_code=415, detail="Unsupported document type")
    # Get file size
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)
    # Validate file size
    utils.validate_file_size(file_size, document_type)
    # Generate S3 key
    # NOTE(review): the key embeds the *new* upload's sanitized filename. If it
    # differs from the originally stored filename, file_exists() below misses
    # and this returns 404 even though the document exists under the prefix —
    # other handlers locate the object by prefix instead. TODO confirm intent.
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
    # Check if document exists
    if not s3.file_exists(s3_key):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    # Verify org_id matches
    existing_metadata = s3.get_file_metadata(s3_key)
    if existing_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Prepare metadata
    # NOTE(review): this dict omits created_at, so overwriting drops the
    # original creation time from the stored S3 metadata — verify.
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "updated_at": datetime.utcnow().isoformat()
    }
    # Upload to S3 (overwrites existing)
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File rewritten successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to rewrite file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")
    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)
    # Create metadata response (created_at preserved from the old object's
    # metadata when present, otherwise falls back to "now")
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.utcnow()
    )
    logger.info(f"Rewrite completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
@router.get("/{document_id}", response_model=DocumentMetadata)
async def get_document(request: Request, document_id: str):
    """Get document metadata.

    Finds the object stored under the document's S3 prefix, verifies it
    belongs to the caller's organization, and rebuilds DocumentMetadata
    from the object's S3 user metadata and HEAD response.

    Raises:
        HTTPException: 404 if nothing is stored under the prefix, 403 on
            organization mismatch, 500 on S3 errors.
    """
    org_id = request.state.org_id
    logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")
    # List objects to find the document
    # (MaxKeys=1 — assumes at most one object per document prefix)
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = response["Contents"][0]["Key"]
    # Get metadata from S3
    s3_metadata = s3.get_file_metadata(s3_key)
    # Verify org_id matches
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Get object info
    try:
        object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except Exception as e:
        logger.error(f"Failed to get object info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    # Create metadata response
    # NOTE(review): created_at/updated_at fall back to "now" when missing from
    # stored metadata, which silently fabricates timestamps — confirm intended.
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=s3_metadata.get("org_id"),
        document_type=DocumentType(s3_metadata.get("document_type")),
        filename=s3_metadata.get("filename"),
        content_type=object_info.get("ContentType"),
        file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))),
        s3_key=s3_key,
        created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
    )
    logger.info(f"Get document completed - document_id: {document_id}")
    return metadata
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
    """Return a presigned S3 download URL for the document (default 1h expiry)."""
    org_id = request.state.org_id
    logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")
    # Locate the single object stored under this document's prefix.
    try:
        listing = s3.get_client().list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=utils.s3_path_prefix(org_id, document_id),
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]
    # Tenancy check: stored org_id must match the caller's.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    download_url = s3.presigned_download_url(s3_key, expires_in)
    logger.info(f"Download URL generated - document_id: {document_id}")
    return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
@router.get("/{document_id}/fields", response_model=FieldsResponse)
async def get_document_fields(request: Request, document_id: str):
    """Discover fillable AcroForm fields of a stored PDF document.

    Raises:
        HTTPException: 404 not found, 403 org mismatch, 400 non-PDF,
            500 on S3 or field-discovery failures.
    """
    org_id = request.state.org_id
    logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")
    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = response["Contents"][0]["Key"]
    # Get metadata and verify tenancy
    s3_metadata = s3.get_file_metadata(s3_key)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Field discovery is only meaningful for PDFs
    document_type = s3_metadata.get("document_type")
    if document_type != DocumentType.PDF.value:
        logger.error(f"Document is not PDF: {document_type}")
        raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
    # Download to a temp file, introspect, and always clean the temp file up.
    # BUGFIX: pdf_path was previously unbound in `finally` when
    # download_to_temp() raised, turning the real error into a NameError.
    pdf_path = None
    try:
        pdf_path = s3.download_to_temp(s3_key)
        fields = pdf.discover_fields(pdf_path)
        logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
    except Exception as e:
        logger.error(f"Failed to discover fields: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
    finally:
        if pdf_path and os.path.exists(pdf_path):
            os.unlink(pdf_path)
    return FieldsResponse(
        document_id=document_id,
        document_type=DocumentType.PDF,
        fields=fields
    )
@router.delete("/{document_id}")
async def delete_document(request: Request, document_id: str):
    """Delete the document's stored object after a tenancy check."""
    org_id = request.state.org_id
    logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")
    # Locate the object stored under this document's prefix.
    try:
        listing = s3.get_client().list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=utils.s3_path_prefix(org_id, document_id),
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]
    # Only the owning organization may delete.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    try:
        s3.delete_file(s3_key)
    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")
    logger.info(f"Document deleted - document_id: {document_id}")
    return {"message": "Document deleted successfully"}

101
app/s3.py Normal file
View File

@@ -0,0 +1,101 @@
import boto3
import tempfile
import os
from botocore.client import Config
from fastapi import UploadFile
from app.config import settings
from app.logger import get_logger
logger = get_logger(__name__)
def get_client():
    """Construct a boto3 S3 client from the service settings (MinIO-compatible)."""
    client_kwargs = dict(
        endpoint_url=settings.s3_endpoint,
        aws_access_key_id=settings.s3_access_key,
        aws_secret_access_key=settings.s3_secret_key,
        config=Config(signature_version="s3v4"),
        region_name=settings.s3_region,
    )
    return boto3.client("s3", **client_kwargs)
def ensure_bucket_exists() -> None:
    """Ensure the S3 bucket exists, create it if it doesn't exist.

    Raises:
        Exception: If bucket creation fails (service will fail to start)
    """
    client = get_client()
    try:
        client.head_bucket(Bucket=settings.s3_bucket)
        logger.info(f"Bucket '{settings.s3_bucket}' already exists")
    except client.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            try:
                # BUGFIX: AWS rejects an explicit LocationConstraint of
                # us-east-1 (the configured default region); the
                # CreateBucketConfiguration block must be omitted there.
                create_kwargs = {"Bucket": settings.s3_bucket}
                if settings.s3_region != "us-east-1":
                    create_kwargs["CreateBucketConfiguration"] = {
                        'LocationConstraint': settings.s3_region
                    }
                client.create_bucket(**create_kwargs)
                logger.info(f"Created bucket '{settings.s3_bucket}'")
            except Exception as create_error:
                logger.error(f"Failed to create bucket '{settings.s3_bucket}': {create_error}")
                raise
        else:
            # Any non-404 HEAD failure (403, throttling, ...) is fatal.
            logger.error(f"Error checking bucket: {e}")
            raise
def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict = None) -> str:
    """Stream an uploaded file into S3 with content type and user metadata.

    Returns the S3 key the object was stored under.
    """
    extra_args = {"ContentType": content_type}
    if metadata:
        extra_args["Metadata"] = metadata
    get_client().upload_fileobj(file.file, settings.s3_bucket, s3_key, ExtraArgs=extra_args)
    return s3_key
def delete_file(s3_key: str) -> None:
    """Remove the object at *s3_key* from the configured bucket."""
    get_client().delete_object(Bucket=settings.s3_bucket, Key=s3_key)
def file_exists(s3_key: str) -> bool:
    """Return True when an object exists at *s3_key* (i.e. HEAD succeeds)."""
    client = get_client()
    try:
        client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except client.exceptions.ClientError:
        # HEAD failed — treat any client error (404 and the like) as absent.
        return False
    return True
def get_file_metadata(s3_key: str) -> dict:
    """Fetch the user metadata dict stored on the object at *s3_key*."""
    head = get_client().head_object(Bucket=settings.s3_bucket, Key=s3_key)
    return head.get("Metadata", {})
def download_to_temp(s3_key: str) -> str:
    """Download the object at *s3_key* into a named temp file.

    The key's extension is preserved on the temp file. Returns the path;
    the caller is responsible for deleting the file when done.

    Raises:
        Exception: propagated from the S3 download; the partially written
            temp file is removed first (previously it leaked on failure).
    """
    client = get_client()
    suffix = os.path.splitext(s3_key)[-1] or ".tmp"
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with tmp:
            client.download_fileobj(settings.s3_bucket, s3_key, tmp)
    except Exception:
        # BUGFIX: clean up the orphaned temp file before re-raising.
        os.unlink(tmp.name)
        raise
    return tmp.name
def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
    """Create a time-limited GET URL for the object at *s3_key* (seconds)."""
    params = {"Bucket": settings.s3_bucket, "Key": s3_key}
    return get_client().generate_presigned_url(
        "get_object", Params=params, ExpiresIn=expires_in
    )

66
app/utils.py Normal file
View File

@@ -0,0 +1,66 @@
import uuid
import magic
from fastapi import HTTPException, UploadFile
from app.config import settings
from app.enums import DocumentType
def generate_document_id() -> str:
    """Return a fresh random UUID4 string to identify a document."""
    return f"{uuid.uuid4()}"
def s3_path_prefix(org_id: str, document_id: str) -> str:
    """Build the S3 key prefix under which a document's objects live."""
    return "/".join(("documents", org_id, document_id)) + "/"
def detect_content_type(file: UploadFile) -> str:
    """Sniff the MIME type from the first 2KB of the upload via python-magic.

    The stream is rewound to the start afterwards so callers can re-read it.
    """
    file.file.seek(0)
    head = file.file.read(2048)
    file.file.seek(0)
    return magic.Magic(mime=True).from_buffer(head)
def detect_document_type(filename: str, content_type: str) -> DocumentType:
    """Resolve a DocumentType, preferring the MIME type over the extension.

    Returns None when neither the MIME type nor the extension is supported.
    """
    by_mime = DocumentType.from_mime_type(content_type)
    if by_mime is not None:
        return by_mime
    # Fall back to the filename's extension.
    return DocumentType.from_extension(filename)
def get_file_size_limit(document_type: DocumentType) -> int:
    """Return the configured max upload size in bytes for *document_type*."""
    per_type_limits = {
        DocumentType.PDF: settings.max_file_size_pdf,
        DocumentType.DOCX: settings.max_file_size_docx,
        DocumentType.XLSX: settings.max_file_size_xlsx,
        DocumentType.JPG: settings.max_file_size_jpg,
        DocumentType.JPEG: settings.max_file_size_jpeg,
        DocumentType.PNG: settings.max_file_size_png,
        DocumentType.GIF: settings.max_file_size_gif,
    }
    # Unknown types fall back to the conservative default limit.
    return per_type_limits.get(document_type, settings.max_file_size_default)
def validate_file_size(file_size: int, document_type: DocumentType) -> None:
    """Raise HTTP 413 when *file_size* exceeds the per-type limit."""
    max_size = get_file_size_limit(document_type)
    if file_size <= max_size:
        return
    raise HTTPException(
        status_code=413,
        detail=f"File size {file_size} exceeds maximum {max_size} for {document_type.value}"
    )
def document_s3_key(org_id: str, document_id: str, filename: str) -> str:
    """Build the full S3 object key for a document: prefix + filename.

    BUGFIX: the previous version ignored the *filename* parameter entirely
    and appended the literal string "(unknown)", so every upload for a
    document collided on the same key and the original name was lost.
    """
    return f"{s3_path_prefix(org_id, document_id)}{filename}"
def sanitize_filename(filename: str) -> str:
    """Make a filename safe for use in an S3 key.

    Path separators become underscores; every character outside a
    conservative set (ASCII alphanumerics plus ``.-_``) is dropped.
    """
    allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_")
    cleaned = filename.replace("/", "_").replace("\\", "_")
    return "".join(ch for ch in cleaned if ch in allowed)