# document-service/app/routers/documents.py

import os
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
from typing import Optional
from datetime import datetime
from app import s3, pdf, utils
from app.config import settings
from app.enums import DocumentType
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
from app.logger import get_logger

router = APIRouter(prefix="/api/documents", tags=["documents"])
logger = get_logger(__name__)
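
# Every handler below reads org_id (and optionally user_id) from request.state,
# so some upstream middleware is assumed to populate it. A minimal sketch of
# such middleware, using a hypothetical X-Org-Id header, might look like:
#
#     @app.middleware("http")
#     async def org_context(request: Request, call_next):
#         request.state.org_id = request.headers.get("X-Org-Id", "")
#         request.state.user_id = request.headers.get("X-User-Id", "system")
#         return await call_next(request)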
@router.post("/upload", response_model=UploadResponse)
async def upload_document(
request: Request,
file: UploadFile = File(...)
):
"""Upload a new document"""
org_id = request.state.org_id
user_id = getattr(request.state, "user_id", "system")
logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")
# Detect content type
detected_content_type = utils.detect_content_type(file)
logger.info(f"Detected content type: {detected_content_type}")
# Detect document type
document_type = utils.detect_document_type(file.filename, detected_content_type)
if not document_type:
logger.error(f"Unsupported document type: {file.filename}")
raise HTTPException(status_code=415, detail="Unsupported document type")
# Get file size
file.file.seek(0, os.SEEK_END)
file_size = file.file.tell()
file.file.seek(0)
# Validate file size
utils.validate_file_size(file_size, document_type)
# Generate document ID and S3 key
document_id = utils.generate_document_id()
sanitized_filename = utils.sanitize_filename(file.filename)
s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
# Prepare metadata
metadata_dict = {
"org_id": org_id,
"document_type": document_type.value,
"filename": sanitized_filename,
"file_size": str(file_size),
"created_at": datetime.utcnow().isoformat()
}
# Upload to S3
try:
s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
logger.info(f"File uploaded successfully: {s3_key}")
except Exception as e:
logger.error(f"Failed to upload file: {e}")
raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")
# Generate download URL
download_url = s3.presigned_download_url(s3_key)
# Create metadata response
metadata = DocumentMetadata(
document_id=document_id,
org_id=org_id,
document_type=document_type,
filename=sanitized_filename,
content_type=detected_content_type,
file_size=file_size,
s3_key=s3_key,
created_at=datetime.utcnow(),
updated_at=datetime.utcnow()
)
logger.info(f"Upload completed - document_id: {document_id}")
return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
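
# Example request (illustrative only; the X-Org-Id header is an assumption
# about how the middleware sketched above is fed):
#
#     curl -X POST http://localhost:8000/api/documents/upload \
#          -H "X-Org-Id: acme" \
#          -F "file=@invoice.pdf"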
@router.put("/{document_id}", response_model=UploadResponse)
async def rewrite_document(
request: Request,
document_id: str,
file: UploadFile = File(...)
):
"""Rewrite/replace an existing document"""
org_id = request.state.org_id
user_id = getattr(request.state, "user_id", "system")
logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")
# Detect content type
detected_content_type = utils.detect_content_type(file)
# Detect document type
document_type = utils.detect_document_type(file.filename, detected_content_type)
if not document_type:
raise HTTPException(status_code=415, detail="Unsupported document type")
# Get file size
file.file.seek(0, os.SEEK_END)
file_size = file.file.tell()
file.file.seek(0)
# Validate file size
utils.validate_file_size(file_size, document_type)
# Generate S3 key
sanitized_filename = utils.sanitize_filename(file.filename)
s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
# Check if document exists
if not s3.file_exists(s3_key):
logger.error(f"Document not found: {document_id}")
raise HTTPException(status_code=404, detail="Document not found")
# Verify org_id matches
existing_metadata = s3.get_file_metadata(s3_key)
if existing_metadata.get("org_id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch")
# Prepare metadata
metadata_dict = {
"org_id": org_id,
"document_type": document_type.value,
"filename": sanitized_filename,
"file_size": str(file_size),
"updated_at": datetime.utcnow().isoformat()
}
# Upload to S3 (overwrites existing)
try:
s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
logger.info(f"File rewritten successfully: {s3_key}")
except Exception as e:
logger.error(f"Failed to rewrite file: {e}")
raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")
# Generate download URL
download_url = s3.presigned_download_url(s3_key)
# Create metadata response
metadata = DocumentMetadata(
document_id=document_id,
org_id=org_id,
document_type=document_type,
filename=sanitized_filename,
content_type=detected_content_type,
file_size=file_size,
s3_key=s3_key,
created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
updated_at=datetime.utcnow()
)
logger.info(f"Rewrite completed - document_id: {document_id}")
return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
@router.get("/{document_id}", response_model=DocumentMetadata)
async def get_document(request: Request, document_id: str):
"""Get document metadata"""
org_id = request.state.org_id
logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")
# List objects to find the document
client = s3.get_client()
prefix = utils.s3_path_prefix(org_id, document_id)
try:
response = client.list_objects_v2(
Bucket=settings.s3_bucket,
Prefix=prefix,
MaxKeys=1
)
except Exception as e:
logger.error(f"Failed to list objects: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve document")
if not response.get("Contents"):
logger.error(f"Document not found: {document_id}")
raise HTTPException(status_code=404, detail="Document not found")
s3_key = response["Contents"][0]["Key"]
# Get metadata from S3
s3_metadata = s3.get_file_metadata(s3_key)
# Verify org_id matches
if s3_metadata.get("org_id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch")
# Get object info
try:
object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
except Exception as e:
logger.error(f"Failed to get object info: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve document")
# Create metadata response
metadata = DocumentMetadata(
document_id=document_id,
org_id=s3_metadata.get("org_id"),
document_type=DocumentType(s3_metadata.get("document_type")),
filename=s3_metadata.get("filename"),
content_type=object_info.get("ContentType"),
file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))),
s3_key=s3_key,
created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
)
logger.info(f"Get document completed - document_id: {document_id}")
return metadata
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
"""Get presigned download URL"""
org_id = request.state.org_id
logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")
# List objects to find the document
client = s3.get_client()
prefix = utils.s3_path_prefix(org_id, document_id)
try:
response = client.list_objects_v2(
Bucket=settings.s3_bucket,
Prefix=prefix,
MaxKeys=1
)
except Exception as e:
logger.error(f"Failed to list objects: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve document")
if not response.get("Contents"):
logger.error(f"Document not found: {document_id}")
raise HTTPException(status_code=404, detail="Document not found")
s3_key = response["Contents"][0]["Key"]
# Verify org_id matches
s3_metadata = s3.get_file_metadata(s3_key)
if s3_metadata.get("org_id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch")
# Generate download URL
download_url = s3.presigned_download_url(s3_key, expires_in)
logger.info(f"Download URL generated - document_id: {document_id}")
return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
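
# Example request (illustrative only): fetch a URL valid for 5 minutes via the
# expires_in query parameter.
#
#     curl "http://localhost:8000/api/documents/<document_id>/download-url?expires_in=300" \
#          -H "X-Org-Id: acme"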
@router.get("/{document_id}/fields", response_model=FieldsResponse)
async def get_document_fields(request: Request, document_id: str):
"""Get PDF form fields (PDF only)"""
org_id = request.state.org_id
logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")
# List objects to find the document
client = s3.get_client()
prefix = utils.s3_path_prefix(org_id, document_id)
try:
response = client.list_objects_v2(
Bucket=settings.s3_bucket,
Prefix=prefix,
MaxKeys=1
)
except Exception as e:
logger.error(f"Failed to list objects: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve document")
if not response.get("Contents"):
logger.error(f"Document not found: {document_id}")
raise HTTPException(status_code=404, detail="Document not found")
s3_key = response["Contents"][0]["Key"]
# Get metadata
s3_metadata = s3.get_file_metadata(s3_key)
# Verify org_id matches
if s3_metadata.get("org_id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch")
# Check if PDF
document_type = s3_metadata.get("document_type")
if document_type != DocumentType.PDF.value:
logger.error(f"Document is not PDF: {document_type}")
raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
# Download and discover fields
try:
pdf_path = s3.download_to_temp(s3_key)
fields = pdf.discover_fields(pdf_path)
logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
except Exception as e:
logger.error(f"Failed to discover fields: {e}")
raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
finally:
if os.path.exists(pdf_path):
os.unlink(pdf_path)
return FieldsResponse(
document_id=document_id,
document_type=DocumentType.PDF,
fields=fields
)
@router.delete("/{document_id}")
async def delete_document(request: Request, document_id: str):
"""Delete document"""
org_id = request.state.org_id
logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")
# List objects to find the document
client = s3.get_client()
prefix = utils.s3_path_prefix(org_id, document_id)
try:
response = client.list_objects_v2(
Bucket=settings.s3_bucket,
Prefix=prefix,
MaxKeys=1
)
except Exception as e:
logger.error(f"Failed to list objects: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve document")
if not response.get("Contents"):
logger.error(f"Document not found: {document_id}")
raise HTTPException(status_code=404, detail="Document not found")
s3_key = response["Contents"][0]["Key"]
# Verify org_id matches
s3_metadata = s3.get_file_metadata(s3_key)
if s3_metadata.get("org_id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch")
# Delete from S3
try:
s3.delete_file(s3_key)
logger.info(f"Document deleted - document_id: {document_id}")
except Exception as e:
logger.error(f"Failed to delete document: {e}")
raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")
return {"message": "Document deleted successfully"}