"""Document management API routes: upload, rewrite, metadata lookup, presigned download URLs, PDF field discovery, and deletion."""
import os
|
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Request
|
|
from datetime import datetime
|
|
|
|
from app import s3, pdf, utils
|
|
from app.config import settings
|
|
from app.enums import DocumentType
|
|
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
|
from app.logger import get_logger
|
|
|
|
# Router for all document endpoints, mounted under /api/documents.
router = APIRouter(prefix="/api/documents", tags=["documents"])

# Module-level logger, named after this module.
logger = get_logger(__name__)
|
|
|
|
@router.post("/upload", response_model=UploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Upload a new document.

    Detects the content and document type, validates the file size, stores
    the file in S3 under a freshly generated document ID, and returns the
    stored metadata together with a presigned download URL.

    Raises:
        HTTPException: 415 if the document type is unsupported,
            500 if the S3 upload fails. utils.validate_file_size may
            raise as well on size violations.
    """
    org_id = request.state.org_id
    # Middleware may not set user_id (e.g. service-to-service calls).
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")

    # Detect content type from the uploaded file itself, not the client header.
    detected_content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {detected_content_type}")

    # Map filename/content type to a supported DocumentType.
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Determine the file size by seeking to the end, then rewind for upload.
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size (raises on violation).
    utils.validate_file_size(file_size, document_type)

    # Generate the document ID and the S3 object key.
    document_id = utils.generate_document_id()
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Single timestamp so the stored S3 metadata and the response agree
    # (previously created_at/updated_at came from two utcnow() calls and
    # could differ by microseconds).
    now = datetime.utcnow()

    # S3 object metadata — all values must be strings.
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": sanitized_filename,
        "file_size": str(file_size),
        "created_at": now.isoformat()
    }

    # Upload to S3.
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")

    # Presigned URL so the client can fetch the file directly from S3.
    download_url = s3.presigned_download_url(s3_key)

    # Build the metadata payload returned to the caller.
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=sanitized_filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=now,
        updated_at=now
    )

    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
|
|
|
@router.put("/{document_id}", response_model=UploadResponse)
async def rewrite_document(
    request: Request,
    document_id: str,
    file: UploadFile = File(...)
):
    """Rewrite/replace an existing document.

    Validates the new file, checks that the document exists and belongs to
    the caller's organization, then overwrites the S3 object. The original
    creation timestamp is preserved in the stored metadata.

    Raises:
        HTTPException: 415 unsupported type, 404 not found,
            403 organization mismatch, 500 upload failure.
    """
    org_id = request.state.org_id
    # Middleware may not set user_id (e.g. service-to-service calls).
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")

    # Detect content type from the uploaded file itself, not the client header.
    detected_content_type = utils.detect_content_type(file)

    # Map filename/content type to a supported DocumentType.
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Determine the file size by seeking to the end, then rewind for upload.
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size (raises on violation).
    utils.validate_file_size(file_size, document_type)

    # Rebuild the S3 key from the (possibly different) replacement filename.
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # NOTE(review): the key embeds the new filename, so replacing a document
    # with a differently-named file will 404 here — confirm this is intended.
    if not s3.file_exists(s3_key):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    # Reject access across organizations.
    existing_metadata = s3.get_file_metadata(s3_key)
    if existing_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Single timestamp so the stored S3 metadata and the response agree.
    now = datetime.utcnow()
    # Preserve the original creation time: S3 replaces object metadata
    # wholesale on overwrite, so created_at would otherwise be lost
    # (and later reads would fall back to "now").
    created_at_iso = existing_metadata.get("created_at", now.isoformat())

    # S3 object metadata — all values must be strings.
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": sanitized_filename,
        "file_size": str(file_size),
        "created_at": created_at_iso,
        "updated_at": now.isoformat()
    }

    # Upload to S3 (overwrites the existing object and its metadata).
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File rewritten successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to rewrite file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")

    # Presigned URL so the client can fetch the file directly from S3.
    download_url = s3.presigned_download_url(s3_key)

    # Build the metadata payload returned to the caller.
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=sanitized_filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.fromisoformat(created_at_iso),
        updated_at=now
    )

    logger.info(f"Rewrite completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
|
|
|
@router.get("/{document_id}", response_model=DocumentMetadata)
async def get_document(request: Request, document_id: str):
    """Return the stored metadata for a single document.

    Resolves the document's S3 key from its org/document prefix, enforces
    organization ownership, and combines the custom S3 metadata with the
    object's system attributes (content type, length).
    """
    org_id = request.state.org_id
    logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the document's S3 key by listing under its org/document prefix.
    s3_client = s3.get_client()
    key_prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        listing = s3_client.list_objects_v2(
            Bucket=settings.s3_bucket, Prefix=key_prefix, MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Custom metadata stored alongside the object.
    s3_metadata = s3.get_file_metadata(s3_key)

    # Reject access across organizations.
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # HEAD the object for its system attributes.
    try:
        head = s3_client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except Exception as e:
        logger.error(f"Failed to get object info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    # Missing timestamps fall back to "now"; file_size prefers the stored
    # metadata over the object's ContentLength.
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=s3_metadata.get("org_id"),
        document_type=DocumentType(s3_metadata.get("document_type")),
        filename=s3_metadata.get("filename"),
        content_type=head.get("ContentType"),
        file_size=int(s3_metadata.get("file_size", head.get("ContentLength", 0))),
        s3_key=s3_key,
        created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
    )

    logger.info(f"Get document completed - document_id: {document_id}")
    return metadata
|
|
|
|
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
    """Return a presigned S3 download URL for the document.

    The URL expires after `expires_in` seconds (default one hour).
    """
    org_id = request.state.org_id
    logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the document's S3 key by listing under its org/document prefix.
    s3_client = s3.get_client()
    key_prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        listing = s3_client.list_objects_v2(
            Bucket=settings.s3_bucket, Prefix=key_prefix, MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Reject access across organizations.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Generate the presigned URL with the requested lifetime.
    download_url = s3.presigned_download_url(s3_key, expires_in)

    logger.info(f"Download URL generated - document_id: {document_id}")
    return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
|
|
|
|
@router.get("/{document_id}/fields", response_model=FieldsResponse)
async def get_document_fields(request: Request, document_id: str):
    """Get PDF form fields (PDF only).

    Locates the document, verifies ownership and that it is a PDF, then
    downloads it to a temporary file and discovers its form fields. The
    temporary file is always removed.

    Raises:
        HTTPException: 404 not found, 403 organization mismatch,
            400 non-PDF document, 500 S3/field-discovery failure.
    """
    org_id = request.state.org_id
    logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the document's S3 key by listing under its org/document prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    s3_key = response["Contents"][0]["Key"]

    # Custom metadata stored alongside the object.
    s3_metadata = s3.get_file_metadata(s3_key)

    # Reject access across organizations.
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Field discovery is only meaningful for PDFs.
    document_type = s3_metadata.get("document_type")
    if document_type != DocumentType.PDF.value:
        logger.error(f"Document is not PDF: {document_type}")
        raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")

    # Download to a temp file and discover fields; always clean up the file.
    # pdf_path must be pre-initialized: if download_to_temp() raises, the
    # finally block would otherwise hit an UnboundLocalError that masks the
    # real error.
    pdf_path = None
    try:
        pdf_path = s3.download_to_temp(s3_key)
        fields = pdf.discover_fields(pdf_path)
        logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
    except Exception as e:
        logger.error(f"Failed to discover fields: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
    finally:
        if pdf_path and os.path.exists(pdf_path):
            os.unlink(pdf_path)

    return FieldsResponse(
        document_id=document_id,
        document_type=DocumentType.PDF,
        fields=fields
    )
|
|
|
|
@router.delete("/{document_id}")
async def delete_document(request: Request, document_id: str):
    """Delete a document after verifying organization ownership."""
    org_id = request.state.org_id
    logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the document's S3 key by listing under its org/document prefix.
    s3_client = s3.get_client()
    key_prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        listing = s3_client.list_objects_v2(
            Bucket=settings.s3_bucket, Prefix=key_prefix, MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Reject access across organizations.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Remove the object from S3.
    try:
        s3.delete_file(s3_key)
        logger.info(f"Document deleted - document_id: {document_id}")
    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")

    return {"message": "Document deleted successfully"}
|