import os
from datetime import datetime

from fastapi import APIRouter, HTTPException, UploadFile, File, Request

from app import s3, pdf, utils
from app.enums import DocumentType
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
from app.logger import get_logger

router = APIRouter(prefix="/api/documents", tags=["documents"])
logger = get_logger(__name__)


@router.post("/upload", response_model=UploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Upload a new document"""
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")

    # Detect content type
    detected_content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {detected_content_type}")

    # Detect document type; reject anything unsupported
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Measure the file size by seeking to the end, then rewind for the upload
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size
    utils.validate_file_size(file_size, document_type)

    # Generate document ID and S3 key
    document_id = utils.generate_document_id()
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Prepare object metadata stored alongside the file in S3
    metadata_dict = {
        "org-id": org_id,
        "document-type": document_type.value,
        "filename": sanitized_filename,
        "file-size": str(file_size),
        "created-at": datetime.utcnow().isoformat()
    }

    # Upload to S3
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")

    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)

    # Build the metadata payload for the response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=sanitized_filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
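

# Illustrative client sketch, not used by the service itself: exercising the
# upload endpoint with httpx. The base URL and the X-Org-Id header are
# assumptions; the middleware that populates request.state.org_id is defined
# elsewhere in the app, so adapt the header to whatever that middleware
# actually expects. httpx is not a dependency of this module.
def _example_upload(path: str, base_url: str = "http://localhost:8000") -> dict:
    import httpx  # local import so the router itself does not require httpx

    with open(path, "rb") as fh:
        resp = httpx.post(
            f"{base_url}/api/documents/upload",
            files={"file": (os.path.basename(path), fh, "application/pdf")},
            headers={"X-Org-Id": "org-123"},  # hypothetical tenancy header
        )
    resp.raise_for_status()
    return resp.json()  # contains document_id, metadata, download_url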
): """Rewrite/replace an existing document""" org_id = request.state.org_id user_id = getattr(request.state, "user_id", "system") logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}") # Detect content type detected_content_type = utils.detect_content_type(file) # Detect document type document_type = utils.detect_document_type(file.filename, detected_content_type) if not document_type: raise HTTPException(status_code=415, detail="Unsupported document type") # Get file size file.file.seek(0, os.SEEK_END) file_size = file.file.tell() file.file.seek(0) # Validate file size utils.validate_file_size(file_size, document_type) # Generate S3 key sanitized_filename = utils.sanitize_filename(file.filename) s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename) # Check if document exists if not s3.file_exists(s3_key): logger.error(f"Document not found: {document_id}") raise HTTPException(status_code=404, detail="Document not found") # Verify org_id matches existing_metadata = s3.get_file_metadata(s3_key) if existing_metadata.get("org-id") != org_id: logger.error(f"Organization mismatch for document: {document_id}") raise HTTPException(status_code=403, detail="Organization mismatch") # Prepare metadata metadata_dict = { "org-id": org_id, "document-type": document_type.value, "filename": sanitized_filename, "file-size": str(file_size), "updated-at": datetime.utcnow().isoformat() } # Upload to S3 (overwrites existing) try: s3.upload_file(file, s3_key, detected_content_type, metadata_dict) logger.info(f"File rewritten successfully: {s3_key}") except Exception as e: logger.error(f"Failed to rewrite file: {e}") raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}") # Generate download URL download_url = s3.presigned_download_url(s3_key) # Create metadata response metadata = DocumentMetadata( document_id=document_id, org_id=org_id, document_type=document_type, filename=sanitized_filename, content_type=detected_content_type, file_size=file_size, s3_key=s3_key, created_at=datetime.fromisoformat(existing_metadata.get("created-at", datetime.utcnow().isoformat())), updated_at=datetime.utcnow() ) logger.info(f"Rewrite completed - document_id: {document_id}") return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url) @router.get("/{document_id}", response_model=DocumentMetadata) async def get_document(request: Request, document_id: str): """Get document metadata""" org_id = request.state.org_id logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}") # List objects to find the document client = s3.get_client() prefix = utils.s3_path_prefix(org_id, document_id) try: response = client.list_objects_v2( Bucket=s3.get_bucket_name(), Prefix=prefix, MaxKeys=1 ) except Exception as e: logger.error(f"Failed to list objects: {e}") raise HTTPException(status_code=500, detail="Failed to retrieve document") if not response.get("Contents"): logger.error(f"Document not found: {document_id}") raise HTTPException(status_code=404, detail="Document not found") s3_key = response["Contents"][0]["Key"] # Get metadata from S3 s3_metadata = s3.get_file_metadata(s3_key) # Verify org_id matches if s3_metadata.get("org-id") != org_id: logger.error(f"Organization mismatch for document: {document_id}") raise HTTPException(status_code=403, detail="Organization mismatch") # Get object info try: object_info = client.head_object(Bucket=s3.get_bucket_name(), Key=s3_key) except Exception as e: 
logger.error(f"Failed to get object info: {e}") raise HTTPException(status_code=500, detail="Failed to retrieve document") # Create metadata response metadata = DocumentMetadata( document_id=document_id, org_id=s3_metadata.get("org-id"), document_type=DocumentType(s3_metadata.get("document-type")), filename=s3_metadata.get("filename"), content_type=object_info.get("ContentType"), file_size=int(s3_metadata.get("file-size", object_info.get("ContentLength", 0))), s3_key=s3_key, created_at=datetime.fromisoformat(s3_metadata.get("created-at", datetime.utcnow().isoformat())), updated_at=datetime.fromisoformat(s3_metadata.get("updated-at", datetime.utcnow().isoformat())) ) logger.info(f"Get document completed - document_id: {document_id}") return metadata @router.get("/{document_id}/download-url", response_model=DownloadUrlResponse) async def get_download_url(request: Request, document_id: str, expires_in: int = 3600): """Get presigned download URL""" org_id = request.state.org_id logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}") # List objects to find the document client = s3.get_client() prefix = utils.s3_path_prefix(org_id, document_id) try: response = client.list_objects_v2( Bucket=s3.get_bucket_name(), Prefix=prefix, MaxKeys=1 ) except Exception as e: logger.error(f"Failed to list objects: {e}") raise HTTPException(status_code=500, detail="Failed to retrieve document") if not response.get("Contents"): logger.error(f"Document not found: {document_id}") raise HTTPException(status_code=404, detail="Document not found") s3_key = response["Contents"][0]["Key"] # Verify org_id matches s3_metadata = s3.get_file_metadata(s3_key) if s3_metadata.get("org-id") != org_id: logger.error(f"Organization mismatch for document: {document_id}") raise HTTPException(status_code=403, detail="Organization mismatch") # Generate download URL download_url = s3.presigned_download_url(s3_key, expires_in) logger.info(f"Download URL generated - document_id: {document_id}") return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in) @router.get("/{document_id}/fields", response_model=FieldsResponse) async def get_document_fields(request: Request, document_id: str): """Get PDF form fields (PDF only)""" org_id = request.state.org_id logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}") # List objects to find the document client = s3.get_client() prefix = utils.s3_path_prefix(org_id, document_id) try: response = client.list_objects_v2( Bucket=s3.get_bucket_name(), Prefix=prefix, MaxKeys=1 ) except Exception as e: logger.error(f"Failed to list objects: {e}") raise HTTPException(status_code=500, detail="Failed to retrieve document") if not response.get("Contents"): logger.error(f"Document not found: {document_id}") raise HTTPException(status_code=404, detail="Document not found") s3_key = response["Contents"][0]["Key"] # Get metadata s3_metadata = s3.get_file_metadata(s3_key) # Verify org_id matches if s3_metadata.get("org-id") != org_id: logger.error(f"Organization mismatch for document: {document_id}") raise HTTPException(status_code=403, detail="Organization mismatch") # Check if PDF document_type = s3_metadata.get("document-type") if document_type != DocumentType.PDF.value: logger.error(f"Document is not PDF: {document_type}") raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents") # Download and discover fields try: pdf_path = s3.download_to_temp(s3_key) fields = pdf.discover_fields(pdf_path) 
logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}") except Exception as e: logger.error(f"Failed to discover fields: {e}") raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}") finally: if os.path.exists(pdf_path): os.unlink(pdf_path) return FieldsResponse( document_id=document_id, document_type=DocumentType.PDF, fields=fields ) @router.delete("/{document_id}") async def delete_document(request: Request, document_id: str): """Delete document""" org_id = request.state.org_id logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}") # List objects to find the document client = s3.get_client() prefix = utils.s3_path_prefix(org_id, document_id) try: response = client.list_objects_v2( Bucket=s3.get_bucket_name(), Prefix=prefix, MaxKeys=1 ) except Exception as e: logger.error(f"Failed to list objects: {e}") raise HTTPException(status_code=500, detail="Failed to retrieve document") if not response.get("Contents"): logger.error(f"Document not found: {document_id}") raise HTTPException(status_code=404, detail="Document not found") s3_key = response["Contents"][0]["Key"] # Verify org_id matches s3_metadata = s3.get_file_metadata(s3_key) if s3_metadata.get("org-id") != org_id: logger.error(f"Organization mismatch for document: {document_id}") raise HTTPException(status_code=403, detail="Organization mismatch") # Delete from S3 try: s3.delete_file(s3_key) logger.info(f"Document deleted - document_id: {document_id}") except Exception as e: logger.error(f"Failed to delete file: {e}") raise HTTPException(status_code=500, detail=f"Failed to delete file: {e}") return {"message": "Document deleted successfully"}