Initial commit of document-service
This commit is contained in:
1
app/routers/__init__.py
Normal file
1
app/routers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from app.routers import documents
|
||||
355
app/routers/documents.py
Normal file
355
app/routers/documents.py
Normal file
@@ -0,0 +1,355 @@
|
||||
import os
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from app import s3, pdf, utils
|
||||
from app.config import settings
|
||||
from app.enums import DocumentType
|
||||
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
||||
from app.logger import get_logger
|
||||
|
||||
# All document endpoints in this module are mounted under /api/documents.
router = APIRouter(prefix="/api/documents", tags=["documents"])
# Module-level logger, named after this module (see app.logger.get_logger).
logger = get_logger(__name__)
|
||||
|
||||
@router.post("/upload", response_model=UploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Upload a new document.

    Detects the content and document type, validates the file size,
    stores the file in S3 under a freshly generated document id, and
    returns the document metadata plus a presigned download URL.

    Raises:
        HTTPException: 415 if the document type is unsupported,
            500 if the S3 upload fails.
    """
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")

    content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {content_type}")

    # Reject anything we cannot classify into a known document type.
    doc_type = utils.detect_document_type(file.filename, content_type)
    if not doc_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Measure the upload by seeking to the end of the underlying spooled
    # file, then rewind so the S3 upload reads from the beginning.
    file.file.seek(0, os.SEEK_END)
    size = file.file.tell()
    file.file.seek(0)
    utils.validate_file_size(size, doc_type)

    # Fresh identity and storage location for the new document.
    document_id = utils.generate_document_id()
    s3_key = utils.document_s3_key(org_id, document_id, utils.sanitize_filename(file.filename))

    # Object metadata stored alongside the S3 object (values are strings).
    object_metadata = {
        "org_id": org_id,
        "document_type": doc_type.value,
        "filename": file.filename,
        "file_size": str(size),
        "created_at": datetime.utcnow().isoformat()
    }

    try:
        s3.upload_file(file, s3_key, content_type, object_metadata)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")

    download_url = s3.presigned_download_url(s3_key)

    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=doc_type,
        filename=file.filename,
        content_type=content_type,
        file_size=size,
        s3_key=s3_key,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||
|
||||
@router.put("/{document_id}", response_model=UploadResponse)
async def rewrite_document(
    request: Request,
    document_id: str,
    file: UploadFile = File(...)
):
    """Rewrite/replace an existing document.

    Locates the existing S3 object by the org/document-id prefix (so the
    replacement may have a different filename than the original), verifies
    organization ownership, uploads the new content, and preserves the
    original creation timestamp in the stored metadata.

    Raises:
        HTTPException: 415 unsupported type, 404 document not found,
            403 organization mismatch, 500 on S3 failures.
    """
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")

    # Detect content type
    detected_content_type = utils.detect_content_type(file)

    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Get file size (seek to end, then rewind for the upload)
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size
    utils.validate_file_size(file_size, document_type)

    # Target key for the replacement content
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Locate the existing object by prefix, mirroring get_document():
    # the stored filename may differ from the replacement's, so the key
    # derived from the new filename cannot be used for the existence check.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    existing_key = response["Contents"][0]["Key"]

    # Verify org_id matches before touching the object
    existing_metadata = s3.get_file_metadata(existing_key)
    if existing_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Prepare metadata; carry created_at forward so rewrites do not
    # destroy the original creation timestamp stored on the object.
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "created_at": existing_metadata.get("created_at", datetime.utcnow().isoformat()),
        "updated_at": datetime.utcnow().isoformat()
    }

    # Upload to S3 (overwrites existing when the key is unchanged)
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File rewritten successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to rewrite file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")

    # If the replacement landed under a new key (filename changed), remove
    # the stale object so future prefix lookups resolve unambiguously.
    # Best-effort: the new object is already in place if this fails.
    if existing_key != s3_key:
        try:
            s3.delete_file(existing_key)
        except Exception as e:
            logger.error(f"Failed to delete stale object {existing_key}: {e}")

    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)

    # Create metadata response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Rewrite completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||
|
||||
@router.get("/{document_id}", response_model=DocumentMetadata)
async def get_document(request: Request, document_id: str):
    """Get document metadata.

    Resolves the document's S3 key by listing under its org/document-id
    prefix, enforces organization ownership, and assembles the metadata
    from the stored S3 object metadata plus a HEAD request.

    Raises:
        HTTPException: 404 document not found, 403 organization mismatch,
            500 on S3 failures.
    """
    org_id = request.state.org_id
    logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the S3 key via the document's prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Stored object metadata carries tenancy and document attributes.
    stored = s3.get_file_metadata(s3_key)
    if stored.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # HEAD the object for its content type and length.
    try:
        head = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except Exception as e:
        logger.error(f"Failed to get object info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    result = DocumentMetadata(
        document_id=document_id,
        org_id=stored.get("org_id"),
        document_type=DocumentType(stored.get("document_type")),
        filename=stored.get("filename"),
        content_type=head.get("ContentType"),
        file_size=int(stored.get("file_size", head.get("ContentLength", 0))),
        s3_key=s3_key,
        created_at=datetime.fromisoformat(stored.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.fromisoformat(stored.get("updated_at", datetime.utcnow().isoformat()))
    )

    logger.info(f"Get document completed - document_id: {document_id}")
    return result
|
||||
|
||||
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
    """Get presigned download URL.

    Resolves the document's S3 key, verifies organization ownership, and
    returns a presigned URL valid for `expires_in` seconds (default 3600).

    Raises:
        HTTPException: 404 document not found, 403 organization mismatch,
            500 on S3 failures.
    """
    org_id = request.state.org_id
    logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")

    # Find the object stored under this document's prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Tenancy check: the stored org must match the caller's.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    url = s3.presigned_download_url(s3_key, expires_in)

    logger.info(f"Download URL generated - document_id: {document_id}")
    return DownloadUrlResponse(download_url=url, s3_key=s3_key, expires_in=expires_in)
|
||||
|
||||
@router.get("/{document_id}/fields", response_model=FieldsResponse)
async def get_document_fields(request: Request, document_id: str):
    """Get PDF form fields (PDF only).

    Resolves the document's S3 key, verifies organization ownership,
    ensures the document is a PDF, then downloads it to a temp file and
    discovers its form fields. The temp file is always cleaned up.

    Raises:
        HTTPException: 404 document not found, 403 organization mismatch,
            400 if the document is not a PDF, 500 on S3/PDF failures.
    """
    org_id = request.state.org_id
    logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")

    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    s3_key = response["Contents"][0]["Key"]

    # Get metadata
    s3_metadata = s3.get_file_metadata(s3_key)

    # Verify org_id matches
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Field discovery only applies to PDFs
    document_type = s3_metadata.get("document_type")
    if document_type != DocumentType.PDF.value:
        logger.error(f"Document is not PDF: {document_type}")
        raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")

    # Download and discover fields. pdf_path is initialized to None so the
    # finally block cannot raise UnboundLocalError (masking the original
    # error) when download_to_temp() itself fails before assignment.
    pdf_path = None
    try:
        pdf_path = s3.download_to_temp(s3_key)
        fields = pdf.discover_fields(pdf_path)
        logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
    except Exception as e:
        logger.error(f"Failed to discover fields: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
    finally:
        # Always remove the temp file, on success and on failure.
        if pdf_path and os.path.exists(pdf_path):
            os.unlink(pdf_path)

    return FieldsResponse(
        document_id=document_id,
        document_type=DocumentType.PDF,
        fields=fields
    )
|
||||
|
||||
@router.delete("/{document_id}")
async def delete_document(request: Request, document_id: str):
    """Delete document.

    Resolves the document's S3 key by prefix, verifies it belongs to the
    caller's organization, and removes the object from S3.

    Raises:
        HTTPException: 404 document not found, 403 organization mismatch,
            500 on S3 failures.
    """
    org_id = request.state.org_id
    logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")

    # Locate the object stored under this document's prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Only the owning organization may delete.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    try:
        s3.delete_file(s3_key)
        logger.info(f"Document deleted - document_id: {document_id}")
    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")

    return {"message": "Document deleted successfully"}
|
||||
Reference in New Issue
Block a user