Don't use _ in metadata
All checks were successful
Build and Publish / build-release (push) Successful in 49s

This commit is contained in:
2026-04-24 15:16:36 -05:00
parent c952063d7b
commit 1550fc7599
2 changed files with 48 additions and 53 deletions

View File

@@ -3,7 +3,6 @@ from fastapi import APIRouter, HTTPException, UploadFile, File, Request
from datetime import datetime from datetime import datetime
from app import s3, pdf, utils from app import s3, pdf, utils
from app.config import settings
from app.enums import DocumentType from app.enums import DocumentType
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
from app.logger import get_logger from app.logger import get_logger
@@ -44,13 +43,13 @@ async def upload_document(
sanitized_filename = utils.sanitize_filename(file.filename) sanitized_filename = utils.sanitize_filename(file.filename)
s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename) s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
# Prepare metadata # Prepare metadata
metadata_dict = { metadata_dict = {
"org_id": org_id, "org-id": org_id,
"document_type": document_type.value, "document-type": document_type.value,
"filename": sanitized_filename, "filename": sanitized_filename,
"file_size": str(file_size), "file-size": str(file_size),
"created_at": datetime.utcnow().isoformat() "created-at": datetime.utcnow().isoformat()
} }
# Upload to S3 # Upload to S3
@@ -66,15 +65,15 @@ async def upload_document(
# Create metadata response # Create metadata response
metadata = DocumentMetadata( metadata = DocumentMetadata(
document_id=document_id, document_id=document_id,
org_id=org_id, org_id=org_id,
document_type=document_type, document_type=document_type,
filename=sanitized_filename, filename=sanitized_filename,
content_type=detected_content_type, content_type=detected_content_type,
file_size=file_size, file_size=file_size,
s3_key=s3_key, s3_key=s3_key,
created_at=datetime.utcnow(), created_at=datetime.utcnow(),
updated_at=datetime.utcnow() updated_at=datetime.utcnow()
) )
logger.info(f"Upload completed - document_id: {document_id}") logger.info(f"Upload completed - document_id: {document_id}")
@@ -118,17 +117,17 @@ async def rewrite_document(
# Verify org_id matches # Verify org_id matches
existing_metadata = s3.get_file_metadata(s3_key) existing_metadata = s3.get_file_metadata(s3_key)
if existing_metadata.get("org_id") != org_id: if existing_metadata.get("org-id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}") logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch") raise HTTPException(status_code=403, detail="Organization mismatch")
# Prepare metadata # Prepare metadata
metadata_dict = { metadata_dict = {
"org_id": org_id, "org-id": org_id,
"document_type": document_type.value, "document-type": document_type.value,
"filename": sanitized_filename, "filename": sanitized_filename,
"file_size": str(file_size), "file-size": str(file_size),
"updated_at": datetime.utcnow().isoformat() "updated-at": datetime.utcnow().isoformat()
} }
# Upload to S3 (overwrites existing) # Upload to S3 (overwrites existing)
@@ -142,16 +141,16 @@ async def rewrite_document(
# Generate download URL # Generate download URL
download_url = s3.presigned_download_url(s3_key) download_url = s3.presigned_download_url(s3_key)
# Create metadata response # Create metadata response
metadata = DocumentMetadata( metadata = DocumentMetadata(
document_id=document_id, document_id=document_id,
org_id=org_id, org_id=org_id,
document_type=document_type, document_type=document_type,
filename=sanitized_filename, filename=sanitized_filename,
content_type=detected_content_type, content_type=detected_content_type,
file_size=file_size, file_size=file_size,
s3_key=s3_key, s3_key=s3_key,
created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())), created_at=datetime.fromisoformat(existing_metadata.get("created-at", datetime.utcnow().isoformat())),
updated_at=datetime.utcnow() updated_at=datetime.utcnow()
) )
@@ -170,7 +169,7 @@ async def get_document(request: Request, document_id: str):
try: try:
response = client.list_objects_v2( response = client.list_objects_v2(
Bucket=settings.s3_bucket, Bucket=s3.get_bucket_name(),
Prefix=prefix, Prefix=prefix,
MaxKeys=1 MaxKeys=1
) )
@@ -188,13 +187,13 @@ async def get_document(request: Request, document_id: str):
s3_metadata = s3.get_file_metadata(s3_key) s3_metadata = s3.get_file_metadata(s3_key)
# Verify org_id matches # Verify org_id matches
if s3_metadata.get("org_id") != org_id: if s3_metadata.get("org-id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}") logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch") raise HTTPException(status_code=403, detail="Organization mismatch")
# Get object info # Get object info
try: try:
object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key) object_info = client.head_object(Bucket=s3.get_bucket_name(), Key=s3_key)
except Exception as e: except Exception as e:
logger.error(f"Failed to get object info: {e}") logger.error(f"Failed to get object info: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve document") raise HTTPException(status_code=500, detail="Failed to retrieve document")
@@ -202,14 +201,14 @@ async def get_document(request: Request, document_id: str):
# Create metadata response # Create metadata response
metadata = DocumentMetadata( metadata = DocumentMetadata(
document_id=document_id, document_id=document_id,
org_id=s3_metadata.get("org_id"), org_id=s3_metadata.get("org-id"),
document_type=DocumentType(s3_metadata.get("document_type")), document_type=DocumentType(s3_metadata.get("document-type")),
filename=s3_metadata.get("filename"), filename=s3_metadata.get("filename"),
content_type=object_info.get("ContentType"), content_type=object_info.get("ContentType"),
file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))), file_size=int(s3_metadata.get("file-size", object_info.get("ContentLength", 0))),
s3_key=s3_key, s3_key=s3_key,
created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())), created_at=datetime.fromisoformat(s3_metadata.get("created-at", datetime.utcnow().isoformat())),
updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat())) updated_at=datetime.fromisoformat(s3_metadata.get("updated-at", datetime.utcnow().isoformat()))
) )
logger.info(f"Get document completed - document_id: {document_id}") logger.info(f"Get document completed - document_id: {document_id}")
@@ -227,7 +226,7 @@ async def get_download_url(request: Request, document_id: str, expires_in: int =
try: try:
response = client.list_objects_v2( response = client.list_objects_v2(
Bucket=settings.s3_bucket, Bucket=s3.get_bucket_name(),
Prefix=prefix, Prefix=prefix,
MaxKeys=1 MaxKeys=1
) )
@@ -243,7 +242,7 @@ async def get_download_url(request: Request, document_id: str, expires_in: int =
# Verify org_id matches # Verify org_id matches
s3_metadata = s3.get_file_metadata(s3_key) s3_metadata = s3.get_file_metadata(s3_key)
if s3_metadata.get("org_id") != org_id: if s3_metadata.get("org-id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}") logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch") raise HTTPException(status_code=403, detail="Organization mismatch")
@@ -265,7 +264,7 @@ async def get_document_fields(request: Request, document_id: str):
try: try:
response = client.list_objects_v2( response = client.list_objects_v2(
Bucket=settings.s3_bucket, Bucket=s3.get_bucket_name(),
Prefix=prefix, Prefix=prefix,
MaxKeys=1 MaxKeys=1
) )
@@ -283,12 +282,12 @@ async def get_document_fields(request: Request, document_id: str):
s3_metadata = s3.get_file_metadata(s3_key) s3_metadata = s3.get_file_metadata(s3_key)
# Verify org_id matches # Verify org_id matches
if s3_metadata.get("org_id") != org_id: if s3_metadata.get("org-id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}") logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch") raise HTTPException(status_code=403, detail="Organization mismatch")
# Check if PDF # Check if PDF
document_type = s3_metadata.get("document_type") document_type = s3_metadata.get("document-type")
if document_type != DocumentType.PDF.value: if document_type != DocumentType.PDF.value:
logger.error(f"Document is not PDF: {document_type}") logger.error(f"Document is not PDF: {document_type}")
raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents") raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
@@ -323,7 +322,7 @@ async def delete_document(request: Request, document_id: str):
try: try:
response = client.list_objects_v2( response = client.list_objects_v2(
Bucket=settings.s3_bucket, Bucket=s3.get_bucket_name(),
Prefix=prefix, Prefix=prefix,
MaxKeys=1 MaxKeys=1
) )
@@ -339,7 +338,7 @@ async def delete_document(request: Request, document_id: str):
# Verify org_id matches # Verify org_id matches
s3_metadata = s3.get_file_metadata(s3_key) s3_metadata = s3.get_file_metadata(s3_key)
if s3_metadata.get("org_id") != org_id: if s3_metadata.get("org-id") != org_id:
logger.error(f"Organization mismatch for document: {document_id}") logger.error(f"Organization mismatch for document: {document_id}")
raise HTTPException(status_code=403, detail="Organization mismatch") raise HTTPException(status_code=403, detail="Organization mismatch")
@@ -348,7 +347,7 @@ async def delete_document(request: Request, document_id: str):
s3.delete_file(s3_key) s3.delete_file(s3_key)
logger.info(f"Document deleted - document_id: {document_id}") logger.info(f"Document deleted - document_id: {document_id}")
except Exception as e: except Exception as e:
logger.error(f"Failed to delete document: {e}") logger.error(f"Failed to delete file: {e}")
raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}") raise HTTPException(status_code=500, detail=f"Failed to delete file: {e}")
return {"message": "Document deleted successfully"} return {"message": "Document deleted successfully"}

View File

@@ -90,10 +90,6 @@ def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict
file_content = file.file.read() file_content = file.file.read()
file.file.seek(0) file.file.seek(0)
extra_args = {"ContentType": content_type}
if metadata:
extra_args["Metadata"] = metadata
client.put_object( client.put_object(
Bucket=bucket_name, Bucket=bucket_name,
Key=s3_key, Key=s3_key,