Compare commits
15 Commits
5e6e421466
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 9a58242a91 | |||
| 1550fc7599 | |||
| c952063d7b | |||
| 2018f5a5f6 | |||
| c77845852e | |||
| 64a6bbed68 | |||
| 036d2553be | |||
| 70691ad392 | |||
| ff3d93e933 | |||
| 7e91a385f4 | |||
| ee4b1faea4 | |||
| 5ee4e847d0 | |||
| 49f2d1a97d | |||
| 3bcc38b4ae | |||
| 7d0561f2db |
12
README.md
12
README.md
@@ -16,7 +16,7 @@ Generic document management service with S3 storage and PDF field discovery.
|
|||||||
|
|
||||||
### Upload Document
|
### Upload Document
|
||||||
```
|
```
|
||||||
POST /api/documents/upload
|
POST /api/v1/documents/upload
|
||||||
Content-Type: multipart/form-data
|
Content-Type: multipart/form-data
|
||||||
Authorization: Bearer <token>
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ Response:
|
|||||||
|
|
||||||
### Rewrite Document
|
### Rewrite Document
|
||||||
```
|
```
|
||||||
PUT /api/documents/{document_id}
|
PUT /api/v1/documents/{document_id}
|
||||||
Content-Type: multipart/form-data
|
Content-Type: multipart/form-data
|
||||||
Authorization: Bearer <token>
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
@@ -52,7 +52,7 @@ Response:
|
|||||||
|
|
||||||
### Get Document Metadata
|
### Get Document Metadata
|
||||||
```
|
```
|
||||||
GET /api/documents/{document_id}
|
GET /api/v1/documents/{document_id}
|
||||||
Authorization: Bearer <token>
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
Response:
|
Response:
|
||||||
@@ -72,7 +72,7 @@ Response:
|
|||||||
|
|
||||||
### Get Download URL
|
### Get Download URL
|
||||||
```
|
```
|
||||||
GET /api/documents/{document_id}/download-url?expires_in=3600
|
GET /api/v1/documents/{document_id}/download-url?expires_in=3600
|
||||||
Authorization: Bearer <token>
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
Response:
|
Response:
|
||||||
@@ -85,7 +85,7 @@ Response:
|
|||||||
|
|
||||||
### Get PDF Fields
|
### Get PDF Fields
|
||||||
```
|
```
|
||||||
GET /api/documents/{document_id}/fields
|
GET /api/v1/documents/{document_id}/fields
|
||||||
Authorization: Bearer <token>
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
Response:
|
Response:
|
||||||
@@ -106,7 +106,7 @@ Response:
|
|||||||
|
|
||||||
### Delete Document
|
### Delete Document
|
||||||
```
|
```
|
||||||
DELETE /api/documents/{document_id}
|
DELETE /api/v1/documents/{document_id}
|
||||||
Authorization: Bearer <token>
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
Response:
|
Response:
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
# S3 settings
|
# COSI settings (preferred)
|
||||||
|
cosi_bucket_info_path: str = ""
|
||||||
|
|
||||||
|
# S3 settings (fallback)
|
||||||
s3_endpoint: str = "http://localhost:9000"
|
s3_endpoint: str = "http://localhost:9000"
|
||||||
s3_access_key: str = "minioadmin"
|
s3_access_key_id: str = "minioadmin"
|
||||||
s3_secret_key: str = "minioadmin"
|
s3_secret_key: str = "minioadmin"
|
||||||
s3_bucket: str = "document-bucket"
|
s3_bucket: str = "document-bucket"
|
||||||
s3_region: str = "us-east-1"
|
s3_region: str = "us-east-1"
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ from fastapi import FastAPI
|
|||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.openapi.utils import get_openapi
|
from fastapi.openapi.utils import get_openapi
|
||||||
from app.routers import documents
|
from app.routers import documents
|
||||||
from app.config import settings
|
|
||||||
from app.logger import setup_logging
|
from app.logger import setup_logging
|
||||||
from app.middleware.auth import AuthMiddleware
|
from app.middleware.auth import AuthMiddleware
|
||||||
|
|
||||||
@@ -13,9 +12,9 @@ app = FastAPI(
|
|||||||
title="Document Service",
|
title="Document Service",
|
||||||
version="1.0.0",
|
version="1.0.0",
|
||||||
description="Generic document management service with S3 storage and PDF field discovery",
|
description="Generic document management service with S3 storage and PDF field discovery",
|
||||||
openapi_url="/openapi3.json",
|
openapi_url="/api/openapi",
|
||||||
docs_url="/docs",
|
docs_url="/api/docs",
|
||||||
redoc_url="/redoc"
|
redoc_url="/api/redoc"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add auth middleware
|
# Add auth middleware
|
||||||
@@ -23,7 +22,7 @@ app.add_middleware(AuthMiddleware)
|
|||||||
|
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
allow_origins=["http://localhost:3000"],
|
allow_origins=["*"],
|
||||||
allow_methods=["*"],
|
allow_methods=["*"],
|
||||||
allow_headers=["*"]
|
allow_headers=["*"]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
from starlette.middleware.base import BaseHTTPMiddleware
|
from starlette.middleware.base import BaseHTTPMiddleware
|
||||||
from starlette.responses import JSONResponse
|
|
||||||
from app.logger import get_logger
|
from app.logger import get_logger
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
|
||||||
from app.enums import DocumentType
|
from app.enums import DocumentType
|
||||||
|
|
||||||
class DocumentMetadata(BaseModel):
|
class DocumentMetadata(BaseModel):
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
import os
|
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
def discover_fields(pdf_path: str) -> list[dict]:
|
def discover_fields(pdf_path: str) -> list[dict]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1 +1,3 @@
|
|||||||
from app.routers import documents
|
from app.routers import documents as documents
|
||||||
|
|
||||||
|
__all__ = ["documents"]
|
||||||
|
|||||||
@@ -1,15 +1,13 @@
|
|||||||
import os
|
import os
|
||||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Request
|
||||||
from typing import Optional
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from app import s3, pdf, utils
|
from app import s3, pdf, utils
|
||||||
from app.config import settings
|
|
||||||
from app.enums import DocumentType
|
from app.enums import DocumentType
|
||||||
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
||||||
from app.logger import get_logger
|
from app.logger import get_logger
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/documents", tags=["documents"])
|
router = APIRouter(prefix="/api/v1/documents", tags=["documents"])
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
@router.post("/upload", response_model=UploadResponse)
|
@router.post("/upload", response_model=UploadResponse)
|
||||||
@@ -47,11 +45,11 @@ async def upload_document(
|
|||||||
|
|
||||||
# Prepare metadata
|
# Prepare metadata
|
||||||
metadata_dict = {
|
metadata_dict = {
|
||||||
"org_id": org_id,
|
"org-id": org_id,
|
||||||
"document_type": document_type.value,
|
"document-type": document_type.value,
|
||||||
"filename": file.filename,
|
"filename": sanitized_filename,
|
||||||
"file_size": str(file_size),
|
"file-size": str(file_size),
|
||||||
"created_at": datetime.utcnow().isoformat()
|
"created-at": datetime.utcnow().isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
# Upload to S3
|
# Upload to S3
|
||||||
@@ -70,7 +68,7 @@ async def upload_document(
|
|||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
org_id=org_id,
|
org_id=org_id,
|
||||||
document_type=document_type,
|
document_type=document_type,
|
||||||
filename=file.filename,
|
filename=sanitized_filename,
|
||||||
content_type=detected_content_type,
|
content_type=detected_content_type,
|
||||||
file_size=file_size,
|
file_size=file_size,
|
||||||
s3_key=s3_key,
|
s3_key=s3_key,
|
||||||
@@ -119,17 +117,17 @@ async def rewrite_document(
|
|||||||
|
|
||||||
# Verify org_id matches
|
# Verify org_id matches
|
||||||
existing_metadata = s3.get_file_metadata(s3_key)
|
existing_metadata = s3.get_file_metadata(s3_key)
|
||||||
if existing_metadata.get("org_id") != org_id:
|
if existing_metadata.get("org-id") != org_id:
|
||||||
logger.error(f"Organization mismatch for document: {document_id}")
|
logger.error(f"Organization mismatch for document: {document_id}")
|
||||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||||
|
|
||||||
# Prepare metadata
|
# Prepare metadata
|
||||||
metadata_dict = {
|
metadata_dict = {
|
||||||
"org_id": org_id,
|
"org-id": org_id,
|
||||||
"document_type": document_type.value,
|
"document-type": document_type.value,
|
||||||
"filename": file.filename,
|
"filename": sanitized_filename,
|
||||||
"file_size": str(file_size),
|
"file-size": str(file_size),
|
||||||
"updated_at": datetime.utcnow().isoformat()
|
"updated-at": datetime.utcnow().isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
# Upload to S3 (overwrites existing)
|
# Upload to S3 (overwrites existing)
|
||||||
@@ -148,12 +146,12 @@ async def rewrite_document(
|
|||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
org_id=org_id,
|
org_id=org_id,
|
||||||
document_type=document_type,
|
document_type=document_type,
|
||||||
filename=file.filename,
|
filename=sanitized_filename,
|
||||||
content_type=detected_content_type,
|
content_type=detected_content_type,
|
||||||
file_size=file_size,
|
file_size=file_size,
|
||||||
s3_key=s3_key,
|
s3_key=s3_key,
|
||||||
created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
|
created_at=datetime.fromisoformat(existing_metadata.get("created-at", datetime.utcnow().isoformat())),
|
||||||
updated_at=datetime.utcnow()
|
updated_at=datetime.utcnow()
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"Rewrite completed - document_id: {document_id}")
|
logger.info(f"Rewrite completed - document_id: {document_id}")
|
||||||
@@ -171,7 +169,7 @@ async def get_document(request: Request, document_id: str):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
response = client.list_objects_v2(
|
response = client.list_objects_v2(
|
||||||
Bucket=settings.s3_bucket,
|
Bucket=s3.get_bucket_name(),
|
||||||
Prefix=prefix,
|
Prefix=prefix,
|
||||||
MaxKeys=1
|
MaxKeys=1
|
||||||
)
|
)
|
||||||
@@ -189,13 +187,13 @@ async def get_document(request: Request, document_id: str):
|
|||||||
s3_metadata = s3.get_file_metadata(s3_key)
|
s3_metadata = s3.get_file_metadata(s3_key)
|
||||||
|
|
||||||
# Verify org_id matches
|
# Verify org_id matches
|
||||||
if s3_metadata.get("org_id") != org_id:
|
if s3_metadata.get("org-id") != org_id:
|
||||||
logger.error(f"Organization mismatch for document: {document_id}")
|
logger.error(f"Organization mismatch for document: {document_id}")
|
||||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||||
|
|
||||||
# Get object info
|
# Get object info
|
||||||
try:
|
try:
|
||||||
object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
|
object_info = client.head_object(Bucket=s3.get_bucket_name(), Key=s3_key)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to get object info: {e}")
|
logger.error(f"Failed to get object info: {e}")
|
||||||
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
||||||
@@ -203,14 +201,14 @@ async def get_document(request: Request, document_id: str):
|
|||||||
# Create metadata response
|
# Create metadata response
|
||||||
metadata = DocumentMetadata(
|
metadata = DocumentMetadata(
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
org_id=s3_metadata.get("org_id"),
|
org_id=s3_metadata.get("org-id"),
|
||||||
document_type=DocumentType(s3_metadata.get("document_type")),
|
document_type=DocumentType(s3_metadata.get("document-type")),
|
||||||
filename=s3_metadata.get("filename"),
|
filename=s3_metadata.get("filename"),
|
||||||
content_type=object_info.get("ContentType"),
|
content_type=object_info.get("ContentType"),
|
||||||
file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))),
|
file_size=int(s3_metadata.get("file-size", object_info.get("ContentLength", 0))),
|
||||||
s3_key=s3_key,
|
s3_key=s3_key,
|
||||||
created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
|
created_at=datetime.fromisoformat(s3_metadata.get("created-at", datetime.utcnow().isoformat())),
|
||||||
updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
|
updated_at=datetime.fromisoformat(s3_metadata.get("updated-at", datetime.utcnow().isoformat()))
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"Get document completed - document_id: {document_id}")
|
logger.info(f"Get document completed - document_id: {document_id}")
|
||||||
@@ -228,7 +226,7 @@ async def get_download_url(request: Request, document_id: str, expires_in: int =
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
response = client.list_objects_v2(
|
response = client.list_objects_v2(
|
||||||
Bucket=settings.s3_bucket,
|
Bucket=s3.get_bucket_name(),
|
||||||
Prefix=prefix,
|
Prefix=prefix,
|
||||||
MaxKeys=1
|
MaxKeys=1
|
||||||
)
|
)
|
||||||
@@ -244,7 +242,7 @@ async def get_download_url(request: Request, document_id: str, expires_in: int =
|
|||||||
|
|
||||||
# Verify org_id matches
|
# Verify org_id matches
|
||||||
s3_metadata = s3.get_file_metadata(s3_key)
|
s3_metadata = s3.get_file_metadata(s3_key)
|
||||||
if s3_metadata.get("org_id") != org_id:
|
if s3_metadata.get("org-id") != org_id:
|
||||||
logger.error(f"Organization mismatch for document: {document_id}")
|
logger.error(f"Organization mismatch for document: {document_id}")
|
||||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||||
|
|
||||||
@@ -266,7 +264,7 @@ async def get_document_fields(request: Request, document_id: str):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
response = client.list_objects_v2(
|
response = client.list_objects_v2(
|
||||||
Bucket=settings.s3_bucket,
|
Bucket=s3.get_bucket_name(),
|
||||||
Prefix=prefix,
|
Prefix=prefix,
|
||||||
MaxKeys=1
|
MaxKeys=1
|
||||||
)
|
)
|
||||||
@@ -284,12 +282,12 @@ async def get_document_fields(request: Request, document_id: str):
|
|||||||
s3_metadata = s3.get_file_metadata(s3_key)
|
s3_metadata = s3.get_file_metadata(s3_key)
|
||||||
|
|
||||||
# Verify org_id matches
|
# Verify org_id matches
|
||||||
if s3_metadata.get("org_id") != org_id:
|
if s3_metadata.get("org-id") != org_id:
|
||||||
logger.error(f"Organization mismatch for document: {document_id}")
|
logger.error(f"Organization mismatch for document: {document_id}")
|
||||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||||
|
|
||||||
# Check if PDF
|
# Check if PDF
|
||||||
document_type = s3_metadata.get("document_type")
|
document_type = s3_metadata.get("document-type")
|
||||||
if document_type != DocumentType.PDF.value:
|
if document_type != DocumentType.PDF.value:
|
||||||
logger.error(f"Document is not PDF: {document_type}")
|
logger.error(f"Document is not PDF: {document_type}")
|
||||||
raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
|
raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
|
||||||
@@ -324,7 +322,7 @@ async def delete_document(request: Request, document_id: str):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
response = client.list_objects_v2(
|
response = client.list_objects_v2(
|
||||||
Bucket=settings.s3_bucket,
|
Bucket=s3.get_bucket_name(),
|
||||||
Prefix=prefix,
|
Prefix=prefix,
|
||||||
MaxKeys=1
|
MaxKeys=1
|
||||||
)
|
)
|
||||||
@@ -340,7 +338,7 @@ async def delete_document(request: Request, document_id: str):
|
|||||||
|
|
||||||
# Verify org_id matches
|
# Verify org_id matches
|
||||||
s3_metadata = s3.get_file_metadata(s3_key)
|
s3_metadata = s3.get_file_metadata(s3_key)
|
||||||
if s3_metadata.get("org_id") != org_id:
|
if s3_metadata.get("org-id") != org_id:
|
||||||
logger.error(f"Organization mismatch for document: {document_id}")
|
logger.error(f"Organization mismatch for document: {document_id}")
|
||||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||||
|
|
||||||
@@ -349,7 +347,7 @@ async def delete_document(request: Request, document_id: str):
|
|||||||
s3.delete_file(s3_key)
|
s3.delete_file(s3_key)
|
||||||
logger.info(f"Document deleted - document_id: {document_id}")
|
logger.info(f"Document deleted - document_id: {document_id}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to delete document: {e}")
|
logger.error(f"Failed to delete file: {e}")
|
||||||
raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")
|
raise HTTPException(status_code=500, detail=f"Failed to delete file: {e}")
|
||||||
|
|
||||||
return {"message": "Document deleted successfully"}
|
return {"message": "Document deleted successfully"}
|
||||||
|
|||||||
109
app/s3.py
109
app/s3.py
@@ -1,6 +1,7 @@
|
|||||||
import boto3
|
import boto3
|
||||||
import tempfile
|
import json
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
from botocore.client import Config
|
from botocore.client import Config
|
||||||
from fastapi import UploadFile
|
from fastapi import UploadFile
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
@@ -8,94 +9,136 @@ from app.logger import get_logger
|
|||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def read_bucket_info() -> dict:
|
||||||
|
if not settings.cosi_bucket_info_path:
|
||||||
|
raise ValueError("COSI_BUCKET_INFO_PATH not set")
|
||||||
|
with open(settings.cosi_bucket_info_path, "r") as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cosi_s3_config() -> dict:
|
||||||
|
bucket_info = read_bucket_info()
|
||||||
|
s3_conf = bucket_info["spec"]["secretS3"]
|
||||||
|
return {
|
||||||
|
"endpoint": s3_conf["endpoint"],
|
||||||
|
"access_key": s3_conf["accessKeyID"],
|
||||||
|
"secret_key": s3_conf["accessSecretKey"],
|
||||||
|
"bucket": bucket_info["spec"]["bucketName"]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_client():
|
def get_client():
|
||||||
|
if settings.cosi_bucket_info_path:
|
||||||
|
cosi_config = get_cosi_s3_config()
|
||||||
|
return boto3.client(
|
||||||
|
"s3",
|
||||||
|
endpoint_url=cosi_config["endpoint"],
|
||||||
|
aws_access_key_id=cosi_config["access_key"],
|
||||||
|
aws_secret_access_key=cosi_config["secret_key"],
|
||||||
|
config=Config(signature_version="s3v4"),
|
||||||
|
region_name="us-east-1"
|
||||||
|
)
|
||||||
return boto3.client(
|
return boto3.client(
|
||||||
"s3",
|
"s3",
|
||||||
endpoint_url=settings.s3_endpoint,
|
endpoint_url=settings.s3_endpoint,
|
||||||
aws_access_key_id=settings.s3_access_key,
|
aws_access_key_id=settings.s3_access_key_id,
|
||||||
aws_secret_access_key=settings.s3_secret_key,
|
aws_secret_access_key=settings.s3_secret_key,
|
||||||
config=Config(signature_version="s3v4"),
|
config=Config(signature_version="s3v4"),
|
||||||
region_name=settings.s3_region
|
region_name=settings.s3_region
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_bucket_name() -> str:
|
||||||
|
if settings.cosi_bucket_info_path:
|
||||||
|
return get_cosi_s3_config()["bucket"]
|
||||||
|
return settings.s3_bucket
|
||||||
|
|
||||||
|
|
||||||
def ensure_bucket_exists() -> None:
|
def ensure_bucket_exists() -> None:
|
||||||
"""Ensure the S3 bucket exists, create it if it doesn't exist.
|
bucket_name = get_bucket_name()
|
||||||
|
|
||||||
Raises:
|
|
||||||
Exception: If bucket creation fails (service will fail to start)
|
|
||||||
"""
|
|
||||||
client = get_client()
|
client = get_client()
|
||||||
try:
|
try:
|
||||||
client.head_bucket(Bucket=settings.s3_bucket)
|
client.head_bucket(Bucket=bucket_name)
|
||||||
logger.info(f"Bucket '{settings.s3_bucket}' already exists")
|
logger.info(f"Bucket '{bucket_name}' already exists")
|
||||||
except client.exceptions.ClientError as e:
|
except client.exceptions.ClientError as e:
|
||||||
error_code = e.response['Error']['Code']
|
error_code = e.response['Error']['Code']
|
||||||
if error_code == '404':
|
if error_code == '404':
|
||||||
try:
|
try:
|
||||||
client.create_bucket(
|
client.create_bucket(
|
||||||
Bucket=settings.s3_bucket,
|
Bucket=bucket_name,
|
||||||
CreateBucketConfiguration={
|
CreateBucketConfiguration={
|
||||||
'LocationConstraint': settings.s3_region
|
'LocationConstraint': 'us-east-1'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
logger.info(f"Created bucket '{settings.s3_bucket}'")
|
logger.info(f"Created bucket '{bucket_name}'")
|
||||||
except Exception as create_error:
|
except Exception as create_error:
|
||||||
logger.error(f"Failed to create bucket '{settings.s3_bucket}': {create_error}")
|
logger.error(f"Failed to create bucket '{bucket_name}': {create_error}")
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
logger.error(f"Error checking bucket: {e}")
|
logger.error(f"Error checking bucket: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict = None) -> str:
|
def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict = None) -> str:
|
||||||
"""Upload file to S3 with metadata"""
|
bucket_name = get_bucket_name()
|
||||||
client = get_client()
|
client = get_client()
|
||||||
|
|
||||||
extra_args = {"ContentType": content_type}
|
file.file.seek(0, os.SEEK_END)
|
||||||
if metadata:
|
file_size = file.file.tell()
|
||||||
extra_args["Metadata"] = metadata
|
file.file.seek(0)
|
||||||
|
file_content = file.file.read()
|
||||||
|
file.file.seek(0)
|
||||||
|
|
||||||
client.upload_fileobj(
|
client.put_object(
|
||||||
file.file,
|
Bucket=bucket_name,
|
||||||
settings.s3_bucket,
|
Key=s3_key,
|
||||||
s3_key,
|
Body=file_content,
|
||||||
ExtraArgs=extra_args
|
ContentLength=file_size,
|
||||||
|
ContentType=content_type,
|
||||||
|
Metadata=metadata
|
||||||
)
|
)
|
||||||
return s3_key
|
return s3_key
|
||||||
|
|
||||||
|
|
||||||
def delete_file(s3_key: str) -> None:
|
def delete_file(s3_key: str) -> None:
|
||||||
"""Delete file from S3"""
|
bucket_name = get_bucket_name()
|
||||||
client = get_client()
|
client = get_client()
|
||||||
client.delete_object(Bucket=settings.s3_bucket, Key=s3_key)
|
client.delete_object(Bucket=bucket_name, Key=s3_key)
|
||||||
|
|
||||||
|
|
||||||
def file_exists(s3_key: str) -> bool:
|
def file_exists(s3_key: str) -> bool:
|
||||||
"""Check if file exists in S3"""
|
bucket_name = get_bucket_name()
|
||||||
client = get_client()
|
client = get_client()
|
||||||
try:
|
try:
|
||||||
client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
|
client.head_object(Bucket=bucket_name, Key=s3_key)
|
||||||
return True
|
return True
|
||||||
except client.exceptions.ClientError:
|
except client.exceptions.ClientError:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_file_metadata(s3_key: str) -> dict:
|
def get_file_metadata(s3_key: str) -> dict:
|
||||||
"""Get file metadata from S3"""
|
bucket_name = get_bucket_name()
|
||||||
client = get_client()
|
client = get_client()
|
||||||
response = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
|
response = client.head_object(Bucket=bucket_name, Key=s3_key)
|
||||||
return response.get("Metadata", {})
|
return response.get("Metadata", {})
|
||||||
|
|
||||||
|
|
||||||
def download_to_temp(s3_key: str) -> str:
|
def download_to_temp(s3_key: str) -> str:
|
||||||
"""Download file from S3 to temp file"""
|
bucket_name = get_bucket_name()
|
||||||
client = get_client()
|
client = get_client()
|
||||||
suffix = os.path.splitext(s3_key)[-1] or ".tmp"
|
suffix = os.path.splitext(s3_key)[-1] or ".tmp"
|
||||||
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
||||||
client.download_fileobj(settings.s3_bucket, s3_key, tmp)
|
client.download_fileobj(bucket_name, s3_key, tmp)
|
||||||
tmp.close()
|
tmp.close()
|
||||||
return tmp.name
|
return tmp.name
|
||||||
|
|
||||||
|
|
||||||
def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
|
def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
|
||||||
"""Generate presigned download URL"""
|
bucket_name = get_bucket_name()
|
||||||
client = get_client()
|
client = get_client()
|
||||||
return client.generate_presigned_url(
|
return client.generate_presigned_url(
|
||||||
"get_object",
|
"get_object",
|
||||||
Params={"Bucket": settings.s3_bucket, "Key": s3_key},
|
Params={"Bucket": bucket_name, "Key": s3_key},
|
||||||
ExpiresIn=expires_in
|
ExpiresIn=expires_in
|
||||||
)
|
)
|
||||||
@@ -58,9 +58,6 @@ def document_s3_key(org_id: str, document_id: str, filename: str) -> str:
|
|||||||
return f"{s3_path_prefix(org_id, document_id)}{filename}"
|
return f"{s3_path_prefix(org_id, document_id)}{filename}"
|
||||||
|
|
||||||
def sanitize_filename(filename: str) -> str:
|
def sanitize_filename(filename: str) -> str:
|
||||||
"""Sanitize filename for S3"""
|
"""URL encode filename for S3"""
|
||||||
# Remove path separators and special characters
|
from urllib.parse import quote
|
||||||
filename = filename.replace("/", "_").replace("\\", "_")
|
return quote(filename, safe="")
|
||||||
# Keep only safe characters
|
|
||||||
safe_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_")
|
|
||||||
return "".join(c for c in filename if c in safe_chars)
|
|
||||||
|
|||||||
@@ -11,22 +11,8 @@ controllers:
|
|||||||
env:
|
env:
|
||||||
LOG_LEVEL: info
|
LOG_LEVEL: info
|
||||||
PORT: "8082"
|
PORT: "8082"
|
||||||
S3_ENDPOINT:
|
COSI_BUCKET_INFO_PATH:
|
||||||
value: "https://dev.s3.corredorconect.com/"
|
value: "/var/run/secrets/cosi/BucketInfo"
|
||||||
S3_ACCESS_KEY:
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: 'document-service-s3-credentials'
|
|
||||||
key: rootAccessKey
|
|
||||||
S3_SECRET_KEY:
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: 'document-service-s3-credentials'
|
|
||||||
key: rootSecretAccessKey
|
|
||||||
S3_BUCKET:
|
|
||||||
value: "document-bucket"
|
|
||||||
S3_REGION:
|
|
||||||
value: "us-east-1"
|
|
||||||
probes:
|
probes:
|
||||||
liveness:
|
liveness:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -46,13 +32,6 @@ controllers:
|
|||||||
port: 8082
|
port: 8082
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
service:
|
service:
|
||||||
main:
|
main:
|
||||||
@@ -63,3 +42,36 @@ service:
|
|||||||
port: 8082
|
port: 8082
|
||||||
protocol: HTTP
|
protocol: HTTP
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
cosi-bucket-info:
|
||||||
|
enabled: true
|
||||||
|
type: secret
|
||||||
|
name: document-service-s3-credentials
|
||||||
|
globalMounts:
|
||||||
|
- path: /var/run/secrets/cosi
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
rawResources:
|
||||||
|
bucket:
|
||||||
|
enabled: true
|
||||||
|
apiVersion: objectstorage.k8s.io/v1alpha1
|
||||||
|
kind: BucketClaim
|
||||||
|
suffix: bucket
|
||||||
|
spec:
|
||||||
|
spec:
|
||||||
|
bucketClassName: seaweedfs
|
||||||
|
protocols:
|
||||||
|
- s3
|
||||||
|
|
||||||
|
bucket-access:
|
||||||
|
enabled: true
|
||||||
|
apiVersion: objectstorage.k8s.io/v1alpha1
|
||||||
|
kind: BucketAccess
|
||||||
|
suffix: bucket-access
|
||||||
|
spec:
|
||||||
|
spec:
|
||||||
|
bucketAccessClassName: seaweedfs
|
||||||
|
bucketClaimName: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-bucket'
|
||||||
|
credentialsSecretName: document-service-s3-credentials
|
||||||
|
protocol: s3
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ class TestDocumentUpload:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -61,7 +61,7 @@ class TestDocumentUpload:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -80,7 +80,7 @@ class TestDocumentUpload:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -97,7 +97,7 @@ class TestDocumentUpload:
|
|||||||
data = {"org_id": "test-org-123"}
|
data = {"org_id": "test-org-123"}
|
||||||
|
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data
|
data=data
|
||||||
)
|
)
|
||||||
@@ -113,7 +113,7 @@ class TestDocumentUpload:
|
|||||||
headers = {"Authorization": "Invalid token"}
|
headers = {"Authorization": "Invalid token"}
|
||||||
|
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -127,7 +127,7 @@ class TestDocumentUpload:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -145,7 +145,7 @@ class TestDocumentMetadata:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.get(
|
response = test_client.get(
|
||||||
"/api/documents/test-doc-456",
|
"/api/v1/documents/test-doc-456",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -155,7 +155,7 @@ class TestDocumentMetadata:
|
|||||||
|
|
||||||
def test_get_document_without_auth_returns_401(self, test_client):
|
def test_get_document_without_auth_returns_401(self, test_client):
|
||||||
"""Test getting document without auth returns 401."""
|
"""Test getting document without auth returns 401."""
|
||||||
response = test_client.get("/api/documents/test-doc-456")
|
response = test_client.get("/api/v1/documents/test-doc-456")
|
||||||
|
|
||||||
assert response.status_code == 401
|
assert response.status_code == 401
|
||||||
|
|
||||||
@@ -168,7 +168,7 @@ class TestDownloadUrl:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.get(
|
response = test_client.get(
|
||||||
"/api/documents/test-doc-456/download-url",
|
"/api/v1/documents/test-doc-456/download-url",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -178,7 +178,7 @@ class TestDownloadUrl:
|
|||||||
|
|
||||||
def test_get_download_url_without_auth_returns_401(self, test_client):
|
def test_get_download_url_without_auth_returns_401(self, test_client):
|
||||||
"""Test getting download URL without auth returns 401."""
|
"""Test getting download URL without auth returns 401."""
|
||||||
response = test_client.get("/api/documents/test-doc-456/download-url")
|
response = test_client.get("/api/v1/documents/test-doc-456/download-url")
|
||||||
|
|
||||||
assert response.status_code == 401
|
assert response.status_code == 401
|
||||||
|
|
||||||
@@ -195,7 +195,7 @@ class TestPDFFieldDiscovery:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
upload_response = test_client.post(
|
upload_response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -207,7 +207,7 @@ class TestPDFFieldDiscovery:
|
|||||||
# Get fields
|
# Get fields
|
||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
response = test_client.get(
|
response = test_client.get(
|
||||||
f"/api/documents/{document_id}/fields",
|
f"/api/v1/documents/{document_id}/fields",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -235,7 +235,7 @@ class TestPDFFieldDiscovery:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
upload_response = test_client.post(
|
upload_response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -247,7 +247,7 @@ class TestPDFFieldDiscovery:
|
|||||||
# Get fields
|
# Get fields
|
||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
response = test_client.get(
|
response = test_client.get(
|
||||||
f"/api/documents/{document_id}/fields",
|
f"/api/v1/documents/{document_id}/fields",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -266,7 +266,7 @@ class TestPDFFieldDiscovery:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
upload_response = test_client.post(
|
upload_response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -278,7 +278,7 @@ class TestPDFFieldDiscovery:
|
|||||||
# Get fields
|
# Get fields
|
||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
response = test_client.get(
|
response = test_client.get(
|
||||||
f"/api/documents/{document_id}/fields",
|
f"/api/v1/documents/{document_id}/fields",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -290,7 +290,7 @@ class TestPDFFieldDiscovery:
|
|||||||
|
|
||||||
def test_get_pdf_fields_without_auth_returns_401(self, test_client):
|
def test_get_pdf_fields_without_auth_returns_401(self, test_client):
|
||||||
"""Test getting PDF fields without auth returns 401."""
|
"""Test getting PDF fields without auth returns 401."""
|
||||||
response = test_client.get("/api/documents/test-doc-456/fields")
|
response = test_client.get("/api/v1/documents/test-doc-456/fields")
|
||||||
|
|
||||||
assert response.status_code == 401
|
assert response.status_code == 401
|
||||||
|
|
||||||
@@ -303,7 +303,7 @@ class TestDocumentDeletion:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
response = test_client.delete(
|
response = test_client.delete(
|
||||||
"/api/documents/test-doc-456",
|
"/api/v1/documents/test-doc-456",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
@@ -313,7 +313,7 @@ class TestDocumentDeletion:
|
|||||||
|
|
||||||
def test_delete_document_without_auth_returns_401(self, test_client):
|
def test_delete_document_without_auth_returns_401(self, test_client):
|
||||||
"""Test deleting document without auth returns 401."""
|
"""Test deleting document without auth returns 401."""
|
||||||
response = test_client.delete("/api/documents/test-doc-456")
|
response = test_client.delete("/api/v1/documents/test-doc-456")
|
||||||
|
|
||||||
assert response.status_code == 401
|
assert response.status_code == 401
|
||||||
|
|
||||||
@@ -418,7 +418,7 @@ class TestCompleteWorkflow:
|
|||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
upload_response = test_client.post(
|
upload_response = test_client.post(
|
||||||
"/api/documents/upload",
|
"/api/v1/documents/upload",
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
headers=headers
|
headers=headers
|
||||||
@@ -430,28 +430,28 @@ class TestCompleteWorkflow:
|
|||||||
# Get metadata
|
# Get metadata
|
||||||
headers = {"Authorization": sample_auth_token}
|
headers = {"Authorization": sample_auth_token}
|
||||||
metadata_response = test_client.get(
|
metadata_response = test_client.get(
|
||||||
f"/api/documents/{document_id}",
|
f"/api/v1/documents/{document_id}",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get fields
|
# Get fields
|
||||||
fields_response = test_client.get(
|
fields_response = test_client.get(
|
||||||
f"/api/documents/{document_id}/fields",
|
f"/api/v1/documents/{document_id}/fields",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get download URL
|
# Get download URL
|
||||||
download_response = test_client.get(
|
download_response = test_client.get(
|
||||||
f"/api/documents/{document_id}/download-url",
|
f"/api/v1/documents/{document_id}/download-url",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
|
|
||||||
# Delete document
|
# Delete document
|
||||||
delete_response = test_client.delete(
|
delete_response = test_client.delete(
|
||||||
f"/api/documents/{document_id}",
|
f"/api/v1/documents/{document_id}",
|
||||||
params={"org_id": "test-org-123"},
|
params={"org_id": "test-org-123"},
|
||||||
headers=headers
|
headers=headers
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user