Initial commit of document-service

This commit is contained in:
2026-04-23 16:20:58 -05:00
commit 51d60f0032
30 changed files with 4357 additions and 0 deletions

464
tests/test_documents.py Normal file
View File

@@ -0,0 +1,464 @@
"""
Comprehensive test suite for document-service.
Tests document upload, retrieval, field discovery, and complete workflows.
"""
import pytest
import os
from fastapi.testclient import TestClient
from unittest.mock import Mock, patch
from moto import mock_aws
import boto3
from app.main import app
from app.pdf import discover_fields
# Test data paths: the "fixtures" directory that sits next to this test module.
# Built from __file__ so the path does not depend on the current working directory.
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
class TestHealthEndpoint:
    """Liveness checks for the ``/health`` route."""

    def test_health_endpoint(self, test_client):
        """``GET /health`` must answer 200 with a ``{"status": "ok"}`` body."""
        health = test_client.get("/health")

        assert health.status_code == 200

        expected_body = {"status": "ok"}
        assert health.json() == expected_body
class TestDocumentUpload:
    """Test document upload functionality.

    Every test posts a multipart form to ``/api/documents/upload`` with the
    organisation id ``test-org-123``; the tests differ only in which fixture
    PDF is sent and which ``Authorization`` header (if any) is supplied.
    """

    @staticmethod
    def _post_upload(client, pdf_path, filename, auth=None):
        """POST the PDF at *pdf_path* to the upload endpoint and return the response.

        Args:
            client: Test client used to issue the request.
            pdf_path: Path of the PDF file to send.
            filename: File name reported in the multipart ``file`` part.
            auth: Value for the ``Authorization`` header. When ``None`` no
                ``headers`` argument is passed at all, so the request is
                sent exactly as an unauthenticated caller would send it.

        Returns:
            The response object returned by ``client.post``.
        """
        request_kwargs = {"data": {"org_id": "test-org-123"}}
        if auth is not None:
            request_kwargs["headers"] = {"Authorization": auth}

        # The file handle must stay open while the request is sent, so the
        # POST is issued inside the ``with`` block.
        with open(pdf_path, "rb") as pdf:
            request_kwargs["files"] = {"file": (filename, pdf, "application/pdf")}
            return client.post("/api/documents/upload", **request_kwargs)

    def test_upload_simple_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a simple PDF with form fields."""
        response = self._post_upload(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            auth=sample_auth_token,
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert "metadata" in result
        assert "download_url" in result
        assert result["metadata"]["document_type"] == "pdf"
        assert result["metadata"]["filename"] == "simple_form.pdf"

    def test_upload_complex_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a complex PDF with multiple field types."""
        response = self._post_upload(
            test_client,
            test_pdf_files["complex_form"],
            "complex_form.pdf",
            auth=sample_auth_token,
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert result["metadata"]["document_type"] == "pdf"

    def test_upload_no_form_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a PDF without form fields."""
        response = self._post_upload(
            test_client,
            test_pdf_files["no_form"],
            "no_form.pdf",
            auth=sample_auth_token,
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result

    def test_upload_without_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload without auth returns 401."""
        response = self._post_upload(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
        )

        assert response.status_code == 401
        assert "detail" in response.json()

    def test_upload_with_invalid_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload with invalid auth returns 401."""
        response = self._post_upload(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            auth="Invalid token",
        )

        assert response.status_code == 401

    def test_upload_missing_file_returns_400(self, test_client, sample_auth_token):
        """Test upload without a file is rejected with 422.

        The method name still mentions 400, but the asserted status is 422:
        the status FastAPI uses for request validation errors.
        """
        response = test_client.post(
            "/api/documents/upload",
            data={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 422  # FastAPI validation error
class TestDocumentMetadata:
    """Checks on the ``GET /api/documents/{id}`` metadata route."""

    def test_get_document_metadata_success(self, test_client, sample_auth_token):
        """An authenticated metadata lookup reaches the endpoint.

        No document is uploaded beforehand, so only the endpoint structure is
        exercised: the lookup of ``test-doc-456`` is expected to come back as
        404 or 403.
        """
        metadata_response = test_client.get(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        # The document does not exist; either status shows the route answered.
        acceptable_statuses = (404, 403)
        assert metadata_response.status_code in acceptable_statuses

    def test_get_document_without_auth_returns_401(self, test_client):
        """A metadata lookup without an ``Authorization`` header yields 401."""
        unauthenticated = test_client.get("/api/documents/test-doc-456")

        assert unauthenticated.status_code == 401
class TestDownloadUrl:
    """Checks on the ``GET /api/documents/{id}/download-url`` route."""

    def test_get_download_url_success(self, test_client, sample_auth_token):
        """An authenticated download-URL request reaches the endpoint.

        ``test-doc-456`` is never uploaded, so the request is expected to
        come back as 404 or 403.
        """
        url_response = test_client.get(
            "/api/documents/test-doc-456/download-url",
            params={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        # The document does not exist; either status shows the route answered.
        acceptable_statuses = (404, 403)
        assert url_response.status_code in acceptable_statuses

    def test_get_download_url_without_auth_returns_401(self, test_client):
        """A download-URL request without an ``Authorization`` header yields 401."""
        unauthenticated = test_client.get("/api/documents/test-doc-456/download-url")

        assert unauthenticated.status_code == 401
class TestPDFFieldDiscovery:
    """Test PDF field discovery through the HTTP API.

    Each API test first uploads a fixture PDF and then queries
    ``/api/documents/{document_id}/fields`` for the uploaded document.
    """

    @staticmethod
    def _upload_or_skip(client, pdf_path, filename, auth):
        """Upload a fixture PDF and return the ``document_id`` from the response.

        If the upload does not answer 201 the calling test is skipped. The
        field assertions cannot run without an uploaded document, and a skip
        keeps that visible in the test report instead of letting the test
        pass without checking anything.

        Args:
            client: Test client used to issue the request.
            pdf_path: Path of the PDF file to send.
            filename: File name reported in the multipart ``file`` part.
            auth: Value for the ``Authorization`` header.
        """
        # The file handle must stay open while the request is sent.
        with open(pdf_path, "rb") as pdf:
            upload_response = client.post(
                "/api/documents/upload",
                files={"file": (filename, pdf, "application/pdf")},
                data={"org_id": "test-org-123"},
                headers={"Authorization": auth},
            )

        if upload_response.status_code != 201:
            pytest.skip(
                f"upload returned {upload_response.status_code}; "
                "field discovery could not be exercised"
            )
        return upload_response.json()["document_id"]

    @staticmethod
    def _fetch_fields(client, document_id, auth):
        """GET the fields of *document_id*, assert a 200 reply, and return the list.

        Asserts that the response status is 200 and that the JSON body has a
        ``"fields"`` key, then returns the value stored under that key.
        """
        response = client.get(
            f"/api/documents/{document_id}/fields",
            params={"org_id": "test-org-123"},
            headers={"Authorization": auth},
        )
        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        return result["fields"]

    def test_get_pdf_fields_simple_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from simple form."""
        document_id = self._upload_or_skip(
            test_client, test_pdf_files["simple_form"], "simple_form.pdf", sample_auth_token
        )
        fields = self._fetch_fields(test_client, document_id, sample_auth_token)

        assert len(fields) == 6  # name, email, phone, country, birth_date, agree_terms

        # Check field types
        field_types = {f["field"]: f["type"] for f in fields}
        assert field_types["name"] == "string"
        assert field_types["email"] == "string"
        assert field_types["phone"] == "string"
        assert field_types["country"] == "select"
        assert field_types["birth_date"] == "date"
        assert field_types["agree_terms"] == "boolean"

    def test_get_pdf_fields_complex_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from complex form."""
        document_id = self._upload_or_skip(
            test_client, test_pdf_files["complex_form"], "complex_form.pdf", sample_auth_token
        )
        fields = self._fetch_fields(test_client, document_id, sample_auth_token)

        assert len(fields) == 16  # All fields from complex form

    def test_get_pdf_fields_no_form_returns_empty_list(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from PDF without form fields."""
        document_id = self._upload_or_skip(
            test_client, test_pdf_files["no_form"], "no_form.pdf", sample_auth_token
        )
        fields = self._fetch_fields(test_client, document_id, sample_auth_token)

        assert len(fields) == 0

    def test_get_pdf_fields_without_auth_returns_401(self, test_client):
        """Test getting PDF fields without auth returns 401."""
        response = test_client.get("/api/documents/test-doc-456/fields")
        assert response.status_code == 401
class TestDocumentDeletion:
    """Checks on the ``DELETE /api/documents/{id}`` route."""

    def test_delete_document_success(self, test_client, sample_auth_token):
        """An authenticated delete request reaches the endpoint.

        ``test-doc-456`` is never uploaded, so the request is expected to
        come back as 404 or 403.
        """
        deletion = test_client.delete(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        # The document does not exist; either status shows the route answered.
        acceptable_statuses = (404, 403)
        assert deletion.status_code in acceptable_statuses

    def test_delete_document_without_auth_returns_401(self, test_client):
        """A delete request without an ``Authorization`` header yields 401."""
        unauthenticated = test_client.delete("/api/documents/test-doc-456")

        assert unauthenticated.status_code == 401
class TestPDFFieldDiscoveryDirect:
    """Exercise ``discover_fields`` on the fixture PDFs, bypassing the HTTP API."""

    def test_simple_form_pdf_fields(self, test_pdf_files):
        """The simple form yields six fields with the expected names and types."""
        discovered = discover_fields(test_pdf_files["simple_form"])
        assert len(discovered) == 6

        names = [entry["field"] for entry in discovered]
        for wanted in ("name", "email", "phone", "country", "birth_date", "agree_terms"):
            assert wanted in names

        # Field name -> expected discovered type.
        expected_types = {
            "name": "string",
            "email": "string",
            "phone": "string",
            "country": "select",
            "birth_date": "date",
            "agree_terms": "boolean",
        }
        types_by_name = {entry["field"]: entry["type"] for entry in discovered}
        for field_name, expected in expected_types.items():
            assert types_by_name[field_name] == expected

    def test_complex_form_pdf_fields(self, test_pdf_files):
        """The complex form yields sixteen fields; a sample of them is checked."""
        discovered = discover_fields(test_pdf_files["complex_form"])
        assert len(discovered) == 16

        names = [entry["field"] for entry in discovered]
        for wanted in (
            "first_name",
            "last_name",
            "email",
            "country",
            "gender",
            "agree_terms",
            "signature",
        ):
            assert wanted in names

        # Field name -> expected discovered type.
        expected_types = {
            "first_name": "string",
            "country": "select",
            "gender": "boolean",
            "agree_terms": "boolean",
            "signature": "string",
        }
        types_by_name = {entry["field"]: entry["type"] for entry in discovered}
        for field_name, expected in expected_types.items():
            assert types_by_name[field_name] == expected

    def test_no_form_pdf_fields(self, test_pdf_files):
        """A PDF without form fields yields an empty result."""
        discovered = discover_fields(test_pdf_files["no_form"])
        assert len(discovered) == 0

    def test_large_form_pdf_fields(self, test_pdf_files):
        """The ``large_form`` fixture (a large PDF without form fields) yields nothing."""
        discovered = discover_fields(test_pdf_files["large_form"])
        assert len(discovered) == 0

    def test_pdf_field_labels_generated_correctly(self, test_pdf_files):
        """Each simple-form field carries the expected human-readable label."""
        discovered = discover_fields(test_pdf_files["simple_form"])
        labels_by_name = {entry["field"]: entry["label"] for entry in discovered}

        # Field name -> expected label.
        expected_labels = {
            "name": "Name",
            "email": "Email",
            "phone": "Phone",
            "country": "Country",
            "birth_date": "Birth Date",
            "agree_terms": "Agree Terms",
        }
        for field_name, expected in expected_labels.items():
            assert labels_by_name[field_name] == expected

    def test_pdf_field_options_extracted_correctly(self, test_pdf_files):
        """The ``country`` dropdown of the simple form exposes its five options."""
        discovered = discover_fields(test_pdf_files["simple_form"])
        country = next(entry for entry in discovered if entry["field"] == "country")

        assert country["type"] == "select"

        options = country["options"]
        assert options is not None
        assert len(options) == 5
        for choice in ("USA", "Canada", "UK", "Germany", "France"):
            assert choice in options
class TestCompleteWorkflow:
    """Test complete document lifecycle workflows."""

    def test_complete_document_lifecycle(self, test_client, test_pdf_files, sample_auth_token):
        """Test complete document lifecycle: upload, get metadata, get fields, delete.

        The simple-form fixture is uploaded, then metadata, fields and the
        download URL are requested for the new document, and finally the
        document is deleted. All follow-up requests are issued before any of
        their statuses are asserted.

        If the upload does not answer 201 the test is skipped: none of the
        follow-up requests can run without a document id, and a skip keeps
        that visible instead of letting the test pass without checking
        anything.
        """
        headers = {"Authorization": sample_auth_token}
        org_params = {"org_id": "test-org-123"}

        # Upload document (the file handle must stay open while the request is sent)
        with open(test_pdf_files["simple_form"], "rb") as f:
            upload_response = test_client.post(
                "/api/documents/upload",
                files={"file": ("simple_form.pdf", f, "application/pdf")},
                data={"org_id": "test-org-123"},
                headers=headers,
            )

        if upload_response.status_code != 201:
            pytest.skip(
                f"upload returned {upload_response.status_code}; "
                "document lifecycle could not be exercised"
            )

        document_id = upload_response.json()["document_id"]
        document_url = f"/api/documents/{document_id}"

        # Get metadata
        metadata_response = test_client.get(document_url, params=org_params, headers=headers)

        # Get fields
        fields_response = test_client.get(
            f"{document_url}/fields", params=org_params, headers=headers
        )

        # Get download URL
        download_response = test_client.get(
            f"{document_url}/download-url", params=org_params, headers=headers
        )

        # Delete document
        delete_response = test_client.delete(document_url, params=org_params, headers=headers)

        # Verify the follow-up operations; 404 is tolerated for each of them
        # because it may be returned if S3 is not available.
        assert metadata_response.status_code in [200, 404]
        assert fields_response.status_code in [200, 404]
        assert download_response.status_code in [200, 404]
        assert delete_response.status_code in [200, 404]