"""
Comprehensive test suite for document-service.

Tests document upload, retrieval, field discovery, and complete workflows.
"""

import os
from unittest.mock import Mock, patch

import boto3
import pytest
from fastapi.testclient import TestClient
from moto import mock_aws

from app.main import app
from app.pdf import discover_fields

# Test data paths
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")

# Request constants shared by every test in this module.
_ORG_ID = "test-org-123"
_MISSING_DOC_ID = "test-doc-456"
_DOCUMENTS_URL = "/api/documents"
_UPLOAD_URL = f"{_DOCUMENTS_URL}/upload"

# Expected field name -> type mapping for the "simple_form" fixture PDF.
_SIMPLE_FORM_FIELD_TYPES = {
    "name": "string",
    "email": "string",
    "phone": "string",
    "country": "select",
    "birth_date": "date",
    "agree_terms": "boolean",
}


def _auth_headers(auth_token):
    """Return a fresh Authorization header dict for ``auth_token``."""
    return {"Authorization": auth_token}


def _org_params():
    """Return a fresh query-parameter dict selecting the test organisation."""
    return {"org_id": _ORG_ID}


def _upload_pdf(client, pdf_files, form_key, auth_token=None):
    """POST the fixture PDF named ``form_key`` to the upload endpoint.

    Args:
        client: The test client used to issue the request.
        pdf_files: Mapping of fixture key -> path of the PDF on disk.
        form_key: Key into ``pdf_files``; the upload filename is
            ``f"{form_key}.pdf"``.
        auth_token: Value for the ``Authorization`` header, or ``None`` to
            send the request without that header.

    Returns:
        The response returned by the upload endpoint.
    """
    headers = None if auth_token is None else _auth_headers(auth_token)
    # The request must be sent while the file handle is still open.
    with open(pdf_files[form_key], "rb") as pdf:
        return client.post(
            _UPLOAD_URL,
            files={"file": (f"{form_key}.pdf", pdf, "application/pdf")},
            data={"org_id": _ORG_ID},
            headers=headers,
        )


def _upload_and_get_id(client, pdf_files, form_key, auth_token):
    """Upload a fixture PDF and return its ``document_id``.

    Fails the calling test if the upload does not return 201, so that
    dependent assertions are never silently skipped.
    """
    response = _upload_pdf(client, pdf_files, form_key, auth_token)
    assert response.status_code == 201, (
        f"upload of {form_key}.pdf failed with "
        f"{response.status_code}: {response.text}"
    )
    return response.json()["document_id"]


def _get_fields(client, document_id, auth_token):
    """GET the discovered form fields of ``document_id``."""
    return client.get(
        f"{_DOCUMENTS_URL}/{document_id}/fields",
        params=_org_params(),
        headers=_auth_headers(auth_token),
    )


class TestHealthEndpoint:
    """Test health endpoint functionality."""

    def test_health_endpoint(self, test_client):
        """Test health endpoint returns 200 OK."""
        response = test_client.get("/health")

        assert response.status_code == 200
        assert response.json() == {"status": "ok"}


class TestDocumentUpload:
    """Test document upload functionality."""

    def test_upload_simple_pdf_success(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test uploading a simple PDF with form fields."""
        response = _upload_pdf(
            test_client, test_pdf_files, "simple_form", sample_auth_token
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert "metadata" in result
        assert "download_url" in result
        assert result["metadata"]["document_type"] == "pdf"
        assert result["metadata"]["filename"] == "simple_form.pdf"

    def test_upload_complex_pdf_success(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test uploading a complex PDF with multiple field types."""
        response = _upload_pdf(
            test_client, test_pdf_files, "complex_form", sample_auth_token
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert result["metadata"]["document_type"] == "pdf"

    def test_upload_no_form_pdf_success(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test uploading a PDF without form fields."""
        response = _upload_pdf(
            test_client, test_pdf_files, "no_form", sample_auth_token
        )

        assert response.status_code == 201
        assert "document_id" in response.json()

    def test_upload_without_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload without auth returns 401."""
        response = _upload_pdf(test_client, test_pdf_files, "simple_form")

        assert response.status_code == 401
        assert "detail" in response.json()

    def test_upload_with_invalid_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload with invalid auth returns 401."""
        response = _upload_pdf(
            test_client, test_pdf_files, "simple_form", "Invalid token"
        )

        assert response.status_code == 401

    def test_upload_missing_file_returns_400(self, test_client, sample_auth_token):
        """Test upload without file is rejected with 422.

        The method name says 400 for historical reasons; the status actually
        asserted is 422 (FastAPI validation error).
        """
        response = test_client.post(
            _UPLOAD_URL,
            data={"org_id": _ORG_ID},
            headers=_auth_headers(sample_auth_token),
        )

        assert response.status_code == 422  # FastAPI validation error


class TestDocumentMetadata:
    """Test document metadata retrieval."""

    def test_get_document_metadata_success(self, test_client, sample_auth_token):
        """Test the metadata endpoint is reachable with valid auth.

        No document is uploaded first, so only the endpoint structure is
        exercised here.
        """
        response = test_client.get(
            f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}",
            params=_org_params(),
            headers=_auth_headers(sample_auth_token),
        )

        # Will return 404 since document doesn't exist, but endpoint is accessible
        assert response.status_code in (404, 403)

    def test_get_document_without_auth_returns_401(self, test_client):
        """Test getting document without auth returns 401."""
        response = test_client.get(f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}")

        assert response.status_code == 401


class TestDownloadUrl:
    """Test download URL generation."""

    def test_get_download_url_success(self, test_client, sample_auth_token):
        """Test the download-URL endpoint is reachable with valid auth."""
        response = test_client.get(
            f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}/download-url",
            params=_org_params(),
            headers=_auth_headers(sample_auth_token),
        )

        # Will return 404 since document doesn't exist, but endpoint is accessible
        assert response.status_code in (404, 403)

    def test_get_download_url_without_auth_returns_401(self, test_client):
        """Test getting download URL without auth returns 401."""
        response = test_client.get(
            f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}/download-url"
        )

        assert response.status_code == 401


class TestPDFFieldDiscovery:
    """Test PDF field discovery functionality."""

    def test_get_pdf_fields_simple_form(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test getting PDF fields from simple form."""
        # First upload the document; a failed upload fails the test.
        document_id = _upload_and_get_id(
            test_client, test_pdf_files, "simple_form", sample_auth_token
        )

        # Get fields
        response = _get_fields(test_client, document_id, sample_auth_token)

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        # name, email, phone, country, birth_date, agree_terms
        assert len(result["fields"]) == 6

        # Check field types
        field_types = {f["field"]: f["type"] for f in result["fields"]}
        assert field_types == _SIMPLE_FORM_FIELD_TYPES

    def test_get_pdf_fields_complex_form(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test getting PDF fields from complex form."""
        # First upload the document; a failed upload fails the test.
        document_id = _upload_and_get_id(
            test_client, test_pdf_files, "complex_form", sample_auth_token
        )

        # Get fields
        response = _get_fields(test_client, document_id, sample_auth_token)

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 16  # All fields from complex form

    def test_get_pdf_fields_no_form_returns_empty_list(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test getting PDF fields from PDF without form fields."""
        # First upload the document; a failed upload fails the test.
        document_id = _upload_and_get_id(
            test_client, test_pdf_files, "no_form", sample_auth_token
        )

        # Get fields
        response = _get_fields(test_client, document_id, sample_auth_token)

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 0

    def test_get_pdf_fields_without_auth_returns_401(self, test_client):
        """Test getting PDF fields without auth returns 401."""
        response = test_client.get(f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}/fields")

        assert response.status_code == 401


class TestDocumentDeletion:
    """Test document deletion functionality."""

    def test_delete_document_success(self, test_client, sample_auth_token):
        """Test the delete endpoint is reachable with valid auth."""
        response = test_client.delete(
            f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}",
            params=_org_params(),
            headers=_auth_headers(sample_auth_token),
        )

        # Will return 404 since document doesn't exist, but endpoint is accessible
        assert response.status_code in (404, 403)

    def test_delete_document_without_auth_returns_401(self, test_client):
        """Test deleting document without auth returns 401."""
        response = test_client.delete(f"{_DOCUMENTS_URL}/{_MISSING_DOC_ID}")

        assert response.status_code == 401


class TestPDFFieldDiscoveryDirect:
    """Test PDF field discovery directly (without API)."""

    def test_simple_form_pdf_fields(self, test_pdf_files):
        """Test field discovery on simple form PDF."""
        fields = discover_fields(test_pdf_files["simple_form"])

        assert len(fields) == 6

        field_names = [f["field"] for f in fields]
        for expected_name in _SIMPLE_FORM_FIELD_TYPES:
            assert expected_name in field_names

        # Check field types
        field_types = {f["field"]: f["type"] for f in fields}
        assert field_types == _SIMPLE_FORM_FIELD_TYPES

    def test_complex_form_pdf_fields(self, test_pdf_files):
        """Test field discovery on complex form PDF."""
        fields = discover_fields(test_pdf_files["complex_form"])

        assert len(fields) == 16

        # Check for expected fields
        field_names = [f["field"] for f in fields]
        for expected_name in (
            "first_name",
            "last_name",
            "email",
            "country",
            "gender",
            "agree_terms",
            "signature",
        ):
            assert expected_name in field_names

        # Check field types
        field_types = {f["field"]: f["type"] for f in fields}
        expected_types = {
            "first_name": "string",
            "country": "select",
            "gender": "boolean",
            "agree_terms": "boolean",
            "signature": "string",
        }
        for field_name, expected_type in expected_types.items():
            assert field_types[field_name] == expected_type

    def test_no_form_pdf_fields(self, test_pdf_files):
        """Test field discovery on PDF without form fields."""
        fields = discover_fields(test_pdf_files["no_form"])

        assert len(fields) == 0

    def test_large_form_pdf_fields(self, test_pdf_files):
        """Test field discovery on large PDF without form fields."""
        fields = discover_fields(test_pdf_files["large_form"])

        assert len(fields) == 0

    def test_pdf_field_labels_generated_correctly(self, test_pdf_files):
        """Test that field labels are generated correctly."""
        fields = discover_fields(test_pdf_files["simple_form"])

        field_labels = {f["field"]: f["label"] for f in fields}
        expected_labels = {
            "name": "Name",
            "email": "Email",
            "phone": "Phone",
            "country": "Country",
            "birth_date": "Birth Date",
            "agree_terms": "Agree Terms",
        }
        for field_name, expected_label in expected_labels.items():
            assert field_labels[field_name] == expected_label

    def test_pdf_field_options_extracted_correctly(self, test_pdf_files):
        """Test that dropdown options are extracted correctly."""
        fields = discover_fields(test_pdf_files["simple_form"])

        country_field = next(f for f in fields if f["field"] == "country")
        assert country_field["type"] == "select"

        options = country_field["options"]
        assert options is not None
        assert len(options) == 5
        for country in ("USA", "Canada", "UK", "Germany", "France"):
            assert country in options


class TestCompleteWorkflow:
    """Test complete document lifecycle workflows."""

    def test_complete_document_lifecycle(
        self, test_client, test_pdf_files, sample_auth_token
    ):
        """Test complete document lifecycle: upload, get metadata, get fields, delete."""
        # Upload document; a failed upload fails the test instead of
        # letting it pass without exercising the rest of the lifecycle.
        document_id = _upload_and_get_id(
            test_client, test_pdf_files, "simple_form", sample_auth_token
        )
        document_url = f"{_DOCUMENTS_URL}/{document_id}"
        headers = _auth_headers(sample_auth_token)

        # Get metadata
        metadata_response = test_client.get(
            document_url, params=_org_params(), headers=headers
        )

        # Get fields
        fields_response = test_client.get(
            f"{document_url}/fields", params=_org_params(), headers=headers
        )

        # Get download URL
        download_response = test_client.get(
            f"{document_url}/download-url", params=_org_params(), headers=headers
        )

        # Delete document
        delete_response = test_client.delete(
            document_url, params=_org_params(), headers=headers
        )

        # Verify all operations succeeded
        # May be 404 if S3 not available
        assert metadata_response.status_code in (200, 404)
        assert fields_response.status_code in (200, 404)
        assert download_response.status_code in (200, 404)
        assert delete_response.status_code in (200, 404)