{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:N6RK7NHVJQFUPPBD5UH4JPUXGC","short_pith_number":"pith:N6RK7NHV","schema_version":"1.0","canonical_sha256":"6fa2afb4f54c0b47bc23ed0fc4be9730809cf2a434a5e49fae721a5a926d01eb","source":{"kind":"arxiv","id":"2501.06659","version":2},"attestation_state":"computed","paper":{"title":"Visual Template Inference for Data Extraction from Documents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.DB","authors_text":"Aditya G. Parameswaran, Alvin Cheung, Mawil Hasan, Rohan Kosalge, Yiming Lin","submitted_at":"2025-01-11T23:07:04Z","abstract_excerpt":"Many templatized documents are programmatically generated from structured data following a visual template. Such documents include invoices, tax documents, financial reports, and purchase orders. Effective data extraction from these documents is crucial to support downstream analytical tasks. Current data extraction tools often struggle with complex document layouts, incur high latency and/or cost on large datasets, and require significant human effort. The key insight of our tool, TWIX, is to infer the underlying template used to create such documents, and then extract the data, rather than e"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2501.06659","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DB","submitted_at":"2025-01-11T23:07:04Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"69979ec0b0c135e5ff539259a0e0b7fbb2d4a0ba23db71a388518571b4b455c7","abstract_canon_sha256":"5904112b23a1ed31ff4e1f0f4507073fe2e9deb1ee5fcc0a57c19e2a0700a210"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T02:07:01.710502Z","signature_b64":"VRKo6tmzzsl8ZdRstU8ucu5n1wEdNSTq6Y9L/iJMFTQMgx7yU574CfY0owM1SAWY2TANUINXnoqJi8agCacYCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6fa2afb4f54c0b47bc23ed0fc4be9730809cf2a434a5e49fae721a5a926d01eb","last_reissued_at":"2026-06-09T02:07:01.709457Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T02:07:01.709457Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Visual Template Inference for Data Extraction from Documents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.DB","authors_text":"Aditya G. Parameswaran, Alvin Cheung, Mawil Hasan, Rohan Kosalge, Yiming Lin","submitted_at":"2025-01-11T23:07:04Z","abstract_excerpt":"Many templatized documents are programmatically generated from structured data following a visual template. Such documents include invoices, tax documents, financial reports, and purchase orders. Effective data extraction from these documents is crucial to support downstream analytical tasks. Current data extraction tools often struggle with complex document layouts, incur high latency and/or cost on large datasets, and require significant human effort. The key insight of our tool, TWIX, is to infer the underlying template used to create such documents, and then extract the data, rather than e"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2501.06659","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2501.06659/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2501.06659","created_at":"2026-06-09T02:07:01.709604+00:00"},{"alias_kind":"arxiv_version","alias_value":"2501.06659v2","created_at":"2026-06-09T02:07:01.709604+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.06659","created_at":"2026-06-09T02:07:01.709604+00:00"},{"alias_kind":"pith_short_12","alias_value":"N6RK7NHVJQFU","created_at":"2026-06-09T02:07:01.709604+00:00"},{"alias_kind":"pith_short_16","alias_value":"N6RK7NHVJQFUPPBD","created_at":"2026-06-09T02:07:01.709604+00:00"},{"alias_kind":"pith_short_8","alias_value":"N6RK7NHV","created_at":"2026-06-09T02:07:01.709604+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":5,"internal_anchor_count":5,"sample":[{"citing_arxiv_id":"2503.04338","citing_title":"In-depth Analysis of Graph-based RAG in a Unified Framework","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2509.00303","citing_title":"Access Paths for Efficient Ordering with Large Language Models","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2602.23061","citing_title":"MoDora: Tree-Based Semi-Structured Document Analysis System","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2604.01707","citing_title":"Memory in the LLM Era: Modular Architectures and Strategies in a Unified Framework","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02655","citing_title":"Semantic Data Processing with Holistic Data Understanding","ref_index":35,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC","json":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC.json","graph_json":"https://pith.science/api/pith-number/N6RK7NHVJQFUPPBD5UH4JPUXGC/graph.json","events_json":"https://pith.science/api/pith-number/N6RK7NHVJQFUPPBD5UH4JPUXGC/events.json","paper":"https://pith.science/paper/N6RK7NHV"},"agent_actions":{"view_html":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC","download_json":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC.json","view_paper":"https://pith.science/paper/N6RK7NHV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2501.06659&json=true","fetch_graph":"https://pith.science/api/pith-number/N6RK7NHVJQFUPPBD5UH4JPUXGC/graph.json","fetch_events":"https://pith.science/api/pith-number/N6RK7NHVJQFUPPBD5UH4JPUXGC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC/action/storage_attestation","attest_author":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC/action/author_attestation","sign_citation":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC/action/citation_signature","submit_replication":"https://pith.science/pith/N6RK7NHVJQFUPPBD5UH4JPUXGC/action/replication_record"}},"created_at":"2026-06-09T02:07:01.709604+00:00","updated_at":"2026-06-09T02:07:01.709604+00:00"}