{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:4OZHGM25QQMWHW2RN2AWY7LAJ7","short_pith_number":"pith:4OZHGM25","schema_version":"1.0","canonical_sha256":"e3b273335d841963db516e816c7d604fe74041b309d17a651b47edeebdd4947c","source":{"kind":"arxiv","id":"2509.26468","version":3},"attestation_state":"computed","paper":{"title":"fev-bench: A Realistic Benchmark for Time Series Forecasting","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Abdul Fatir Ansari, Caner Turkmen, Lorenzo Stella, Michael Bohlke-Schneider, Nick Erickson, Oleksandr Shchur, Pablo Guerron, Yuyang Wang","submitted_at":"2025-09-30T16:17:18Z","abstract_excerpt":"Benchmark quality is critical for meaningful evaluation and sustained progress in time series forecasting, particularly with the rise of pretrained models. Existing benchmarks often have limited domain coverage or overlook real-world settings such as tasks with covariates. Their aggregation procedures frequently lack statistical rigor, making it unclear whether observed performance differences reflect true improvements or random variation. Many benchmarks lack consistent evaluation infrastructure or are too rigid for integration into existing pipelines. To address these gaps, we propose fev-be"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.26468","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-30T16:17:18Z","cross_cats_sorted":[],"title_canon_sha256":"da2af2e187788e448c1e6f53bd9ae715fe23ad0818ba4cfeb093e10a7903772d","abstract_canon_sha256":"ad946717d47c86e743b2df91fa1df63a8740116f2cfb66e876e1dd9795942d2c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:17:09.934887Z","signature_b64":"wXDiL6G7TTKEgUdv7Kx1Um9mlkFW/MpAQRLfJiDrBxDE3ipIN0a/tqRsVJ0MJzKvrC1nj3aZVQkU8jDYU7BqCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e3b273335d841963db516e816c7d604fe74041b309d17a651b47edeebdd4947c","last_reissued_at":"2026-06-30T02:17:09.934101Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:17:09.934101Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"fev-bench: A Realistic Benchmark for Time Series Forecasting","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Abdul Fatir Ansari, Caner Turkmen, Lorenzo Stella, Michael Bohlke-Schneider, Nick Erickson, Oleksandr Shchur, Pablo Guerron, Yuyang Wang","submitted_at":"2025-09-30T16:17:18Z","abstract_excerpt":"Benchmark quality is critical for meaningful evaluation and sustained progress in time series forecasting, particularly with the rise of pretrained models. Existing benchmarks often have limited domain coverage or overlook real-world settings such as tasks with covariates. Their aggregation procedures frequently lack statistical rigor, making it unclear whether observed performance differences reflect true improvements or random variation. Many benchmarks lack consistent evaluation infrastructure or are too rigid for integration into existing pipelines. To address these gaps, we propose fev-be"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.26468","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.26468/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.26468","created_at":"2026-06-30T02:17:09.934190+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.26468v3","created_at":"2026-06-30T02:17:09.934190+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.26468","created_at":"2026-06-30T02:17:09.934190+00:00"},{"alias_kind":"pith_short_12","alias_value":"4OZHGM25QQMW","created_at":"2026-06-30T02:17:09.934190+00:00"},{"alias_kind":"pith_short_16","alias_value":"4OZHGM25QQMWHW2R","created_at":"2026-06-30T02:17:09.934190+00:00"},{"alias_kind":"pith_short_8","alias_value":"4OZHGM25","created_at":"2026-06-30T02:17:09.934190+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":11,"internal_anchor_count":11,"sample":[{"citing_arxiv_id":"2606.30410","citing_title":"Beyond IID: How General Are Tabular Foundation Models, Really?","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2605.27286","citing_title":"Falcon-X: A Time Series Foundation Model for Heterogeneous Multivariate Modeling","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22586","citing_title":"A Foundation Model for Instruction-Conditioned In-Context Time Series Tasks","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13986","citing_title":"TabPFN-3: Technical Report","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22586","citing_title":"A Foundation Model for Instruction-Conditioned In-Context Time Series Tasks","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12200","citing_title":"Investigating simple target-covariate relationships for Chronos-2 and TabPFN-TS","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.28149","citing_title":"Explainable Load Forecasting with Covariate-Informed Time Series Foundation Models","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00015","citing_title":"TimeRFT: Stimulating Generalizable Time Series Forecasting for TSFMs via Reinforcement Finetuning","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27351","citing_title":"Heterogeneous Scientific Foundation Model Collaboration","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24705","citing_title":"Energy-Arena: A Dynamic Benchmark for Operational Energy Forecasting","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22328","citing_title":"FETS Benchmark: Foundation Models Outperform Dataset-specific Machine Learning in Energy Time Series Forecasting","ref_index":16,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7","json":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7.json","graph_json":"https://pith.science/api/pith-number/4OZHGM25QQMWHW2RN2AWY7LAJ7/graph.json","events_json":"https://pith.science/api/pith-number/4OZHGM25QQMWHW2RN2AWY7LAJ7/events.json","paper":"https://pith.science/paper/4OZHGM25"},"agent_actions":{"view_html":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7","download_json":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7.json","view_paper":"https://pith.science/paper/4OZHGM25","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.26468&json=true","fetch_graph":"https://pith.science/api/pith-number/4OZHGM25QQMWHW2RN2AWY7LAJ7/graph.json","fetch_events":"https://pith.science/api/pith-number/4OZHGM25QQMWHW2RN2AWY7LAJ7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7/action/storage_attestation","attest_author":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7/action/author_attestation","sign_citation":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7/action/citation_signature","submit_replication":"https://pith.science/pith/4OZHGM25QQMWHW2RN2AWY7LAJ7/action/replication_record"}},"created_at":"2026-06-30T02:17:09.934190+00:00","updated_at":"2026-06-30T02:17:09.934190+00:00"}