{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2015:CBEUSRA7FAU2BJYQV6CRXQ35WD","short_pith_number":"pith:CBEUSRA7","schema_version":"1.0","canonical_sha256":"104949441f2829a0a710af851bc37db0dda00842143c1260ecf6b4589af7f568","source":{"kind":"arxiv","id":"1507.08396","version":1},"attestation_state":"computed","paper":{"title":"Tag-Weighted Topic Model For Large-scale Semi-Structured Documents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.IR","cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Guan Huang, Jiefei Li, Rong Pan, Ruiyang Tan, Shuangyin Li","submitted_at":"2015-07-30T06:44:37Z","abstract_excerpt":"To date, there have been massive Semi-Structured Documents (SSDs) during the evolution of the Internet. These SSDs contain both unstructured features (e.g., plain text) and metadata (e.g., tags). Most previous works focused on modeling the unstructured text, and recently, some other methods have been proposed to model the unstructured text with specific tags. To build a general model for SSDs remains an important problem in terms of both model fitness and efficiency. We propose a novel method to model the SSDs by a so-called Tag-Weighted Topic Model (TWTM). TWTM is a framework that leverages b"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1507.08396","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2015-07-30T06:44:37Z","cross_cats_sorted":["cs.IR","cs.LG","stat.ML"],"title_canon_sha256":"9d66616dd453e854dc7e531c4755737045724b268365905f1af8ffb80ed489d4","abstract_canon_sha256":"76bc2ffe3ec18dbf582af0e1332f01133285be17c9a5aaa631e0648d808ccfdf"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:36:06.583020Z","signature_b64":"VvmQMUitat6/2Gn83dlc4WE3WD4b+jo9J731EWaGuCOWKWm6k9Omfca809i+h+kve/gWT+OeNtfmODvIDV0QDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"104949441f2829a0a710af851bc37db0dda00842143c1260ecf6b4589af7f568","last_reissued_at":"2026-05-18T01:36:06.582424Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:36:06.582424Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Tag-Weighted Topic Model For Large-scale Semi-Structured Documents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.IR","cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Guan Huang, Jiefei Li, Rong Pan, Ruiyang Tan, Shuangyin Li","submitted_at":"2015-07-30T06:44:37Z","abstract_excerpt":"To date, there have been massive Semi-Structured Documents (SSDs) during the evolution of the Internet. These SSDs contain both unstructured features (e.g., plain text) and metadata (e.g., tags). Most previous works focused on modeling the unstructured text, and recently, some other methods have been proposed to model the unstructured text with specific tags. To build a general model for SSDs remains an important problem in terms of both model fitness and efficiency. We propose a novel method to model the SSDs by a so-called Tag-Weighted Topic Model (TWTM). TWTM is a framework that leverages b"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1507.08396","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1507.08396","created_at":"2026-05-18T01:36:06.582507+00:00"},{"alias_kind":"arxiv_version","alias_value":"1507.08396v1","created_at":"2026-05-18T01:36:06.582507+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1507.08396","created_at":"2026-05-18T01:36:06.582507+00:00"},{"alias_kind":"pith_short_12","alias_value":"CBEUSRA7FAU2","created_at":"2026-05-18T12:29:14.074870+00:00"},{"alias_kind":"pith_short_16","alias_value":"CBEUSRA7FAU2BJYQ","created_at":"2026-05-18T12:29:14.074870+00:00"},{"alias_kind":"pith_short_8","alias_value":"CBEUSRA7","created_at":"2026-05-18T12:29:14.074870+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD","json":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD.json","graph_json":"https://pith.science/api/pith-number/CBEUSRA7FAU2BJYQV6CRXQ35WD/graph.json","events_json":"https://pith.science/api/pith-number/CBEUSRA7FAU2BJYQV6CRXQ35WD/events.json","paper":"https://pith.science/paper/CBEUSRA7"},"agent_actions":{"view_html":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD","download_json":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD.json","view_paper":"https://pith.science/paper/CBEUSRA7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1507.08396&json=true","fetch_graph":"https://pith.science/api/pith-number/CBEUSRA7FAU2BJYQV6CRXQ35WD/graph.json","fetch_events":"https://pith.science/api/pith-number/CBEUSRA7FAU2BJYQV6CRXQ35WD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD/action/storage_attestation","attest_author":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD/action/author_attestation","sign_citation":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD/action/citation_signature","submit_replication":"https://pith.science/pith/CBEUSRA7FAU2BJYQV6CRXQ35WD/action/replication_record"}},"created_at":"2026-05-18T01:36:06.582507+00:00","updated_at":"2026-05-18T01:36:06.582507+00:00"}