{"id":14708,"date":"2024-12-03T13:59:21","date_gmt":"2024-12-03T05:59:21","guid":{"rendered":"https:\/\/www.aisharenet.com\/?p=14708"},"modified":"2024-12-03T13:59:21","modified_gmt":"2024-12-03T05:59:21","slug":"textdistiller","status":"publish","type":"post","link":"https:\/\/www.kdjingpai.com\/pt\/textdistiller\/","title":{"rendered":"TextDistiller\uff1a\u4e00\u952e\u603b\u7ed3\u4e00\u6574\u672c\u4e66\uff0c\u9ad8\u6548\u63d0\u70bc\u4e66\u7c4d\u5185\u5bb9\uff0c\u5feb\u901f\u638c\u63e1\u6838\u5fc3\u601d\u60f3"},"content":{"rendered":"<p>TextDistiller \u662f\u4e00\u6b3e\u5148\u8fdb\u7684\u4eba\u5de5\u667a\u80fd\u9a71\u52a8\u5de5\u5177\uff0c\u65e8\u5728\u5bf9\u4e66\u7c4d\u8fdb\u884c\u9010\u7ae0\u6216\u6574\u4f53\u603b\u7ed3\uff0c\u63d0\u4f9b\u7b80\u6d01\u800c\u5168\u9762\u7684\u6982\u8ff0\u3002\u901a\u8fc7\u4f7f\u7528 TextDistiller\uff0c\u7528\u6237\u80fd\u591f\u5feb\u901f\u638c\u63e1\u4efb\u4f55\u4e66\u7c4d\u7684\u6838\u5fc3\u601d\u60f3\u548c\u5173\u952e\u8981\u70b9\uff0c\u4ece\u800c\u8282\u7701\u65f6\u95f4\uff0c\u540c\u65f6\u4fdd\u6301\u5bf9\u5185\u5bb9\u7684\u7406\u89e3\u3002\u8be5\u5de5\u5177\u5229\u7528\u6700\u5148\u8fdb\u7684\u81ea\u7136\u8bed\u8a00\u5904\u7406\u6280\u672f\uff0c\u786e\u4fdd\u751f\u6210\u7684\u6458\u8981\u65e2\u51c6\u786e\u53c8\u6613\u8bfb\uff0c\u9002\u7528\u4e8e\u9700\u8981\u5feb\u901f\u83b7\u53d6\u548c\u7406\u89e3\u4e66\u7c4d\u4fe1\u606f\u7684\u4eba\u7fa4\u3002<\/p>\n<p>&nbsp;<\/p>\n<h2>\u529f\u80fd\u5217\u8868<\/h2>\n<ul>\n<li><strong>\u9010\u7ae0\u603b\u7ed3<\/strong>\uff1a\u63d0\u4f9b\u6bcf\u7ae0\u8be6\u7ec6\u6458\u8981\uff0c\u65b9\u4fbf\u7528\u6237\u5173\u6ce8\u7279\u5b9a\u7ae0\u8282\u5185\u5bb9\u3002<\/li>\n<li><strong>\u6574\u4e66\u6982\u8ff0<\/strong>\uff1a\u5bf9\u4e8e\u6ca1\u6709\u7ae0\u8282\u5212\u5206\u7684\u4e66\u7c4d\uff0c\u63d0\u4f9b\u6574\u4f53\u5185\u5bb9\u7684\u51dd\u7ec3\u603b\u7ed3\u3002<\/li>\n<li><strong>\u81ea\u7136\u8bed\u8a00\u5904\u7406<\/strong>\uff1a\u5229\u7528\u6700\u5148\u8fdb\u7684 NLP \u6280\u672f\uff0c\u786e\u4fdd\u6458\u8981\u5185\u5bb9\u7684\u51c6\u786e\u6027\u548c\u53ef\u8bfb\u6027\u3002<\/li>\n<li><strong>\u7528\u6237\u53cb\u597d\u754c\u9762<\/strong>\uff1a\u7b80\u6d01\u76f4\u89c2\u7684\u754c\u9762\u8bbe\u8ba1\uff0c\u4f7f\u6458\u8981\u8fc7\u7a0b\u53d8\u5f97\u7b80\u5355\u6613\u884c\u3002<\/li>\n<\/ul>\n<p>&nbsp;<\/p>\n<h2>\u4f7f\u7528\u5e2e\u52a9<\/h2>\n<h3>\u5b89\u88c5\u6d41\u7a0b<\/h3>\n<ol>\n<li>\u514b\u9686\u4ed3\u5e93\uff1a<code>git clone https:\/\/github.com\/johngai19\/TextDistiller.git<\/code><\/li>\n<li>\u5b89\u88c5\u6240\u9700\u4f9d\u8d56\uff1a<code>pip install -r requirements.txt<\/code><\/li>\n<li>\u8fd0\u884c\u547d\u4ee4\u884c\u754c\u9762\uff08CLI\uff09\uff1a<code>python3 bsCLI.py --path &lt;path-to-PDF-file&gt;<\/code><\/li>\n<li>\u8fd0\u884c Flask \u670d\u52a1\u5668\u5e76\u66f4\u65b0\u90ae\u4ef6\u914d\u7f6e\uff1a\n<ul>\n<li>\u66f4\u65b0 <code>mail.py<\/code> \u4e2d\u7684 <code>sender_address<\/code> \u548c <code>sender_pass<\/code>\u3002<\/li>\n<li>\u8fd0\u884c <code>views.py<\/code>\uff1a<code>python3 views.py<\/code><\/li>\n<\/ul>\n<\/li>\n<\/ol>\n<h3>\u4f7f\u7528\u6d41\u7a0b<\/h3>\n<h4>\u9010\u7ae0\u603b\u7ed3<\/h4>\n<ol>\n<li>\u5c06\u4e66\u7c4d PDF \u6587\u4ef6\u8def\u5f84\u4f5c\u4e3a\u53c2\u6570\u4f20\u9012\u7ed9\u547d\u4ee4\u884c\u5de5\u5177\u3002<\/li>\n<li>\u5de5\u5177\u4f1a\u81ea\u52a8\u5c06\u4e66\u7c4d\u6309\u7ae0\u8282\u5206\u5757\uff0c\u5e76\u751f\u6210\u6bcf\u7ae0\u7684\u8be6\u7ec6\u6458\u8981\u3002<\/li>\n<li>\u7528\u6237\u53ef\u4ee5\u67e5\u770b\u6bcf\u7ae0\u7684\u6838\u5fc3\u5185\u5bb9\uff0c\u5feb\u901f\u638c\u63e1\u4e66\u7c4d\u7684\u4e3b\u8981\u601d\u60f3\u3002<\/li>\n<\/ol>\n<h4>\u6574\u4e66\u6982\u8ff0<\/h4>\n<ol>\n<li>\u5bf9\u4e8e\u6ca1\u6709\u7ae0\u8282\u5212\u5206\u7684\u4e66\u7c4d\uff0c\u5de5\u5177\u4f1a\u5c06\u6574\u672c\u4e66\u4f5c\u4e3a\u4e00\u4e2a\u6574\u4f53\u8fdb\u884c\u5904\u7406\u3002<\/li>\n<li>\u751f\u6210\u7684\u6458\u8981\u5c06\u6db5\u76d6\u4e66\u7c4d\u7684\u6240\u6709\u91cd\u8981\u5185\u5bb9\uff0c\u63d0\u4f9b\u4e00\u4e2a\u5168\u9762\u7684\u6982\u8ff0\u3002<\/li>\n<\/ol>\n<h3>\u4e3b\u8981\u529f\u80fd\u64cd\u4f5c<\/h3>\n<ul>\n<li><strong>\u9010\u7ae0\u603b\u7ed3<\/strong>\uff1a\u5728\u547d\u4ee4\u884c\u4e2d\u8fd0\u884c <code>python3 bsCLI.py --path &lt;path-to-PDF-file&gt;<\/code>\uff0c\u5de5\u5177\u4f1a\u81ea\u52a8\u5904\u7406\u5e76\u751f\u6210\u6bcf\u7ae0\u6458\u8981\u3002<\/li>\n<li><strong>\u6574\u4e66\u6982\u8ff0<\/strong>\uff1a\u540c\u6837\u5728\u547d\u4ee4\u884c\u4e2d\u8fd0\u884c\u4e0a\u8ff0\u547d\u4ee4\uff0c\u5de5\u5177\u4f1a\u6839\u636e\u4e66\u7c4d\u7ed3\u6784\u81ea\u52a8\u9009\u62e9\u9002\u5f53\u7684\u5904\u7406\u65b9\u5f0f\u3002<\/li>\n<li><strong>\u67e5\u770b\u6458\u8981<\/strong>\uff1a\u751f\u6210\u7684\u6458\u8981\u5c06\u4ee5\u6587\u672c\u6587\u4ef6\u7684\u5f62\u5f0f\u4fdd\u5b58\u5728\u6307\u5b9a\u76ee\u5f55\uff0c\u7528\u6237\u53ef\u4ee5\u76f4\u63a5\u6253\u5f00\u67e5\u770b\u3002<\/li>\n<\/ul>\n<h3>\u7279\u8272\u529f\u80fd<\/h3>\n<ul>\n<li><strong>\u81ea\u7136\u8bed\u8a00\u5904\u7406\u6280\u672f<\/strong>\uff1aTextDistiller \u5229\u7528 T5-small \u9884\u8bad\u7ec3\u6a21\u578b\uff0c\u901a\u8fc7\u5206\u5757\u3001\u6807\u8bb0\u5316\u3001\u6458\u8981\u751f\u6210\u548c\u89e3\u7801\u7b49\u6b65\u9aa4\uff0c\u786e\u4fdd\u751f\u6210\u7684\u6458\u8981\u65e2\u51c6\u786e\u53c8\u6613\u8bfb\u3002<\/li>\n<li><strong>\u7528\u6237\u53cb\u597d\u754c\u9762<\/strong>\uff1a\u65e0\u8bba\u662f\u547d\u4ee4\u884c\u5de5\u5177\u8fd8\u662f Flask \u670d\u52a1\u5668\uff0cTextDistiller \u90fd\u63d0\u4f9b\u4e86\u7b80\u6d01\u76f4\u89c2\u7684\u64cd\u4f5c\u754c\u9762\uff0c\u7528\u6237\u53ef\u4ee5\u8f7b\u677e\u4e0a\u624b\u4f7f\u7528\u3002<\/li>\n<\/ul>\n<p>&nbsp;<\/p>\n<h2>TextDistiller \u7684\u5de5\u4f5c\u539f\u7406<\/h2>\n<p>TextDistiller \u5229\u7528 HuggingFace Transformers \u4e2d\u7684\u00a0<code>T5-small<\/code>\u00a0\u9884\u8bad\u7ec3\u6a21\u578b\u6765\u751f\u6210\u51c6\u786e\u4e14\u6613\u8bfb\u7684\u6458\u8981\u3002\u8be5\u8fc7\u7a0b\u5305\u62ec\uff1a<\/p>\n<ol>\n<li><strong>\u5206\u5757<\/strong>\uff1a\u5c06\u4e66\u7c4d\u5206\u5272\u6210\u82e5\u5e72\u5757\uff0c\u53ef\u4ee5\u6309\u7ae0\u8282\u5206\u5272\u6216\u4f5c\u4e3a\u4e00\u4e2a\u6574\u4f53\u3002<\/li>\n<li><strong>\u5206\u8bcd<\/strong>\uff1a\u4f7f\u7528\u00a0<code>T5Tokenizer<\/code>\u00a0\u5bf9\u8fd9\u4e9b\u5757\u8fdb\u884c\u5206\u8bcd\uff0c\u4ee5\u786e\u4fdd\u4e0e\u00a0<code>T5<\/code>\u00a0\u6a21\u578b\u517c\u5bb9\u3002<\/li>\n<li><strong>\u6458\u8981\u751f\u6210<\/strong>\uff1a\u7ecf\u8fc7\u5206\u8bcd\u5904\u7406\u7684\u6587\u672c\u901a\u8fc7\u00a0<code>T5ForConditionalGeneration<\/code>\u00a0\u6a21\u578b\u751f\u6210\u6458\u8981\u7684 <a href=\"https:\/\/www.kdjingpai.com\/tokenization\/\">Token<\/a> ID\u3002<\/li>\n<li><strong>\u89e3\u7801<\/strong>\uff1a\u4f7f\u7528\u00a0<code>T5Tokenizer<\/code>\u00a0\u7684\u00a0<code>decode()<\/code>\u00a0\u51fd\u6570\u5c06\u6458\u8981\u7684 Token ID \u89e3\u7801\u4e3a\u53ef\u8bfb\u7684\u6587\u672c\u3002<\/li>\n<\/ol>\n","protected":false},"excerpt":{"rendered":"<p>TextDistiller \u662f\u4e00\u6b3e\u5148\u8fdb\u7684\u4eba\u5de5\u667a\u80fd\u9a71\u52a8\u5de5\u5177\uff0c\u65e8\u5728\u5bf9\u4e66\u7c4d\u8fdb\u884c\u9010\u7ae0\u6216\u6574\u4f53\u603b\u7ed3\uff0c\u63d0\u4f9b\u7b80\u6d01\u800c\u5168\u9762\u7684\u6982\u8ff0\u3002\u901a\u8fc7\u4f7f\u7528 TextDistiller\uff0c\u7528\u6237\u80fd\u591f\u5feb\u901f\u638c\u63e1\u4efb\u4f55\u4e66\u7c4d\u7684\u6838\u5fc3\u601d\u60f3\u548c\u5173\u952e\u8981\u70b9\uff0c\u4ece\u800c\u8282\u7701\u65f6\u95f4\uff0c\u540c\u65f6\u4fdd\u6301\u5bf9\u5185\u5bb9\u7684\u7406\u89e3\u3002\u8be5\u5de5\u5177\u5229\u7528&#8230;<\/p>\n","protected":false},"author":1,"featured_media":61354,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20],"tags":[230,236],"class_list":["post-14708","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-tool","tag-aikaiyuanxiangmu","tag-aihuiyishipinzongban"],"_links":{"self":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/posts\/14708","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/comments?post=14708"}],"version-history":[{"count":0,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/posts\/14708\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/media\/61354"}],"wp:attachment":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/media?parent=14708"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/categories?post=14708"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/tags?post=14708"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}