{"id":5641,"date":"2024-09-01T15:25:52","date_gmt":"2024-09-01T07:25:52","guid":{"rendered":"https:\/\/www.aisharenet.com\/?p=5641"},"modified":"2025-08-25T01:31:02","modified_gmt":"2025-08-24T17:31:02","slug":"unstructured","status":"publish","type":"post","link":"https:\/\/www.kdjingpai.com\/pt\/unstructured\/","title":{"rendered":"Unstructured\uff1a\u5f00\u6e90\u9884\u5904\u7406\u975e\u7ed3\u6784\u5316\u6587\u6863\uff0c\u65e0\u7ed3\u6784\u6570\u636e\u5904\u7406\u7684\u5229\u5668"},"content":{"rendered":"<p>Unstructured-IO \u63d0\u4f9b\u4e86\u4e00\u7cfb\u5217\u5f00\u6e90\u7ec4\u4ef6\uff0c\u7528\u4e8e\u5904\u7406\u548c\u9884\u5904\u7406\u56fe\u50cf\u548c\u6587\u672c\u6587\u6863\uff0c\u5982 PDF\u3001HTML\u3001Word \u6587\u6863\u7b49\u3002\u5176\u4e3b\u8981\u76ee\u6807\u662f\u7b80\u5316\u548c\u4f18\u5316\u6570\u636e\u5904\u7406\u5de5\u4f5c\u6d41\u7a0b\uff0c\u7279\u522b\u662f\u4e3a\u5927\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u5e94\u7528\u63d0\u4f9b\u652f\u6301\u3002Unstructured-IO \u7684\u6a21\u5757\u5316\u529f\u80fd\u548c\u8fde\u63a5\u5668\u5f62\u6210\u4e86\u4e00\u4e2a\u7edf\u4e00\u7684\u7cfb\u7edf\uff0c\u4f7f\u6570\u636e\u7684\u6444\u53d6\u548c\u9884\u5904\u7406\u53d8\u5f97\u9ad8\u6548\u4e14\u9002\u5e94\u4e0d\u540c\u5e73\u53f0\u3002<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"aligncenter wp-image-5642\" title=\"Unstructured\uff1a\u5f00\u6e90\u9884\u5904\u7406\u975e\u7ed3\u6784\u5316\u6587\u6863\uff0c\u65e0\u7ed3\u6784\u6570\u636e\u5904\u7406\u7684\u5229\u5668-1\" src=\"https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2024\/09\/a5bda69abe21265.png\" alt=\"Unstructured\uff1a\u5f00\u6e90\u9884\u5904\u7406\u975e\u7ed3\u6784\u5316\u6587\u6863\uff0c\u65e0\u7ed3\u6784\u6570\u636e\u5904\u7406\u7684\u5229\u5668-1\" width=\"812\" height=\"402\" srcset=\"https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2024\/09\/a5bda69abe21265.png 1557w, https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2024\/09\/a5bda69abe21265-300x149.png 300w, https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2024\/09\/a5bda69abe21265-1024x507.png 1024w, https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2024\/09\/a5bda69abe21265-768x380.png 768w, https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2024\/09\/a5bda69abe21265-1536x761.png 1536w\" sizes=\"auto, (max-width: 812px) 100vw, 812px\" \/><\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<h2>\u529f\u80fd\u5217\u8868<\/h2>\n<ul>\n<li>\u6570\u636e\u6444\u53d6\u548c\u9884\u5904\u7406<\/li>\n<li>\u652f\u6301\u591a\u79cd\u6587\u6863\u7c7b\u578b\uff08PDF\u3001HTML\u3001Word \u7b49\uff09<\/li>\n<li>\u6a21\u5757\u5316\u529f\u80fd\u548c\u8fde\u63a5\u5668<\/li>\n<li>\u63d0\u4f9b\u5f00\u6e90 API \u548c\u5ba2\u6237\u7aef\u5e93<\/li>\n<li>\u652f\u6301 Docker \u5bb9\u5668\u5316\u90e8\u7f72<\/li>\n<li>\u63d0\u4f9b\u65e0\u670d\u52a1\u5668 API \u4ee5\u63d0\u9ad8\u6027\u80fd<\/li>\n<\/ul>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<h2>\u4f7f\u7528\u5e2e\u52a9<\/h2>\n<h3>\u5b89\u88c5\u6d41\u7a0b<\/h3>\n<ol>\n<li><strong>\u4f7f\u7528 Docker \u5bb9\u5668\u8fd0\u884c\u5e93<\/strong>\n<ul>\n<li>\u786e\u4fdd\u5df2\u5b89\u88c5 Docker\u3002<\/li>\n<li>\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u4ee5\u4e0b\u8f7d\u5e76\u8fd0\u884c\u76f8\u5e94\u7684 Docker \u955c\u50cf\uff1a\n<pre><code class=\"language-bash\">docker pull downloads.unstructured.io\/unstructured-io\/unstructured:latest\r\ndocker run -it --<span class=\"hljs-built_in\">rm<\/span> downloads.unstructured.io\/unstructured-io\/unstructured:latest\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u4ece PyPI \u5b89\u88c5\u5e93<\/strong>\n<ul>\n<li>\u4f7f\u7528 pip \u5b89\u88c5\uff1a\n<pre><code class=\"language-bash\">pip install unstructured\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u672c\u5730\u5f00\u53d1\u5b89\u88c5<\/strong>\n<ul>\n<li>\u514b\u9686 GitHub \u4ed3\u5e93\uff1a\n<pre><code class=\"language-bash\">git <span class=\"hljs-built_in\">clone<\/span> https:\/\/github.com\/Unstructured-IO\/unstructured.git\r\n<span class=\"hljs-built_in\">cd<\/span> unstructured\r\npip install -e .\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n<p>&nbsp;<\/p>\n<h3>\u4f7f\u7528\u6307\u5357<\/h3>\n<ol>\n<li><strong>\u6570\u636e\u6444\u53d6<\/strong>\n<ul>\n<li>\u4f7f\u7528 <code>unstructured<\/code> \u5e93\u6444\u53d6\u6587\u6863\uff1a\n<pre><code class=\"language-python\"><span class=\"hljs-keyword\">from<\/span> unstructured.partition.pdf <span class=\"hljs-keyword\">import<\/span> partition_pdf\r\ndocument = partition_pdf(<span class=\"hljs-string\">\"example.pdf\"<\/span>)\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u6570\u636e\u9884\u5904\u7406<\/strong>\n<ul>\n<li>\u6e05\u7406\u548c\u5206\u5757\u6587\u6863\uff1a\n<pre><code class=\"language-python\"><span class=\"hljs-keyword\">from<\/span> unstructured.cleaners.core <span class=\"hljs-keyword\">import<\/span> clean\r\ncleaned_document = clean(document)\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u8fde\u63a5\u5230\u6570\u636e\u6e90\u548c\u76ee\u6807<\/strong>\n<ul>\n<li>\u4f7f\u7528\u8fde\u63a5\u5668\u5c06\u6570\u636e\u4f20\u8f93\u5230\u76ee\u6807\u4f4d\u7f6e\uff1a\n<pre><code class=\"language-python\"><span class=\"hljs-keyword\">from<\/span> unstructured.connectors <span class=\"hljs-keyword\">import<\/span> send_to_destination\r\nsend_to_destination(cleaned_document, destination=<span class=\"hljs-string\">\"s3:\/\/bucket-name\"<\/span>)\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<li><strong>\u65e0\u670d\u52a1\u5668 API<\/strong>\n<ul>\n<li>\u6ce8\u518c\u5e76\u83b7\u53d6 API \u5bc6\u94a5\uff1a\n<ul>\n<li>\u8bbf\u95ee <a class=\"ac-anchor\" href=\"https:\/\/github.com\/Unstructured-IO\/unstructured-api\" target=\"_blank\" rel=\"noopener\">Unstructured API \u6ce8\u518c\u9875\u9762<\/a>\u3002<\/li>\n<li>\u83b7\u53d6 API \u5bc6\u94a5\u5e76\u5f00\u59cb\u4f7f\u7528\uff1a\n<pre><code class=\"language-python\"><span class=\"hljs-keyword\">import<\/span> requests\r\nheaders = {<span class=\"hljs-string\">\"Authorization\"<\/span>: <span class=\"hljs-string\">\"Bearer YOUR_API_KEY\"<\/span>}\r\nresponse = requests.post(<span class=\"hljs-string\">\"https:\/\/api.unstructured.io\/process\"<\/span>, headers=headers, json={<span class=\"hljs-string\">\"document\"<\/span>: <span class=\"hljs-string\">\"example.pdf\"<\/span>})\r\n<\/code><\/pre>\n<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n","protected":false},"excerpt":{"rendered":"<p>Unstructured-IO \u63d0\u4f9b\u4e86\u4e00\u7cfb\u5217\u5f00\u6e90\u7ec4\u4ef6\uff0c\u7528\u4e8e\u5904\u7406\u548c\u9884\u5904\u7406\u56fe\u50cf\u548c\u6587\u672c\u6587\u6863\uff0c\u5982 PDF\u3001HTML\u3001Word \u6587\u6863\u7b49\u3002\u5176\u4e3b\u8981\u76ee\u6807\u662f\u7b80\u5316\u548c\u4f18\u5316\u6570\u636e\u5904\u7406\u5de5\u4f5c\u6d41\u7a0b\uff0c\u7279\u522b\u662f\u4e3a\u5927\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u5e94\u7528\u63d0\u4f9b\u652f\u6301\u3002Unstructured-IO &#8230;<\/p>\n","protected":false},"author":1,"featured_media":32782,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20,499],"tags":[230,252],"class_list":["post-5641","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-tool","category-document-extraction","tag-aikaiyuanxiangmu","tag-markdown"],"_links":{"self":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/posts\/5641","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/comments?post=5641"}],"version-history":[{"count":0,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/posts\/5641\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/media\/32782"}],"wp:attachment":[{"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/media?parent=5641"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/categories?post=5641"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/pt\/wp-json\/wp\/v2\/tags?post=5641"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}