{"id":21629,"date":"2025-02-20T19:43:55","date_gmt":"2025-02-20T11:43:55","guid":{"rendered":"https:\/\/www.aisharenet.com\/?p=21629"},"modified":"2025-02-20T19:43:55","modified_gmt":"2025-02-20T11:43:55","slug":"chinese-data-distill-from-r1","status":"publish","type":"post","link":"https:\/\/www.kdjingpai.com\/de\/chinese-data-distill-from-r1\/","title":{"rendered":"\u4e2d\u6587\u57fa\u4e8e\u6ee1\u8840 DeepSeek-R1 \u84b8\u998f\u6570\u636e\u96c6\uff0c\u652f\u6301\u4e2d\u6587R1\u84b8\u998fSFT\u6570\u636e\u96c6"},"content":{"rendered":"<p>\u4e2d\u6587DeepSeek-R1\u84b8\u998f\u6570\u636e\u96c6\u662f\u4e00\u4e2a\u5f00\u6e90\u7684\u4e2d\u6587\u6570\u636e\u96c6\uff0c\u5305\u542b110K\u6761\u6570\u636e\uff0c\u65e8\u5728\u652f\u6301\u673a\u5668\u5b66\u4e60\u548c\u81ea\u7136\u8bed\u8a00\u5904\u7406\u7814\u7a76\u3002\u8be5\u6570\u636e\u96c6\u7531\u5218\u806aNLP\u56e2\u961f\u53d1\u5e03\uff0c\u6570\u636e\u96c6\u4e0d\u4ec5\u5305\u542b\u6570\u5b66\u6570\u636e\uff0c\u8fd8\u5305\u62ec\u5927\u91cf\u7684\u901a\u7528\u7c7b\u578b\u6570\u636e\uff0c\u5982\u903b\u8f91\u63a8\u7406\u3001\u5c0f\u7ea2\u4e66\u3001\u77e5\u4e4e\u7b49\u3002\u6570\u636e\u96c6\u7684\u84b8\u998f\u8fc7\u7a0b\u4e25\u683c\u6309\u7167DeepSeek-R1\u5b98\u65b9\u63d0\u4f9b\u7684\u7ec6\u8282\u8fdb\u884c\uff0c\u786e\u4fdd\u6570\u636e\u7684\u9ad8\u8d28\u91cf\u548c\u591a\u6837\u6027\u3002\u7528\u6237\u53ef\u4ee5\u5728Hugging Face\u548cModelScope\u5e73\u53f0\u4e0a\u514d\u8d39\u4e0b\u8f7d\u548c\u4f7f\u7528\u8be5\u6570\u636e\u96c6\u3002<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"aligncenter size-full wp-image-21630\" title=\"\u4e2d\u6587\u57fa\u4e8e\u6ee1\u8840 DeepSeek-R1 \u84b8\u998f\u6570\u636e\u96c6\uff0c\u652f\u6301\u4e2d\u6587R1\u84b8\u998fSFT\u6570\u636e\u96c6-1\" src=\"https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2025\/02\/c84db0b86bc9277.png\" alt=\"\u4e2d\u6587\u57fa\u4e8e\u6ee1\u8840 DeepSeek-R1 \u84b8\u998f\u6570\u636e\u96c6\uff0c\u652f\u6301\u4e2d\u6587R1\u84b8\u998fSFT\u6570\u636e\u96c6-1\" 
width=\"1004\" height=\"565\" srcset=\"https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2025\/02\/c84db0b86bc9277.png 1004w, https:\/\/www.kdjingpai.com\/wp-content\/uploads\/2025\/02\/c84db0b86bc9277-768x432.png 768w\" sizes=\"auto, (max-width: 1004px) 100vw, 1004px\" \/><\/p>\n<p>&nbsp;<\/p>\n<h2>\u529f\u80fd\u5217\u8868<\/h2>\n<ul>\n<li><strong>\u591a\u6837\u5316\u6570\u636e\u7c7b\u578b<\/strong>\uff1a\u5305\u542b\u6570\u5b66\u3001\u903b\u8f91\u63a8\u7406\u3001\u901a\u7528\u7c7b\u578b\u6570\u636e\u7b49\u3002<\/li>\n<li><strong>\u9ad8\u8d28\u91cf\u6570\u636e<\/strong>\uff1a\u4e25\u683c\u6309\u7167DeepSeek-R1\u5b98\u65b9\u63d0\u4f9b\u7684\u7ec6\u8282\u8fdb\u884c\u84b8\u998f\u3002<\/li>\n<li><strong>\u514d\u8d39\u5f00\u6e90<\/strong>\uff1a\u7528\u6237\u53ef\u4ee5\u5728Hugging Face\u548cModelScope\u5e73\u53f0\u4e0a\u514d\u8d39\u4e0b\u8f7d\u3002<\/li>\n<li><strong>\u652f\u6301\u591a\u79cd\u5e94\u7528<\/strong>\uff1a\u9002\u7528\u4e8e\u673a\u5668\u5b66\u4e60\u3001\u81ea\u7136\u8bed\u8a00\u5904\u7406\u7b49\u591a\u79cd\u7814\u7a76\u9886\u57df\u3002<\/li>\n<li><strong>\u8be6\u7ec6\u6570\u636e\u5206\u5e03<\/strong>\uff1a\u63d0\u4f9b\u6570\u636e\u7684\u8be6\u7ec6\u5206\u7c7b\u548c\u6570\u91cf\u4fe1\u606f\u3002<\/li>\n<\/ul>\n<p>&nbsp;<\/p>\n<h2>\u4f7f\u7528\u5e2e\u52a9<\/h2>\n<h3>\u5b89\u88c5\u6d41\u7a0b<\/h3>\n<ol>\n<li>\u8bbf\u95eeHugging Face\u6216ModelScope\u5e73\u53f0\u3002<\/li>\n<li>\u641c\u7d22\u201cChinese-DeepSeek-R1-Distill-data-110k\u201d\u3002<\/li>\n<li>\u70b9\u51fb\u4e0b\u8f7d\u94fe\u63a5\uff0c\u9009\u62e9\u5408\u9002\u7684\u683c\u5f0f\u8fdb\u884c\u4e0b\u8f7d\u3002<\/li>\n<\/ol>\n<h3>\u4f7f\u7528\u65b9\u6cd5<\/h3>\n<ol>\n<li><strong>\u52a0\u8f7d\u6570\u636e\u96c6<\/strong>\uff1a\u5728Python\u73af\u5883\u4e2d\u4f7f\u7528<code>datasets<\/code>\u5e93\u52a0\u8f7d\u6570\u636e\u96c6\u3002<\/li>\n<\/ol>\n<pre><code>   from datasets import load_dataset\r\ndataset = load_dataset(\"Congliu\/Chinese-DeepSeek-R1-Distill-data-110k\")\r\n<\/code><\/pre>\n<ol 
start=\"2\">\n<li><strong>\u67e5\u770b\u6570\u636e<\/strong>\uff1a\u4f7f\u7528<code>dataset<\/code>\u5bf9\u8c61\u67e5\u770b\u6570\u636e\u96c6\u7684\u57fa\u672c\u4fe1\u606f\u548c\u6837\u672c\u3002<\/li>\n<\/ol>\n<pre><code>print(dataset)\r\nprint(dataset['train'][0])\r\n<\/code><\/pre>\n<ol start=\"3\">\n<li><strong>\u6570\u636e\u9884\u5904\u7406<\/strong>\uff1a\u6839\u636e\u7814\u7a76\u9700\u6c42\u5bf9\u6570\u636e\u8fdb\u884c\u9884\u5904\u7406\uff0c\u5982\u5206\u8bcd\u3001\u53bb\u505c\u7528\u8bcd\u7b49\u3002<\/li>\n<\/ol>\n<pre><code>from transformers import BertTokenizer\r\ntokenizer = BertTokenizer.from_pretrained('bert-base-chinese')\r\ntokenized_data = dataset.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True))\r\n<\/code><\/pre>\n<ol start=\"4\">\n<li><strong>\u6a21\u578b\u8bad\u7ec3<\/strong>\uff1a\u4f7f\u7528\u9884\u5904\u7406\u540e\u7684\u6570\u636e\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\u3002<\/li>\n<\/ol>\n<pre><code>from transformers import BertForSequenceClassification, Trainer, TrainingArguments\r\nmodel = BertForSequenceClassification.from_pretrained('bert-base-chinese')\r\ntraining_args = TrainingArguments(output_dir='.\/results', num_train_epochs=3, per_device_train_batch_size=16)\r\ntrainer = Trainer(model=model, args=training_args, train_dataset=tokenized_data['train'])\r\ntrainer.train()\r\n<\/code><\/pre>\n<h3>\u7279\u8272\u529f\u80fd\u64cd\u4f5c\u6d41\u7a0b<\/h3>\n<ol>\n<li><strong>\u6570\u5b66\u6570\u636e\u5904\u7406<\/strong>\uff1a\u9488\u5bf9\u6570\u5b66\u7c7b\u578b\u6570\u636e\uff0c\u589e\u52a0\u63d0\u793a\u8bcd\u201c\u8bf7\u4e00\u6b65\u6b65\u63a8\u7406\uff0c\u5e76\u628a\u6700\u7ec8\u7b54\u6848\u653e\u5230 \\boxed {}\u201d\u3002<\/li>\n<\/ol>\n<pre><code>def add_math_prompt(example):\r\n    example['text'] = \"\u8bf7\u4e00\u6b65\u6b65\u63a8\u7406\uff0c\u5e76\u628a\u6700\u7ec8\u7b54\u6848\u653e\u5230 \\\\boxed {}\u3002\" + example['text']\r\n    return example\r\nmath_data = dataset.filter(lambda x: x['category'] == 
'math').map(add_math_prompt)\r\n<\/code><\/pre>\n<ol start=\"2\">\n<li><strong>\u903b\u8f91\u63a8\u7406\u6570\u636e\u5904\u7406<\/strong>\uff1a\u5bf9\u903b\u8f91\u63a8\u7406\u6570\u636e\u8fdb\u884c\u7279\u6b8a\u5904\u7406\uff0c\u786e\u4fdd\u6570\u636e\u7684\u903b\u8f91\u6027\u548c\u4e00\u81f4\u6027\u3002<\/li>\n<\/ol>\n<pre><code>def process_logic_data(example):\r\n    # \u81ea\u5b9a\u4e49\u903b\u8f91\u5904\u7406\u4ee3\u7801\r\n    return example\r\nlogic_data = dataset.filter(lambda x: x['category'] == 'logic').map(process_logic_data)<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u4e2d\u6587DeepSeek-R1\u84b8\u998f\u6570\u636e\u96c6\u662f\u4e00\u4e2a\u5f00\u6e90\u7684\u4e2d\u6587\u6570\u636e\u96c6\uff0c\u5305\u542b110K\u6761\u6570\u636e\uff0c\u65e8\u5728\u652f\u6301\u673a\u5668\u5b66\u4e60\u548c\u81ea\u7136\u8bed\u8a00\u5904\u7406\u7814\u7a76\u3002\u8be5\u6570\u636e\u96c6\u7531\u5218\u806aNLP\u56e2\u961f\u53d1\u5e03\uff0c\u6570\u636e\u96c6\u4e0d\u4ec5\u5305\u542b\u6570\u5b66\u6570\u636e\uff0c\u8fd8\u5305\u62ec\u5927\u91cf\u7684\u901a\u7528\u7c7b\u578b\u6570\u636e\uff0c\u5982\u903b\u8f91\u63a8\u7406\u3001\u5c0f\u7ea2\u4e66\u3001\u77e5\u4e4e\u7b49\u3002\u6570\u636e\u96c6\u7684\u84b8\u998f\u8fc7\u7a0b&#8230;<\/p>\n","protected":false},"author":1,"featured_media":61875,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20],"tags":[230,365],"class_list":["post-21629","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-tool","tag-aikaiyuanxiangmu","tag-damoxingweidiao"],"_links":{"self":[{"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/posts\/21629","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/
v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/comments?post=21629"}],"version-history":[{"count":0,"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/posts\/21629\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/media\/61875"}],"wp:attachment":[{"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/media?parent=21629"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/categories?post=21629"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.kdjingpai.com\/de\/wp-json\/wp\/v2\/tags?post=21629"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}