{"id":13411,"date":"2024-08-01T15:24:08","date_gmt":"2024-08-01T07:24:08","guid":{"rendered":"https:\/\/nj.transwarp.cn:8180\/?p=13411"},"modified":"2025-12-17T17:20:47","modified_gmt":"2025-12-17T09:20:47","slug":"csvfile%e5%a4%a7%e6%96%87%e4%bb%b6%e5%85%a5%e5%ba%93%e9%97%ae%e9%a2%98","status":"publish","type":"post","link":"https:\/\/kbwp.transwarp.cn\/?p=13411","title":{"rendered":"csvfile\u5927\u6587\u4ef6\u5165\u5e93\u95ee\u9898"},"content":{"rendered":"<h3>\u6982\u8981\u63cf\u8ff0<\/h3>\n<p>CSV\u8868\u7684\u6570\u636e\u6765\u6e90\u662fCSV\u6587\u4ef6\u3002CSV\u6587\u4ef6\u662f\u7eaf\u6587\u672c\u6587\u4ef6\uff0c\u6587\u4ef6\u4e2d\u5305\u542b\u6570\u636e\u4ee5\u53ca\u5206\u9694\u7b26\u3002\u548cTEXT\u8868\u76f8\u4f3c\uff0cCSV\u8868\u6700\u5e38\u89c1\u7684\u4f7f\u7528\u573a\u666f\u662f\u7528\u4e8e\u5efa\u5916\u8868\uff0c\u5c06CSV\u6587\u4ef6\u4e2d\u7684\u6570\u636e\u5bfc\u5165ArgoDB\u3002\u5728\u8fc7\u53bb\uff0cCSV\u683c\u5f0f\u4e0d\u652f\u6301\u5728Map\u9636\u6bb5\u5bf9\u5927\u6587\u4ef6\u8fdb\u884c\u5207\u5272\uff0c\u5018\u82e5CSV\u5355\u6587\u4ef6\u8d85\u8fc71G\uff0c\u53ea\u80fd\u4f7f\u7528txt\u683c\u5f0f\u52a0\u4ee5\u5904\u7406\u3002\u5728\u65b0\u7248\u672c\u4e2d\uff0c\u5f15\u5165\u4e86\u4e00\u4e9b\u65b0\u529f\u80fd\u53ef\u4ee5\u63d0\u5347csv\u5efa\u5916\u8868\u7684\u6027\u80fd\u3002<\/p>\n<p>\u672c\u6587\u4e3b\u8981\u4ecb\u7ecd\u4e00\u4e9b\u52a0\u901f\u8bfb\u53d6csv\u6587\u4ef6\u7684\u65b9\u6cd5\u53ef\u4f9b\u53c2\u8003\u3002<\/p>\n<h3>\u8be6\u7ec6\u8bf4\u660e<\/h3>\n<p>\u6211\u4eec\u7b80\u5355\u8fd8\u539f\u4e0b\u95ee\u9898\u73b0\u8c61\uff0c\u901a\u8fc7<code>yes &#039;&quot;1&quot;,&quot;zhangsan&quot;,&quot;18&quot;&#039; | head -n 25000000 &gt; output.csv<\/code> \u6784\u9020\u4e00\u4e2a477m\u7684csvfile\u6587\u4ef6\uff0c\u901a\u8fc7hdfs\u547d\u4ee4put\u5230\/tmp\/0619\u76ee\u5f55\u4e0b\uff0c\u4f9b\u4e0b\u9762\u7684csvfile\u5916\u8868\u8bfb\u53d6\u3002<\/p>\n<pre><code class=\"language-sql\">CREATE external TABLE csvtest01(\nid INT ,name STRING,age 
INT)\nSTORED AS CSVFILE\nLOCATION '\/tmp\/0619'\nTBLPROPERTIES(\n'field.delim'=',',\n'quote.delim'='\"',\n'line.delim'='\\n') ;<\/code><\/pre>\n<p>\u6267\u884c <code>select count(*) from csvtest01<\/code> \u67e5\u8be2\uff0c\u89c2\u5bdfDBAService\u7684map\u9636\u6bb5\u53ef\u4ee5\u770b\u5230\u53ea\u8d77\u4e861\u4e2atask\uff0c\u6267\u884c\u65f6\u957f43s<\/p>\n<div style=\"box-shadow: 1px 1px 10px rgba(0,0,0,0.1); padding: 1px; display: inline-block; width: auto; margin-bottom: 10px;\">\n  <img decoding=\"async\" src=\"\/wp-content\/uploads\/2024\/06\/image-1718784076773.png\" style=\"display: block;\">\n<\/div>\n<h4>\u65b9\u6848\u4e00. \u4f7f\u7528TextInputFormat\u7684\u8f93\u5165\u683c\u5f0f<\/h4>\n<p><font color=red>\u4f18\u5148\u63a8\u8350\u8be5\u65b9\u6848\uff01\u6ce8\u610f\u8fd9\u91cc\u5e76\u4e0d\u662f\u7b80\u5355\u7684stored as textfile\u54e6~<\/font><\/p>\n<pre><code class=\"language-sql\">DROP TABLE IF EXISTS csv_test;\ncreate external table csv_test(\nid INT ,\nname STRING,\nage INT\n)ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.csv.serde.CSVSerde'\nSTORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'\nlocation \"\/tmp\/0619\";<\/code><\/pre>\n<p>\u6267\u884c <code>select count(*) from csv_test<\/code> 
\u67e5\u8be2\uff0c\u89c2\u5bdfDBAService\u7684map\u9636\u6bb5\u53ef\u4ee5\u770b\u5230\u8d77\u4e864\u4e2atask\uff0c\u6267\u884c\u65f6\u957f14s<\/p>\n<p>TextInputFormat\u548cTextOutputFormat\u6307\u6587\u4ef6\u88ab\u521b\u5efa\u4e3a\u4e00\u4e2atext\u8868\uff0c\u8be5\u53c2\u6570\u4e0e\u666e\u901a\u7684text\u8868\u662f\u76f8\u540c\u7684\u3002\u800cCSVSerDe\u5219\u6307\u5b9a\u4e86\u4e00\u79cd\u5e8f\u5217\u5316\uff08Serializer\uff09\u548c\u53cd\u5e8f\u5217\u5316\uff08Deserializer\uff09\u7684\u65b9\u5f0f\u3002\u6b64\u65f6\uff0c\u8868\u683c\u4ecd\u7136\u80fd\u591f\u4ee5csv\u7684\u6837\u5f0f\u88ab\u6b63\u786e\u8bfb\u51fa\uff0c\u5018\u82e5\u521b\u5efa\u666e\u901a\u7684text\u8868\u5219\u65e0\u6cd5\u8bfb\u51fa\u3002\u5728\u5982\u6b64\u521b\u5efa\u8868\u683c\u65f6\uff0c\u4ecd\u7136\u53ef\u4ee5\u5728tblproperties\u4e2d\u6307\u5b9afield.delim\u3001quote.delim\u548cline.delim\u8fd9\u4e09\u4e2a\u53c2\u6570\u3002<\/p>\n<blockquote>\n<p>SERDE\u662f&quot;Serializer\/Deserializer&quot;\u7684\u7f29\u5199\uff0c\u7528\u4e8e\u6307\u5b9a\u5982\u4f55\u5e8f\u5217\u5316\u548c\u53cd\u5e8f\u5217\u5316Hive\u8868\u4e2d\u7684\u6570\u636e\u3002\u5728\u8fd9\u4e2a\u4f8b\u5b50\u4e2d\uff0c\u4f7f\u7528\u4e86org.apache.hadoop.hive.ql.io.csv.serde.CSVSerde\uff0c\u8fd9\u662fQuark\u63d0\u4f9b\u7684\u4e00\u4e2a\u7528\u4e8e\u5904\u7406CSV\u6587\u4ef6\u7684SerDe\u3002<\/p>\n<p>INPUTFORMAT\u548cOUTPUTFORMAT\u6307\u5b9a\u4e86Quark\u5728\u8bfb\u53d6\u548c\u5199\u5165\u8868\u6570\u636e\u65f6\u5e94\u4f7f\u7528\u7684Hadoop\u8f93\u5165\u548c\u8f93\u51fa\u683c\u5f0f\u3002<br \/>\nINPUTFORMAT &#8216;org.apache.hadoop.mapred.TextInputFormat&#8217;: \u6307\u5b9aHive\u5728\u8bfb\u53d6\u6570\u636e\u65f6\u5e94\u8be5\u4f7f\u7528Hadoop\u7684TextInputFormat\u3002\u8fd9\u662fHadoop\u9ed8\u8ba4\u7684\u6587\u4ef6\u8bfb\u53d6\u683c\u5f0f\uff0c\u7528\u4e8e\u8bfb\u53d6\u6587\u672c\u6587\u4ef6\u3002<br \/>\nOUTPUTFORMAT &#8216;org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat&#8217;: 
\u5f53Quark\u9700\u8981\u5199\u5165\u6570\u636e\u65f6\uff08\u5c3d\u7ba1\u5728\u8fd9\u4e2a\u5916\u90e8\u8868\u7684\u5b9a\u4e49\u4e2d\uff0c\u5b83\u5b9e\u9645\u4e0a\u4e0d\u4f1a\u76f4\u63a5\u5199\u5165\u6570\u636e\uff0c\u56e0\u4e3a\u8fd9\u662f\u4e00\u4e2a\u5916\u90e8\u8868\uff09\uff0c\u5b83\u4f1a\u4f7f\u7528HiveIgnoreKeyTextOutputFormat\u3002\u8fd9\u4e2aOutputFormat\u662fQuark\u7279\u6709\u7684\uff0c\u7528\u4e8e\u5199\u5165\u6587\u672c\u6587\u4ef6\uff0c\u4f46\u5ffd\u7565\u952e\uff08key\uff09\u90e8\u5206\uff08\u5728MapReduce\u4e2d\uff0c\u901a\u5e38\u4f1a\u6709\u952e\u548c\u503c\u4e24\u90e8\u5206\uff09\u3002<\/p>\n<\/blockquote>\n<h4>\u65b9\u6848\u4e8c. csv\u5207\u5206\u529f\u80fd<\/h4>\n<p>\u8be5\u529f\u80fd\u9700\u8981argodb5.x\u4ee5\u4e0a\u7248\u672c\u624d\u80fd\u591f\u652f\u6301\uff0c\u4f7f\u7528\u8d77\u6765\u6709\u4e00\u5b9a\u9650\u5236\uff0c\u8bfb\u53d6\u540e\u9700\u6821\u9a8c\u6570\u636e\u4e00\u81f4\u6027\uff01<\/p>\n<p><strong>\u4e00\u79cd\u65b9\u5f0f\u662f\u76f4\u63a5\u8bbe\u7f6e\u53c2\u6570 \uff08\u63a8\u8350\uff09\uff0c<\/strong><\/p>\n<pre><code class=\"language-sql\">set hive.conf.validation=false;\nset inceptor.conf.validation=false;\nset reserve.first.row=true;      --\u662f\u5426\u4fdd\u7559\u9996\u884c\uff0ctrue\u4e3a\u4fdd\u7559\nset csv_one_file_per_map=false;  --\u7ecf\u6d4b\u8bd5\uff0c\u8be5\u53c2\u6570\u8d77\u4f5c\u7528\uff0c\u53ef\u7075\u6d3b\u914d\u7f6e\nset csv_block_len_mb_per_map=128;  --\u6bcf128MB\u8d77\u4e00\u4e2amap task (\u7ecf\u6d4b\u8bd5\uff0c\u8be5\u53c2\u6570\u8d77\u4f5c\u7528\uff0c\u9ed8\u8ba4\u53d6128m blocksize\u7684\u5927\u5c0f)\n\n--\u518d\u76f4\u63a5\u67e5\u8be2csvfile\u8868\nselect count(*) from csvtest01;<\/code><\/pre>\n<p><strong>\u53e6\u5916\u4e00\u79cd\u65b9\u5f0f\u662f\u5199\u5230csvfile\u7684tblproperties\u91cc\u9762\u53bb<\/strong><\/p>\n<pre><code class=\"language-sql\">DROP TABLE IF EXISTS csv_mapsplit;\ncreate external table csv_mapsplit\n(id INT ,name STRING,age INT)\nstored as csvfile\nlocation 
\"\/tmp\/0619\"\ntblproperties(\n\"reserve.first.row\"=\"true\",          --\u662f\u5426\u4fdd\u7559\u9996\u884c\n\"csv_one_file_per_map\"=\"false\",      --\u662f\u5426\u6bcf\u4e2a\u6587\u4ef6\u5355map task\u5904\u7406\n\"csv_block_len_mb_per_map\"=\"128\",     --\u6bcf128MB\u8d77\u4e00\u4e2amap task (\u7ecf\u6d4b\u8bd5\uff0c\u8be5\u53c2\u6570\u4e0d\u8d77\u4f5c\u7528\uff0c\u9ed8\u8ba4\u53d6128m blocksize\u7684\u5927\u5c0f)\n'mapreduce.csvinput.encoding'='gbk', --\u6307\u5b9a\u4f7f\u7528gbk\u7f16\u7801\u8bfb\u53d6\u6570\u636e\u3002\u4e0etext\u8868\u53c2\u6570\u4e0d\u540c\n'field.delim'=',',     --\u5b57\u6bb5\u5206\u9694\u7b26\uff0c\u5c5e\u6027\u7684\u503c\u6307\u5b9a\u5b57\u6bb5\u5206\u9694\u7b26\uff0c\u9ed8\u8ba4\u503c\u4e3a \u201c,\u201d\n'quote.delim'='\"',     --\u5c5e\u6027\u7684\u503c\u6307\u5b9a\u7528\u4ec0\u4e48\u5b57\u7b26\u4f5c\u4e3a\u5355\u4e2a\u5b57\u6bb5\u7684\u5f15\u7528\u7b26\uff0c\u9ed8\u8ba4\u503c\u4e3a\u201c\"\u201d\u3002\n'line.delim'='\\n'      --\u5c5e\u6027\u7684\u503c\u6307\u5b9a\u884c\u5206\u9694\u7b26\uff0c\u9ed8\u8ba4\u503c\u4e3a\u201c\\n\u201d\u3002\n);<\/code><\/pre>\n<p>\u6267\u884c <code>select count(*) from csv_mapsplit<\/code> \u67e5\u8be2\uff0c\u89c2\u5bdfDBAService\u7684map\u9636\u6bb5\u53ef\u4ee5\u770b\u5230\u8d77\u4e864\u4e2atask\uff0c\u6267\u884c\u65f6\u957f14s<\/p>\n<p>\u6ce8\u610f\uff1a<\/p>\n<ul>\n<li>\u5bfc\u5165\u4e0d\u5e26\u5934\u90e8\u7684csv\u5b58\u5728\u95ee\u9898\uff0c\u5982\u679c\u6570\u636e\u6e90\u6ca1\u6709csv\u5934\uff0c\u8bf7\u4f7f\u7528txt\u683c\u5f0f\u5bfc\u5165<\/li>\n<li>\u82e5CSV\u5916\u8868\u662f\u7528\u7684\u538b\u7f29\u540e\u7684CSV\u5efa\u7684\uff0c\u5219\u4f1a\u81ea\u52a8\u9000\u5316\u6210\u5355\u6587\u4ef6\u5355map\uff0c\u4e0d\u4f1a\u542f\u7528block 
split\u4f18\u5316<\/li>\n<li>\u7528\u6237\u5fc5\u987b\u81ea\u884c\u4fdd\u8bc1csv\u6570\u636e\u4e2d\uff0crecord\u5185\u90e8\u4e0d\u80fd\u6709lineSeparator\uff0c\u5373\u4f7f\u662fquote\u4e2d\u4e5f\u4e0d\u884c\uff1b\u5426\u5219\u53ef\u80fd\u5bfc\u81f4\u6570\u636e\u89e3\u6790\u9519\u8bef\uff08\u5e76\u4e14\u65e0\u6cd5\u62a5\u9519\uff09\u2014\u2014\u6700\u597d\u4ec5\u5728POC\u4e2d\/\u6570\u636e\u60c5\u51b5\u5b8c\u5168\u638c\u63e1\u7684\u60c5\u51b5\u4e0b\u4f7f\u7528\u672c\u4f18\u5316<\/li>\n<\/ul>\n<h4>\u65b9\u6848\u4e09. shell\u5207\u5206csv\u6587\u4ef6\u5206\u6279\u8bfb\u53d6<\/h4>\n<p><font color=red>\u4e00\u822c\u4e0d\u4f1a\u8003\u8651\u8fd9\u4e2a\u65b9\u6848\uff0c\u4ec5\u4f9b\u53c2\u8003<\/font><\/p>\n<p><strong>\u4ee5\u4e00\u4e2a10T\u7684\u6587\u4ef6\u4e3a\u4f8b\uff1a<\/strong><\/p>\n<p>a) \u5c061T\u6587\u4ef6\u89e3\u538b\u621010T<\/p>\n<p>b) \u901a\u8fc7<code>split -C 209715200 -d ..\/aaaa.csv outf -a 5<\/code>\uff08\u4f7f\u7528-C\u6309\u6574\u884c\u5207\u5206\uff0c\u907f\u514d\u628a\u4e00\u884c\u8bb0\u5f55\u622a\u65ad\u5230\u4e24\u4e2a\u6587\u4ef6\u4e2d\uff09\uff0c10T\/200m ~ 52428.8\uff0c\u731c\u6d4b\u4f1a\u5207\u5206\u62105w+\u4e2a\u6570\u636e\u6587\u4ef6\uff0c\u6587\u4ef6\u540d\u5e94\u8be5\u662foutf00000~outf52428<\/p>\n<p>c) \u521b\u5efa6\u4e2ahdfs\u76ee\u5f55\uff0c<\/p>\n<pre><code class=\"language-shell\">for i in {00..05}; do hadoop fs -mkdir -p \/tmp\/csvf$i; done\n<\/code><\/pre>\n<p>d) \u5c065w+\u6587\u4ef6\u5206\u6279\u653e\u7f6e\u5230\u8fd96\u4e2a\u76ee\u5f55\u4e0b<\/p>\n<pre><code class=\"language-shell\">hadoop fs -put .\/outf0* \/tmp\/csvf00 \nhadoop fs -put .\/outf1* \/tmp\/csvf01\nhadoop fs -put .\/outf2* \/tmp\/csvf02\nhadoop fs -put .\/outf3* \/tmp\/csvf03\nhadoop fs -put .\/outf4* \/tmp\/csvf04\nhadoop fs -put .\/outf5* \/tmp\/csvf05<\/code><\/pre>\n<p>e) \u521b\u5efa6\u5f20\u8868\uff0c\u8bfb\u53d6csvfile\u6587\u4ef6<\/p>\n<pre><code class=\"language-sql\">CREATE external TABLE csv00\n(id INT ,name STRING,age INT)\nSTORED AS CSVFILE\n  LOCATION '\/tmp\/csvf00'\nTBLPROPERTIES(\n'field.delim'=',',\n'quote.delim'='\"',\n'line.delim'='\\n') 
;<\/code><\/pre>\n<p>&#8230;\u5269\u4f595\u5f20\u4fee\u6539\u4e0b\u8868\u540d\u548chdfs\u8def\u5f84\u5373\u53ef\uff0c\u4ee5\u6b64\u7c7b\u63a8<\/p>\n<p>f) \u5c1d\u8bd5<code>select count(*)<\/code>\u770b\u80fd\u5426\u6b63\u5e38\u67e5\u8be2\u8fd9\u51e0\u5f20\u8868<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6982\u8981\u63cf\u8ff0 CSV\u8868\u7684\u6570\u636e\u6765\u6e90\u662fCSV\u6587\u4ef6\u3002CSV\u6587\u4ef6\u662f\u7eaf\u6587\u672c\u6587\u4ef6\uff0c\u6587\u4ef6\u4e2d\u5305\u542b\u6570\u636e\u4ee5\u53ca\u5206\u9694\u7b26\u3002\u548cTEXT\u8868\u76f8\u4f3c\uff0c ..<\/p>\n<div class=\"clear-fix\"><\/div>\n<p><a href=\"https:\/\/kbwp.transwarp.cn\/?p=13411\" title=\"read more...\">Read more<\/a><\/p>\n","protected":false},"author":12,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[38],"tags":[],"class_list":["post-13411","post","type-post","status-publish","format-standard","hentry","category-configuration"],"acf":[],"_links":{"self":[{"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=\/wp\/v2\/posts\/13411","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=\/wp\/v2\/users\/12"}],"replies":[{"embeddable":true,"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=13411"}],"version-history":[{"count":5,"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=\/wp\/v2\/posts\/13411\/revisions"}],"predecessor-version":[{"id":17815,"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=\/wp\/v2\/posts\/13411\/revisions\/17815"}],"wp:attachment":[{"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=13411"}],"wp:term":[{"taxonomy":"catego
ry","embeddable":true,"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=13411"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/kbwp.transwarp.cn\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=13411"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}