How does Hive distribute the rows across the buckets? In general, the bucket number is determined by the expression hash_function(bucketing_column) mod num_buckets. (There's a '0x7FFFFFFF in there too, but that's not that important). The hash_function depends on the type of the bucketing column. For an int, it's easy, hash_int(i) == i. For example, if user_id were an int, and there were 10 buckets, we would expect all user_id's that end in 0 to be in bucket 1, all user_id's that end in a 1 to be in bucket 2, etc. For other datatypes, it's a little tricky. In particular, the hash of a BIGINT is not the same as the BIGINT. And the hash of a string or a complex datatype will be some number that's derived from the value, but not anything humanly-recognizable. For example, if user_id were a STRING, then the user_id's in bucket 1 would probably not end in 0. In general, distributing rows based on the hash will give you a even distribution in the buckets.
set hive.mapred.mode=unstrict; - to access partitioned table without where condition
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.enforce.bucketing = true;
DROP TABLE IF EXISTS temp_user;
CREATE TEMPORARY TABLE temp_user(
firstname VARCHAR(64),
lastname VARCHAR(64),
address STRING,
country VARCHAR(64),
city VARCHAR(64),
state VARCHAR(64),
post STRING,
phone1 VARCHAR(64),
phone2 STRING,
email STRING,
web STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
vi /root/data_hive.csv
Rebbecca,Didio,171 E 24th St,AU,Leith,TA,7315,03-8174-9123,0458-665-290,rebbecca.didio@didio.com.au,http://www.brandtjonathanfesq.com.au
Stevie,Hallo,22222 Acoma St,AU,Proston,QL,4613,07-9997-3366,0497-622-620,stevie.hallo@hotmail.com,http://www.landrumtemporaryservices.com.au
Mariko,Stayer,534 Schoenborn St #51,AU,Hamel,WA,6215,08-5558-9019,0427-885-282,mariko_stayer@hotmail.com,http://www.inabinetmacreesq.com.au
Gerardo,Woodka,69206 Jackson Ave,AU,Talmalmo,NS,2640,02-6044-4682,0443-795-912,gerardo_woodka@hotmail.com,http://www.morrisdowningsherred.com.au
Chun,Richrdson,3 Aiea Heights #660,CA,Regina,SK,S4T 3L1,306-245-2534,306-697-2337,chun_richrdson@richrdson.org,http://www.hoytrobertfesq.com
Lelia,Thiemann,440 Town Center Dr,CA,Kamloops,BC,V2B 7W6,250-671-3851,250-798-7786,lelia.thiemann@yahoo.com,http://www.kleemandenaaesq.com
Cordell,Zinda,91 Argyle Rd,CA,Sherbrooke,QC,J1H 6E3,819-508-6057,819-313-7350,cordell_zinda@cox.net,http://www.kayejeffreyaesq.com
Dorothy,Aitken,4 Hanover Pike,CA,Mississauga,ON,L5V 1E5,905-554-3838,905-355-9556,dorothy.aitken@cox.net,http://www.mcmillonwendyaesq.com
Nobuko,Halsey,8139 I Hwy 10 #92,US,New Bedford,MA,2745,508-855-9887,508-897-7916,nobuko.halsey@yahoo.com,http://www.goemanwoodproductsinc.com
Lavonna,Wolny,5 Cabot Rd,US,Mc Lean,VA,22102,703-483-1970,703-892-2914,lavonna.wolny@hotmail.com,http://www.linhareskennethaesq.com
Lashaunda,Lizama,3387 Ryan Dr,US,Hanover,MD,21076,410-678-2473,410-912-6032,llizama@cox.net,http://www.earnhardtprinting.com
Mariann,Bilden,3125 Packer Ave #9851,US,Austin,TX,78753,512-223-4791,512-742-1149,mariann.bilden@aol.com,http://www.hpgindustrysinc.com
LOAD DATA LOCAL INPATH '/root/data_hive.csv' INTO TABLE temp_user;
DROP TABLE IF EXISTS bucketed_user;
CREATE TABLE bucketed_user(
firstname VARCHAR(64),
lastname VARCHAR(64),
address STRING,
city VARCHAR(64),
state VARCHAR(64),
post STRING,
phone1 VARCHAR(64),
phone2 STRING,
email STRING,
web STRING
) COMMENT 'A bucketed sorted user table' PARTITIONED BY (country VARCHAR(64)) CLUSTERED BY (state) SORTED BY (city) INTO 32 BUCKETS STORED AS TEXTFILE;
set hive.enforce.bucketing = true;
INSERT OVERWRITE TABLE bucketed_user PARTITION (country) SELECT firstname ,lastname , address , city , state , post , phone1 , phone2 , email , web , country FROM temp_user;
set hive.mapred.mode=unstrict; - to access partitioned table without where condition
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.enforce.bucketing = true;
DROP TABLE IF EXISTS temp_user;
CREATE TEMPORARY TABLE temp_user(
firstname VARCHAR(64),
lastname VARCHAR(64),
address STRING,
country VARCHAR(64),
city VARCHAR(64),
state VARCHAR(64),
post STRING,
phone1 VARCHAR(64),
phone2 STRING,
email STRING,
web STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
vi /root/data_hive.csv
Rebbecca,Didio,171 E 24th St,AU,Leith,TA,7315,03-8174-9123,0458-665-290,rebbecca.didio@didio.com.au,http://www.brandtjonathanfesq.com.au
Stevie,Hallo,22222 Acoma St,AU,Proston,QL,4613,07-9997-3366,0497-622-620,stevie.hallo@hotmail.com,http://www.landrumtemporaryservices.com.au
Mariko,Stayer,534 Schoenborn St #51,AU,Hamel,WA,6215,08-5558-9019,0427-885-282,mariko_stayer@hotmail.com,http://www.inabinetmacreesq.com.au
Gerardo,Woodka,69206 Jackson Ave,AU,Talmalmo,NS,2640,02-6044-4682,0443-795-912,gerardo_woodka@hotmail.com,http://www.morrisdowningsherred.com.au
Chun,Richrdson,3 Aiea Heights #660,CA,Regina,SK,S4T 3L1,306-245-2534,306-697-2337,chun_richrdson@richrdson.org,http://www.hoytrobertfesq.com
Lelia,Thiemann,440 Town Center Dr,CA,Kamloops,BC,V2B 7W6,250-671-3851,250-798-7786,lelia.thiemann@yahoo.com,http://www.kleemandenaaesq.com
Cordell,Zinda,91 Argyle Rd,CA,Sherbrooke,QC,J1H 6E3,819-508-6057,819-313-7350,cordell_zinda@cox.net,http://www.kayejeffreyaesq.com
Dorothy,Aitken,4 Hanover Pike,CA,Mississauga,ON,L5V 1E5,905-554-3838,905-355-9556,dorothy.aitken@cox.net,http://www.mcmillonwendyaesq.com
Nobuko,Halsey,8139 I Hwy 10 #92,US,New Bedford,MA,2745,508-855-9887,508-897-7916,nobuko.halsey@yahoo.com,http://www.goemanwoodproductsinc.com
Lavonna,Wolny,5 Cabot Rd,US,Mc Lean,VA,22102,703-483-1970,703-892-2914,lavonna.wolny@hotmail.com,http://www.linhareskennethaesq.com
Lashaunda,Lizama,3387 Ryan Dr,US,Hanover,MD,21076,410-678-2473,410-912-6032,llizama@cox.net,http://www.earnhardtprinting.com
Mariann,Bilden,3125 Packer Ave #9851,US,Austin,TX,78753,512-223-4791,512-742-1149,mariann.bilden@aol.com,http://www.hpgindustrysinc.com
LOAD DATA LOCAL INPATH '/root/data_hive.csv' INTO TABLE temp_user;
DROP TABLE IF EXISTS bucketed_user;
CREATE TABLE bucketed_user(
firstname VARCHAR(64),
lastname VARCHAR(64),
address STRING,
city VARCHAR(64),
state VARCHAR(64),
post STRING,
phone1 VARCHAR(64),
phone2 STRING,
email STRING,
web STRING
) COMMENT 'A bucketed sorted user table' PARTITIONED BY (country VARCHAR(64)) CLUSTERED BY (state) SORTED BY (city) INTO 32 BUCKETS STORED AS TEXTFILE;
set hive.enforce.bucketing = true;
INSERT OVERWRITE TABLE bucketed_user PARTITION (country) SELECT firstname ,lastname , address , city , state , post , phone1 , phone2 , email , web , country FROM temp_user;
No comments:
Post a Comment