Python - write a compressed log file into HDFS for Hadoop Hive MapReduce
import pyhdfs
import gzip
from cStringIO import StringIO
import binascii
-snip-
#Set hdfs connection info
hdfsaddress = "namenode"
hdfsport = 12345
hdfsfn = "filename"
#gzip compression level (1 = fastest, 9 = smallest output)
clevel = 1
-snip-
logger.info("Writing compressed data into " + hdfsfn + ".gz")
#open hdfs file ("hdfs" is the connection handle set up in the snipped code above)
fout = pyhdfs.open(hdfs, hdfsfn + ".gz", "w")
#compress the data and store it in compressed_data
buf = StringIO()
f = gzip.GzipFile(mode='wb', compresslevel=clevel, fileobj=buf)
try:
    f.write(concatlog)
finally:
    f.close()
compressed_data = buf.getvalue()
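#optional sanity check (illustrative): decompress the in-memory buffer and
#make sure it round-trips back to the original log before shipping it out
check = gzip.GzipFile(mode='rb', fileobj=StringIO(compressed_data))
assert check.read() == concatlog
check.close()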
#write compressed data into hdfs
pyhdfs.write(hdfs, fout, compressed_data)
#close hdfs file
pyhdfs.close(hdfs, fout)
logger.info("Writing task finished")
-snip-
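For reference, the snippet above is Python 2 era (cStringIO exists only on
Python 2). On Python 3 the in-memory compression step collapses to a stdlib
one-liner; the sketch below shows just the gzip part, with placeholder data,
leaving the HDFS write to whichever client you use:

import gzip

#compress a byte string in memory; compresslevel=1 favors speed over ratio,
#matching the clevel = 1 choice above
compressed_data = gzip.compress(b"log line 1\nlog line 2\n", compresslevel=1)

#round-trip check: gzip.decompress restores the original bytes
assert gzip.decompress(compressed_data) == b"log line 1\nlog line 2\n"

Once the .gz file is in HDFS, Hive and MapReduce read it transparently:
TextInputFormat recognizes the .gz extension and decompresses on the fly.
Note that gzip is not a splittable codec, so each .gz file is handled by a
single mapper; keep individual files reasonably sized.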