def generate_seg_file(input_file, output_seg_file):
    """Segment the sentence on each line of input_file and write the result.

    Every non-blank input line is expected to look like
    ``label<TAB>content``. The content part is passed to ``jieba.cut``;
    each returned token is stripped of surrounding spaces, empty tokens are
    dropped, and the remaining tokens are joined with single spaces. The
    line ``label<TAB>joined tokens`` is then written to output_seg_file.

    Only the first tab on a line separates the label from the content, so
    any further tabs stay part of the content. Blank lines are skipped.

    Args:
        input_file: Path of the input file, read as UTF-8.
        output_seg_file: Path of the output file, written as UTF-8.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    # io.open decodes/encodes at the file boundary and behaves the same on
    # Python 2 and Python 3, so no per-line decode()/encode() is needed.
    import io

    # Read the whole input before the output is opened, so the function
    # still works when both paths refer to the same file.
    # newline='\n' makes '\n' the only line terminator; a preceding '\r'
    # is left on the line and removed by strip('\r\n') below.
    with io.open(input_file, 'r', encoding='utf-8', newline='\n') as f:
        lines = f.readlines()

    with io.open(output_seg_file, 'w', encoding='utf-8') as f:
        for line_no, line in enumerate(lines, 1):
            line = line.strip('\r\n')
            if not line:
                # Nothing to segment on a blank line.
                continue

            # maxsplit=1: tabs inside the content must not break unpacking.
            parts = line.split('\t', 1)
            if len(parts) != 2:
                raise ValueError(
                    'line %d of %s has no tab separating label and content'
                    % (line_no, input_file))
            label, content = parts

            stripped = (word.strip(' ') for word in jieba.cut(content))
            word_content = ' '.join(word for word in stripped if word)
            # u'' keeps the result a text string on Python 2 as well.
            f.write(u'%s\t%s\n' % (label, word_content))