Parallel processing - formatting large text files in Python
I have been stuck on this piece of code for a while, trying to determine if I should use parallel programming.
The code takes a text file with 2 columns: the first column contains a word, and the second a URL.
In string_stripper_function() each row of the text file is formatted in a specific way (hence all the calls to the replace() function).
Then I make a comparison between the first column and the second column: if the word in the first column is contained in the URL in the second column, the row is written to a new file (call it result.txt).
Moreover, if the word in the first column contains 4 capital letters and the URL in the second column has a number, the row is added to the same new file (result.txt).
Now, this works, and I have checked it multiple times, but it takes an exceedingly long time: a few hours for 1,000,000 rows on an i7 machine with 16 GB of RAM.
The file contains 1923014 lines (or rows, if you will); 97.9 MB.
So my question is: performance-wise, is there something wrong with my code?
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: steve
"""

import re
import multiprocessing as mp  # imported by the original script; not used below
import numpy as np  # imported by the original script; not used below

# Captures the text inside "(...)" groups.
_PARENTHESISED = re.compile(r".*?\((.*?)\)")

# Substrings removed from the word column, applied one after another in
# exactly this order (the order matters: earlier removals change what the
# later patterns can match).
_FILLERS = (
    'the ', ' the ', ', the ', ' ,the ', ',the ',
    '...',
    'a ', ' a ', ', a ', ' ,a ', ',a ',
    'an ', ' an ', ', an ', ' ,an ', ',an ',
    ',',
)


def hasnumbers(inputstring):
    """Return True if at least one character of ``inputstring`` is a digit."""
    return any(char.isdigit() for char in inputstring)


# this code strips urls to the main domain
def url_stripper(url):
    """Split ``url`` into its path components with the scheme removed.

    The first element of the returned list is the host part.  The list is
    empty when nothing is left after removing the scheme (e.g. ``"http://"``).
    """
    url = url.replace('http://', '')
    url = url.replace('https://', '')
    url = url.replace('http', '')
    url = url.replace('/', ' ')
    return url.split()


def _normalise_word(word_original):
    """Return the first-column word with separators and filler words removed."""
    word = word_original.replace('_', ' ').replace('-', ' ')
    word = word.replace('(', '').replace(')', '')
    # NOTE(review): the parentheses were already removed on the line above, so
    # this pattern can never match and the loop is a no-op.  It is kept so the
    # output stays what the author verified; if the intent was to drop
    # parenthesised text, this has to run before the parentheses are stripped.
    for inner in _PARENTHESISED.findall(word):
        word = word.replace(inner, '')
    for filler in _FILLERS:
        word = word.replace(filler, '')
    return word


def string_stripper_function():
    """Filter ``homepages.txt`` into ``results.txt``.

    Each input line holds a word and a URL separated by whitespace.  A row is
    written to the output (once per distinct word, in input order) when

    * any part of the normalised word occurs in the URL (case-insensitive), or
    * the normalised word has exactly 4 capital letters and the URL's host
      part contains a digit.

    Lines with fewer than two columns are skipped.
    """
    # Accepted rows live in a plain list and a set.  Growing a NumPy array
    # with np.append copies the whole array on every call, and
    # `word not in array` scans it, which made this loop quadratic in the
    # number of rows.
    accepted = []
    seen = set()

    with open("homepages.txt") as infile:
        for line in infile:
            columns = line.split()
            if len(columns) < 2:
                continue  # blank or malformed line
            word_original, url_original = columns[0], columns[1]

            url_parts = url_stripper(url_original)
            if url_parts:
                domain = url_parts[0]
            else:
                print('lol no url fam')
                domain = ''

            if word_original in seen:
                continue

            word = _normalise_word(word_original)
            words = word.split()
            if not words:
                continue

            # condition 2&3
            url_lower = url_original.lower()
            word_in_url = any(w.lower() in url_lower for w in words)
            capital_letters = sum(1 for c in word if c.isupper())
            if word_in_url or (capital_letters == 4 and hasnumbers(domain)):
                seen.add(word_original)
                accepted.append((word_original, url_original))

    with open("results.txt", "w") as outfile:
        for word_original, url_original in accepted:
            outfile.write('%s \t %s\n' % (word_original, url_original))


if __name__ == "__main__":
    string_stripper_function()
Firstly, this question should be asked on Code Review.
I have made a few changes to your code that take into account a few assumptions.
- `str.replace()` runs faster when it is run once on a single large string than when iterating line by line.
- In `url_stripper` I only look for `://`, because I believe the beginning is the only place it happens.
- Instead of using `.replace("/", " ")` and then `.split()`, it should be faster to use `.split("/")`. You can split on your own delimiter. See `split` in the docs.
I made a few other small changes. On testing, running a test file of 500 pages 1000 times, my version took 0.054 secs, while your version took 0.133 secs. Here is the code:
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: steve
@edit: ironmanmark20
"""

try:
    # NOTE(review): `timer` is not a standard-library module and nothing below
    # uses it -- presumably a local benchmarking helper.  The import is
    # guarded so the script still runs without it.
    import timer
except ImportError:
    timer = None
import re

# Captures the text inside "(...)" groups.
_PARENTHESISED = re.compile(r".*?\((.*?)\)")

# Substrings removed from the words, applied one after another in exactly
# this order (earlier removals change what the later patterns can match).
_FILLERS = (
    'the ', ' the ', ', the ', ' ,the ', ',the ',
    '...',
    'a ', ' a ', ', a ', ' ,a ', ',a ',
    'an ', ' an ', ', an ', ' ,an ', ',an ',
    ',',
)


def hasnumbers(inputstring):
    """Return True if at least one character of ``inputstring`` is a digit."""
    return any(char.isdigit() for char in inputstring)


# this code strips urls to the main domain
def url_stripper(url):
    """Split ``url`` on "/" after removing a leading ``scheme://``.

    The first element of the returned list is the host part.  ``str.split``
    with an explicit separator always returns at least one element, so the
    result is never empty.
    """
    scheme, separator, rest = url.partition("://")
    # Only treat "://" as a scheme marker when it comes before any "/", so a
    # URL embedded in the path or query string is left alone.
    if separator and "/" not in scheme:
        url = rest
    return url.split("/")  # you can set the splitter


def mass_list(lines, delimiter="/"):
    """Normalise many words at once and return them as a list.

    ``lines`` is all words joined by ``delimiter``.  The replacements run
    once over the whole string instead of once per word; the result is then
    split on ``delimiter`` again.  None of the removed patterns contains "/"
    or a newline, so with either of those delimiters the number of items is
    preserved (a trailing delimiter yields a trailing empty string).
    """
    word = lines.replace('_', ' ').replace('-', ' ')
    word = word.replace('(', '').replace(')', '')
    # NOTE(review): the parentheses were already removed on the line above, so
    # this pattern can never match and the loop is a no-op; kept to mirror the
    # script in the question.
    for inner in _PARENTHESISED.findall(word):
        word = word.replace(inner, '')
    for filler in _FILLERS:
        word = word.replace(filler, '')
    return word.split(delimiter)  # split on an arbitrary delimiter


def string_stripper_function():
    """Filter ``./homepages.txt`` into ``results.txt``.

    Each input line holds a word and a URL separated by whitespace.  A row is
    written to the output (once per distinct word, in input order) when

    * any part of the normalised word occurs in the URL (case-insensitive), or
    * the normalised word has exactly 4 capital letters and the URL's host
      part contains a digit.

    Lines with fewer than two columns are skipped.
    """
    originals = []
    uris = []
    with open("./homepages.txt") as infile:
        for line in infile:
            columns = line.split()
            if len(columns) < 2:
                continue  # blank or malformed line
            originals.append(columns[0])
            uris.append(columns[1])

    # Normalise every word in one pass.  Newline is used as the delimiter
    # because tokens produced by str.split() never contain whitespace, so it
    # cannot collide with the data the way "/" could.
    cleaned = mass_list("\n".join(originals), "\n") if originals else []

    accepted = []
    seen = set()  # O(1) duplicate check instead of scanning the result list
    for word_original, url_original, word in zip(originals, uris, cleaned):
        if word_original in seen:
            continue
        words = word.split()
        if not words:
            continue
        url_lower = url_original.lower()
        word_in_url = any(w.lower() in url_lower for w in words)
        capital_letters = sum(1 for c in word if c.isupper())
        domain = url_stripper(url_original)[0]
        if word_in_url or (capital_letters == 4 and hasnumbers(domain)):
            seen.add(word_original)
            accepted.append((word_original, url_original))

    with open("results.txt", "w") as outfile:
        for word_original, url_original in accepted:
            outfile.write(word_original + " " + url_original + "\n")


if __name__ == "__main__":
    string_stripper_function()
Comments
Post a Comment