parallel processing - formatting large text files in python -


I have been stuck on a piece of code while trying to determine whether I should use parallel programming.

The code takes a text file with 2 columns: the first column contains a word, and the second a URL.

In string_stripper_function(), each row of the text file is formatted in a specific way (hence the calls to the replace() function).

Then I make a comparison between the first column and the second column: if the word in the first column is contained in the URL in the second column, the row is written to a new file (call it results.txt).

Moreover, if the word in the first column contains 4 capital letters and the URL in the second column has a number in it, I add the row to the same new file (results.txt).

Now, this works, and I have checked it multiple times, but it takes an exceedingly long time: a few hours for 1,000,000 rows on an i7 machine with 16 GB of RAM.

The file contains 1,923,014 lines (or rows, if you will); 97.9 MB.

So my question is: performance-wise, is there anything wrong with my code?

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: steve

Filter a two-column text file (word, URL). A row is kept when the
normalised word occurs in the URL, or when the normalised word has exactly
four capital letters and the URL's main domain contains a digit.
"""
import re
import multiprocessing as mp
import numpy as np

# (old, new) substitutions applied to every word, in exactly this order.
_WORD_SUBSTITUTIONS = (
    ('_', ' '),
    ('-', ' '),
    ('(', ''),
    (')', ''),
    # NOTE(review): the original script ran re.findall(".*?\((.*?)\)", word)
    # at this point and removed every captured group. Both parentheses have
    # already been stripped by then, so the pattern could never match and
    # the step is omitted. If the intent was to drop parenthesised text, it
    # must run *before* the two parenthesis substitutions above.
    ('the ', ''),
    (' ', ''),
    (', ', ''),
    (' ,the ', ''),
    (',the ', ''),
    ('...', ''),
    ('a ', ''),
    (' ', ''),
    (', ', ''),
    (' ,a ', ''),
    (',a ', ''),
    ('an ', ''),
    (' ', ''),
    (', ', ''),
    (' ,an ', ''),
    (',an ', ''),
    (',', ''),
)


def hasnumbers(inputstring):
    """Return True if at least one character of *inputstring* is a digit."""
    return any(char.isdigit() for char in inputstring)


def url_stripper(url):
    """Split *url* into its path components with the scheme removed.

    'http://', 'https://' and any remaining 'http' substring are deleted,
    then the result is split on '/' (and whitespace). Empty components are
    dropped, so the returned list may be empty; its first element is the
    main domain.
    """
    url = url.replace('http://', '')
    url = url.replace('https://', '')
    url = url.replace('http', '')
    url = url.replace('/', ' ')
    return url.split()


def _normalize_word(word):
    """Apply the fixed substitution chain to one first-column word."""
    for old, new in _WORD_SUBSTITUTIONS:
        word = word.replace(old, new)
    return word


def string_stripper_function(in_path="homepages.txt", out_path="results.txt"):
    """Copy the matching rows of *in_path* to *out_path*.

    Each input line is split on whitespace; the first field is the word and
    the second the URL. Lines with fewer than two fields are skipped. A row
    is written (once per distinct word, first occurrence wins) when the
    normalised word is a case-insensitive substring of the URL, or when it
    has exactly four upper-case letters and the main domain has a digit.

    Output lines have the form ``'<word> \\t <url>\\n'``.
    """
    kept_rows = []
    # A set gives O(1) duplicate checks; scanning a growing array and
    # re-allocating it on every append made the original run quadratic.
    seen_words = set()

    with open(in_path) as infile:
        for line in infile:
            fields = line.split()
            if len(fields) < 2:
                # Blank or single-column line: nothing to compare.
                continue
            word_original, url_original = fields[0], fields[1]

            url_parts = url_stripper(url_original)
            if not url_parts:
                print('lol no url fam')
                domain = ''
            else:
                domain = url_parts[0]

            if word_original in seen_words:
                continue

            word = _normalize_word(word_original)
            words = word.split()
            if not words:
                continue

            url_lower = url_original.lower()
            capital_letters = sum(1 for c in word if c.isupper())
            four_caps_and_digit = capital_letters == 4 and hasnumbers(domain)

            if four_caps_and_digit or any(w.lower() in url_lower
                                          for w in words):
                seen_words.add(word_original)
                kept_rows.append((word_original, url_original))

    with open(out_path, "w") as outfile:
        outfile.writelines('%s \t %s\n' % row for row in kept_rows)


if __name__ == "__main__":
    string_stripper_function()

Firstly, this question should have been asked on Code Review.

I have made a few changes to your code to take into account a few assumptions.

  1. str.replace() runs faster when it is run once on a single large string than when it is called while iterating line by line.
  2. In url_stripper I only look for "://", because I believe it only ever occurs at the beginning of a URL.
  3. Instead of using .replace("/"," ") and then .split(), it should be faster to use .split("/"). You can split on your own delimiter; see split in the docs.

I made a few other small changes. On testing, running a test of 500 pages 1000 times, my version took 0.054 secs, while your version took 0.133 secs. Here is the code:

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: steve
@edit: ironmanmark20

Filter a two-column text file (word, URL), normalising all words in one
pass over a single joined string instead of row by row.
"""
try:
    # NOTE(review): `timer` is not a standard-library module; presumably a
    # local benchmarking helper. Nothing below uses it, so its absence is
    # tolerated.
    import timer
except ImportError:
    timer = None
import re

# (old, new) substitutions applied to the joined word string, in this order.
_WORD_SUBSTITUTIONS = (
    ('_', ' '),
    ('-', ' '),
    ('(', ''),
    (')', ''),
    # NOTE(review): the original ran re.findall(".*?\((.*?)\)", word) here
    # and removed each captured group. The parentheses are already gone at
    # this point, so it could never match; on one huge joined string it only
    # cost time, so the step is omitted.
    ('the ', ''),
    (' ', ''),
    (', ', ''),
    (' ,the ', ''),
    (',the ', ''),
    ('...', ''),
    ('a ', ''),
    (' ', ''),
    (', ', ''),
    (' ,a ', ''),
    (',a ', ''),
    ('an ', ''),
    (' ', ''),
    (', ', ''),
    (' ,an ', ''),
    (',an ', ''),
    (',', ''),
)


def hasnumbers(inputstring):
    """Return True if at least one character of *inputstring* is a digit."""
    return any(char.isdigit() for char in inputstring)


def url_stripper(url):
    """Split *url* on '/' after removing everything up to the first '://'.

    The original sliced ``url[:index]``, which kept the scheme and threw the
    host away; the part *after* the separator is what is wanted. Empty
    components are dropped, so the result may be an empty list; its first
    element is the main domain.
    """
    _scheme, separator, remainder = url.partition("://")
    if not separator:
        # No scheme present: the whole string is host + path.
        remainder = url
    return [part for part in remainder.split("/") if part]


def mass_list(lines, delimiter='/'):
    """Normalise many words at once.

    *lines* is a string of words joined by *delimiter*. The substitution
    chain is applied to the whole string and the result is split on
    *delimiter* again, so the output has one entry per input word as long
    as *delimiter* neither occurs inside a word nor is touched by the
    substitutions ('_', '-', '(', ')', ' ', ',' and '.' are unsafe).
    """
    word = lines
    for old, new in _WORD_SUBSTITUTIONS:
        word = word.replace(old, new)
    return word.split(delimiter)


def string_stripper_function(in_path="./homepages.txt", out_path="results.txt"):
    """Copy the matching rows of *in_path* to *out_path*.

    Each input line is split on whitespace into word and URL; lines with
    fewer than two fields are skipped. A row is written (once per distinct
    word, first occurrence wins) when its normalised word is non-empty and
    either occurs case-insensitively in the URL, or has exactly four
    upper-case letters while the URL's main domain contains a digit (the
    same domain-based rule the question's script applies).

    Output lines have the form ``'<word> <url>\\n'``.
    """
    originals = []
    uris = []
    domains = []
    with open(in_path) as infile:
        for line in infile:
            fields = line.split()
            if len(fields) < 2:
                continue
            url_parts = url_stripper(fields[1])
            if not url_parts:
                print('lol no url fam')
            originals.append(fields[0])
            uris.append(fields[1])
            domains.append(url_parts[0] if url_parts else '')

    # Words come from str.split(), so they contain no whitespace; a newline
    # is therefore a delimiter that cannot collide with word content (unlike
    # '/', which may legitimately appear in a word).
    words = mass_list("\n".join(originals), "\n")

    kept_rows = []
    seen_words = set()
    # zip keeps word, original and URL aligned by position; no index lookups.
    for w, word_original, url_original, domain in zip(words, originals,
                                                      uris, domains):
        if not w or word_original in seen_words:
            # An empty string is a substring of everything; never match it.
            continue
        capital_letters = sum(1 for c in w if c.isupper())
        if (w.lower() in url_original.lower()
                or (capital_letters == 4 and hasnumbers(domain))):
            seen_words.add(word_original)
            kept_rows.append(word_original + " " + url_original + "\n")

    with open(out_path, "w") as outfile:
        outfile.writelines(kept_rows)


if __name__ == "__main__":
    string_stripper_function()

Comments

Popular posts from this blog

jquery - How do you format the date used in the popover widget title of FullCalendar? -

Bubble Sort Manually a Linked List in Java -

asp.net mvc - SSO between MVCForum and Umbraco7 -