[aligner] Speed up the alignset reduction (closes #116942)

author: Simon Chabot <simon.chabot@logilab.fr>
changeset: 33cc52731e55
branch: default
phase: public
hidden: no
parent revision: #7498d98cde7a [aligner] Enable the user to customize the equality_threshold (closes #116940)
child revision: #e5f1e678e654 [doc] Little explanation on alignall_iterative() (closes #116943)
files modified by this revision
aligner.py
# HG changeset patch
# User Simon Chabot <simon.chabot@logilab.fr>
# Date 1360922620 -3600
# Fri Feb 15 11:03:40 2013 +0100
# Node ID 33cc52731e552f1cac65f4266ed0a59ca5954c54
# Parent 7498d98cde7a3caeada81a0fe8c4c29e66fa3062
[aligner] Speed up the alignset reduction (closes #116942)

diff --git a/aligner.py b/aligner.py
@@ -330,25 +330,20 @@
1 
2      #Compute the number of iterations that must be done to achieve the alignement
3      nb_iterations = len(alignfiles) * len(targetfiles)
4      current_it = 0
5 
6 -    doneids = set([]) #Contains the id of perfectly aligned data
7 -    cache = cache or {} #Contains the better known alignments
8 +    cache = cache or {} #Contains the better known alignements
9 +    #Contains the id of perfectly aligned data
10 +    doneids = set(_id for _id, (_, dist) in cache.iteritems()
11 +                          if dist < equality_threshold)
12 
13      try:
14          for alignfile in alignfiles:
15 -            alignset = parsefile(osp.join(aligndir, alignfile), **alignformat)
16 +            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
17 +                        if a[0] not in doneids]
18              for targetfile in targetfiles:
19 -                if doneids: #If some alignements are already perfect,
20 -                            #don't redo them !
21 -                    tmp_align = []
22 -                    for a in alignset:
23 -                        if a[0] not in doneids:
24 -                            tmp_align.append(a)
25 -                    alignset = tmp_align
26 -
27                  targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
28                  matched = conquer_and_divide_alignment(alignset, targetset,
29                                                         threshold,
30                                                         treatments=treatments,
31                                                         indexes=indexes,
@@ -371,10 +366,21 @@
32 
33                  current_it += 1
34                  sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
35                                                  nb_iterations))
36                  sys.stdout.flush()
37 +                if doneids:
38 +                    alignset = [a for a in alignset if a[0] not in doneids]
39 +                if not alignset: #All items have been aligned
40 +                    #TODO Increment current_it.
41 +                    #The progress of the alignment process is computed with
42 +                    #`current_it`. If all items of `alignset` are aligned, we
43 +                    #stop the alignment process for this `alignset`. If
44 +                    #`current_it` isn’t incremented, the progress shown will be
45 +                    #false.
46 +                    break
47 +
48      finally:
49          rmtree(aligndir)
50          rmtree(targetdir)
51 
52      return cache