[aligner] Add the alignall_iterative() function (closes #116932)

This function splits the files to align into smaller ones and runs the
alignment using a cache.
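
A minimal call sketch (the module alias and file names are illustrative; the
format dict and treatments mirror the test data added by this changeset):

    from nazca import aligner as alig  # import path may differ per install

    fmt = {'indexes': [0, 1, (2, 3)]}  # id, label, (lat, lon) columns
    treatments = {2: {'metric': 'geographical', 'matrix_normalized': False,
                      'metric_params': {'units': 'km', 'in_radians': False}}}

    # Returns a dict mapping each align id to its best (target id, distance)
    cache = alig.alignall_iterative('alignfile.csv', 'targetfile.csv',
                                    fmt, fmt, threshold=30,
                                    size=10000,  # lines per temporary chunk
                                    treatments=treatments, indexes=(2, 2),
                                    neighbours_threshold=0.3)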
***
[aligner] Better display of progression

author: Simon Chabot <simon.chabot@logilab.fr>
changeset: 4b6119e623cf
branch: default
phase: public
hidden: no
parent revision: #6afc3891e633 [dataio] Implements split_file() (closes #116931)
child revision: #f942f2393fb2 [aligner] Enable the user to reuse the cache returned by alignall_iterative() (closes #116938)
files modified by this revision:
aligner.py
test/data/alignfile.csv
test/data/targetfile.csv
test/test_alignment.py
# HG changeset patch
# User Simon Chabot <simon.chabot@logilab.fr>
# Date 1360854055 -3600
# Thu Feb 14 16:00:55 2013 +0100
# Node ID 4b6119e623cf80f4a82c1d4b4546f5d91f1ffbae
# Parent 6afc3891e63336ad8d8fe9fc9dae2f2c22648b4d
[aligner] Add the alignall_iterative() function (closes #116932)

This function splits the files to align into smaller ones and runs the
alignment using a cache.
***
[aligner] Better display of progression

diff --git a/aligner.py b/aligner.py
@@ -13,16 +13,21 @@
 # details.
 #
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
+from os import listdir
+import os.path as osp
+from shutil import rmtree
+from tempfile import mkdtemp
+import sys
 
 from scipy.spatial import KDTree
 from scipy.sparse import lil_matrix
 
 from nazca.minhashing import Minlsh
-from nazca.dataio import write_results
+from nazca.dataio import write_results, split_file, parsefile
 import nazca.matrix as m
 
 
 def normalize_set(rset, treatments):
     """ Apply all the normalization functions to the given rset """
@@ -290,5 +295,75 @@
                 yield alignset[alignid][0], targetset[targetid][0]
     else:
         for alignid in matched:
             bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
             yield alignset[alignid][0], targetset[bestid][0]
+
+def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
+                       threshold, size=10000, treatments=None, indexes=(1,1),
+                       mode='kdtree', neighbours_threshold=0.1, n_clusters=None,
+                       kwordsgram=1, siglen=200):
+
+    """ This function helps you to align *huge* files.
+        It takes your csv files as arguments, splits them into smaller ones
+        (files of `size` lines), and runs the alignment on those files.
+
+        `alignformat` and `targetformat` are keyword arguments given to the
+        nazca.dataio.parsefile function.
+    """
+
+    # Split the huge files into smaller ones
+    aligndir = mkdtemp()
+    targetdir = mkdtemp()
+    alignfiles = split_file(alignfile, aligndir, size)
+    targetfiles = split_file(targetfile, targetdir, size)
+
+    # Compute the number of iterations that must be done to achieve the alignment
+    nb_iterations = len(alignfiles) * len(targetfiles)
+    current_it = 0
+
+    doneids = set([]) # Contains the ids of perfectly aligned data
+    cache = {} # Contains the best known alignments
+
+    try:
+        for alignfile in alignfiles:
+            alignset = parsefile(osp.join(aligndir, alignfile), **alignformat)
+            for targetfile in targetfiles:
+                if doneids: # If some alignments are already perfect,
+                            # don't redo them!
+                    tmp_align = []
+                    for a in alignset:
+                        if a[0] not in doneids:
+                            tmp_align.append(a)
+                    alignset = tmp_align
+
+                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
+                matched = conquer_and_divide_alignment(alignset, targetset,
+                                                       threshold,
+                                                       treatments=treatments,
+                                                       indexes=indexes,
+                                                       mode=mode,
+                                                       neighbours_threshold=neighbours_threshold,
+                                                       n_clusters=n_clusters,
+                                                       kwordsgram=kwordsgram,
+                                                       siglen=siglen,
+                                                       get_global_mat=False)
+                for alignid in matched:
+                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
+                    # Get the best known distance
+                    _, current_dist = cache.get(alignset[alignid][0], (None, None))
+                    if not current_dist or current_dist > dist:
+                        # If it's better, update the cache
+                        cache[alignset[alignid][0]] = (targetset[bestid][0], dist)
+                        if dist <= 0.01:
+                            # If perfect, stop trying to align this one
+                            doneids.add(alignset[alignid][0])
+
+                current_it += 1
+                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
+                                                nb_iterations))
+                sys.stdout.flush()
+    finally:
+        rmtree(aligndir)
+        rmtree(targetdir)
+
+    return cache
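
The returned cache maps each align id to its best known (target id, distance)
pair, as exercised by the new test below. A small post-processing sketch
(hypothetical helper, not part of this changeset) that keeps only the matches
under an acceptance distance:

    def matches_below(cache, max_dist):
        """Return the (align id, target id) pairs whose distance is <= max_dist."""
        return set((alignid, targetid)
                   for alignid, (targetid, dist) in cache.iteritems()
                   if dist <= max_dist)

    # e.g. keep only pairs closer than 1 km with the geographical metric
    pairs = matches_below(cache, 1.0)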
diff --git a/test/data/alignfile.csv b/test/data/alignfile.csv
@@ -0,0 +1,4 @@
+V1	label1	6.14194444444	48.67
+V2	label2	6.2	49
+V3	label3	5.1	48
+V4	label4	5.2	48.1
diff --git a/test/data/targetfile.csv b/test/data/targetfile.csv
@@ -0,0 +1,3 @@
+T1	labelt1	6.17	48.7
+T2	labelt2	5.3	48.2
+T3	labelt3	6.25	48.91
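
The format dict used by the new test below, {'indexes': [0, 1, (2, 3)]}, is
passed straight to nazca.dataio.parsefile; assuming the (2, 3) tuple groups
those two columns into a single coordinate field (which the geographical
treatment on index 2 suggests), the first row of alignfile.csv would parse to
something like:

    ['V1', 'label1', (6.14194444444, 48.67)]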
diff --git a/test/test_alignment.py b/test/test_alignment.py
@@ -443,10 +443,33 @@
                                              uniq=False))
 
         self.assertEqual(predict_matched, all_matched)
         self.assertEqual(predict_uniq_matched, uniq_matched)
 
+    def test_alignall_iterative(self):
+        matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
+        treatments = {2: {'metric': 'geographical', 'matrix_normalized': False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+
+        _format = {'indexes': [0, 1, (2, 3)]}
+        alignements = alig.alignall_iterative(path.join(TESTDIR, 'data',
+                                                        'alignfile.csv'),
+                                              path.join(TESTDIR, 'data',
+                                                        'targetfile.csv'),
+                                              _format, _format, threshold=30,
+                                              size=2, # very small files ;)
+                                              treatments=treatments,
+                                              indexes=(2,2),
+                                              neighbours_threshold=0.3)
+
+        predict_matched = set([(a, t) for (a, (t, _)) in
+                               alignements.iteritems()])
+        self.assertEqual(predict_matched, matched)
+
+
+
+
 
 
 if __name__ == '__main__':
     unittest2.main()
 