Provides Processings for all the distances, closes #248557

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: b5371dba546e
branch: default
phase: public
hidden: no
parent revision: #5689a4cc4915 [distance] Add safety belt on geographical distance units, closes #248555
child revision: #56114a122a56 [blocking] Add compatibility for older version of sklearn, closes #252733
files modified by this revision
test/test_distances.py
utils/distances.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1400850993 -7200
# Fri May 23 15:16:33 2014 +0200
# Node ID b5371dba546e664b53755068e41fc7e5da8c2ea7
# Parent 5689a4cc4915c773ce9542ad0149003a18db0e72
Provides Processings for all the distances, closes #248557

diff --git a/test/test_distances.py b/test/test_distances.py
@@ -26,11 +26,12 @@
1  from dateutil import parser as dateparser
2 
3  from nazca.utils.distances import (levenshtein, soundex, soundexcode,
4                                     difflib_match,
5                                     jaccard, euclidean, geographical,
6 -                                   LevenshteinProcessing, SoundexProcessing)
7 +                                   LevenshteinProcessing, SoundexProcessing,
8 +                                   JaccardProcessing, DifflibProcessing, TemporalProcessing)
9 
10 
11  class DistancesTest(unittest.TestCase):
12 
13      def test_difflib_match(self):
@@ -174,8 +175,39 @@
14          _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
15          pdist = processing.pdist(_input)
16          self.assertEqual([0, 1, 1], pdist)
17 
18 
19 +class JaccardTestCase(unittest.TestCase):
20 +
21 +    def test_pdist(self):
22 +        processing = JaccardProcessing()
23 +        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
24 +        pdist = processing.pdist(_input)
25 +        results = [0.666, 1, 0.666]
26 +        for ind, value in enumerate(pdist):
27 +            self.assertAlmostEqual(results[ind], value, 2)
28 +
29 +
30 +class DifflibTestCase(unittest.TestCase):
31 +
32 +    def test_pdist(self):
33 +        processing = DifflibProcessing()
34 +        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
35 +        pdist = processing.pdist(_input)
36 +        results = [0.099, 0.238, 0.14]
37 +        for ind, value in enumerate(pdist):
38 +            self.assertAlmostEqual(results[ind], value, 2)
39 +
40 +
41 +class TemporalTestCase(unittest.TestCase):
42 +
43 +    def test_pdist(self):
44 +        processing = TemporalProcessing()
45 +        _input = ['14 aout 1991', '08/14/1991', '08/15/1992']
46 +        pdist = processing.pdist(_input)
47 +        self.assertEqual([0., 367, 367], pdist)
48 +
49 +
50  if __name__ == '__main__':
51      unittest.main()
52 
diff --git a/utils/distances.py b/utils/distances.py
@@ -476,5 +476,47 @@
53          distance_callback = partial(soundex, language=language, tokenizer=tokenizer)
54          super(SoundexProcessing, self).__init__(ref_attr_index,
55                                                  target_attr_index,
56                                                  distance_callback,
57                                                  weight, matrix_normalized)
58 +
59 +
60 +class JaccardProcessing(BaseProcessing):
61 +    """ A processing based on the jaccard distance.
62 +    """
63 +
64 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
65 +                 tokenizer=None, weight=1, matrix_normalized=False):
66 +        distance_callback = partial(jaccard, tokenizer=tokenizer)
67 +        super(JaccardProcessing, self).__init__(ref_attr_index,
68 +                                                target_attr_index,
69 +                                                distance_callback,
70 +                                                weight, matrix_normalized)
71 +
72 +
73 +class DifflibProcessing(BaseProcessing):
74 +    """ A processing based on the difflib distance.
75 +    """
76 +
77 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
78 +                 weight=1, matrix_normalized=False):
79 +        super(DifflibProcessing, self).__init__(ref_attr_index,
80 +                                                target_attr_index,
81 +                                                difflib_match,
82 +                                                weight, matrix_normalized)
83 +
84 +
85 +class TemporalProcessing(BaseProcessing):
86 +    """ A processing based on the temporal distance.
87 +    """
88 +
89 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
90 +                 granularity=u'days', parserinfo=FrenchParserInfo,
91 +                 dayfirst=True, yearfirst=False,
92 +                 weight=1, matrix_normalized=False):
93 +        distance_callback = partial(temporal, granularity=granularity,
94 +                                    parserinfo=parserinfo,
95 +                                    dayfirst=dayfirst, yearfirst=yearfirst)
96 +        super(TemporalProcessing, self).__init__(ref_attr_index,
97 +                                                target_attr_index,
98 +                                                distance_callback,
99 +                                                weight, matrix_normalized)