# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1400850993 -7200
# Fri May 23 15:16:33 2014 +0200
# Node ID b5371dba546e664b53755068e41fc7e5da8c2ea7
# Parent 5689a4cc4915c773ce9542ad0149003a18db0e72
Provides Processings for all the distances, closes #248557
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1400850993 -7200
# Fri May 23 15:16:33 2014 +0200
# Node ID b5371dba546e664b53755068e41fc7e5da8c2ea7
# Parent 5689a4cc4915c773ce9542ad0149003a18db0e72
Provides Processings for all the distances, closes #248557
@@ -26,11 +26,12 @@
1 from dateutil import parser as dateparser 2 3 from nazca.utils.distances import (levenshtein, soundex, soundexcode, 4 difflib_match, 5 jaccard, euclidean, geographical, 6 - LevenshteinProcessing, SoundexProcessing) 7 + LevenshteinProcessing, SoundexProcessing, 8 + JaccardProcessing, DifflibProcessing, TemporalProcessing) 9 10 11 class DistancesTest(unittest.TestCase): 12 13 def test_difflib_match(self):
@@ -174,8 +175,50 @@
         _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
         pdist = processing.pdist(_input)
         self.assertEqual([0, 1, 1], pdist)
 
 
+class JaccardTestCase(unittest.TestCase):
+    """ Check the pairwise distances computed by JaccardProcessing. """
+
+    def test_pdist(self):
+        processing = JaccardProcessing()
+        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
+        pdist = processing.pdist(_input)
+        results = [0.666, 1, 0.666]
+        # Guard against a short or empty pdist: the loop below would
+        # otherwise pass without checking every expected value.
+        self.assertEqual(len(results), len(pdist))
+        for ind, value in enumerate(pdist):
+            self.assertAlmostEqual(results[ind], value, 2)
+
+
+class DifflibTestCase(unittest.TestCase):
+    """ Check the pairwise distances computed by DifflibProcessing. """
+
+    def test_pdist(self):
+        processing = DifflibProcessing()
+        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
+        pdist = processing.pdist(_input)
+        results = [0.099, 0.238, 0.14]
+        # Guard against a short or empty pdist: the loop below would
+        # otherwise pass without checking every expected value.
+        self.assertEqual(len(results), len(pdist))
+        for ind, value in enumerate(pdist):
+            self.assertAlmostEqual(results[ind], value, 2)
+
+
+class TemporalTestCase(unittest.TestCase):
+    """ Check the pairwise distances computed by TemporalProcessing. """
+
+    def test_pdist(self):
+        processing = TemporalProcessing()
+        _input = ['14 aout 1991', '08/14/1991', '08/15/1992']
+        pdist = processing.pdist(_input)
+        # Expected: the first two inputs are at distance 0 from each
+        # other, and the third one is at distance 367 from both.
+        self.assertEqual([0., 367, 367], pdist)
+
+
 if __name__ == '__main__':
     unittest.main()
 
@@ -476,5 +476,60 @@
         distance_callback = partial(soundex, language=language, tokenizer=tokenizer)
         super(SoundexProcessing, self).__init__(ref_attr_index,
                                                 target_attr_index,
                                                 distance_callback,
                                                 weight, matrix_normalized)
+
+
+class JaccardProcessing(BaseProcessing):
+    """ A processing based on the jaccard distance.
+
+    The distance callback handed to ``BaseProcessing`` is ``jaccard`` with
+    the ``tokenizer`` keyword pre-bound; every other argument is forwarded
+    unchanged to ``BaseProcessing.__init__``.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 tokenizer=None, weight=1, matrix_normalized=False):
+        # Pre-bind the tokenizer; the values to compare are supplied later
+        # by whoever invokes the callback (presumably BaseProcessing).
+        distance_callback = partial(jaccard, tokenizer=tokenizer)
+        super(JaccardProcessing, self).__init__(ref_attr_index,
+                                                target_attr_index,
+                                                distance_callback,
+                                                weight, matrix_normalized)
+
+
+class DifflibProcessing(BaseProcessing):
+    """ A processing based on the difflib distance.
+
+    ``difflib_match`` is handed to ``BaseProcessing`` directly as the
+    distance callback; no keyword argument is pre-bound.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 weight=1, matrix_normalized=False):
+        super(DifflibProcessing, self).__init__(ref_attr_index,
+                                                target_attr_index,
+                                                difflib_match,
+                                                weight, matrix_normalized)
+
+
+class TemporalProcessing(BaseProcessing):
+    """ A processing based on the temporal distance.
+
+    The distance callback is ``temporal`` with ``granularity``,
+    ``parserinfo``, ``dayfirst`` and ``yearfirst`` pre-bound; every other
+    argument is forwarded unchanged to ``BaseProcessing.__init__``.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 granularity=u'days', parserinfo=FrenchParserInfo,
+                 dayfirst=True, yearfirst=False,
+                 weight=1, matrix_normalized=False):
+        distance_callback = partial(temporal, granularity=granularity,
+                                    parserinfo=parserinfo,
+                                    dayfirst=dayfirst, yearfirst=yearfirst)
+        super(TemporalProcessing, self).__init__(ref_attr_index,
+                                                 target_attr_index,
+                                                 distance_callback,
+                                                 weight, matrix_normalized)