[distances] Add difflib match distance, closes #234655

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: 68d1ae13c3b0
branch: default
phase: public
hidden: no
parent revision: #0ef0ac00ae61 Added tag nazca-centos-version-0.4.3 for changeset 25af84e64f42
child revision: #dd4a0f979759 display ExceptionErrors in sparklquery
files modified by this revision
test/test_distances.py
utils/distances.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1396878977 0
# Mon Apr 07 13:56:17 2014 +0000
# Node ID 68d1ae13c3b0bed1bb6709be1826369a71c7054c
# Parent 0ef0ac00ae6131417c90646e28fb31933ed635b9
[distances] Add difflib match distance, closes #234655

diff --git a/test/test_distances.py b/test/test_distances.py
@@ -24,15 +24,23 @@
1  import random
2  random.seed(6) ### Make sure tests are repeatable
3  from dateutil import parser as dateparser
4 
5  from nazca.utils.distances import (levenshtein, soundex, soundexcode,
6 +                                   difflib_match,
7                                     jaccard, euclidean, geographical,
8                                     LevenshteinProcessing)
9 
10 
11  class DistancesTest(unittest.TestCase):
12 +
13 +    def test_difflib_match(self):
14 +        self.assertEqual(round(difflib_match('Victor Hugo', 'Victor Hugo'), 2), 0.)
15 +        self.assertEqual(round(difflib_match('Victor Hugo', 'Victor Wugo'), 2), 0.09)
16 +        self.assertEqual(round(difflib_match('niche', 'chiens'), 2), 0.45)
17 +        self.assertEqual(round(difflib_match('bonjour', 'bonjour !'), 2), 0.13)
18 +
19      def test_levenshtein(self):
20          self.assertEqual(levenshtein('niche', 'chiens'), 5)
21          self.assertEqual(levenshtein('bonjour', 'bonjour !'), 1)
22          self.assertEqual(levenshtein('bon', 'bonjour'), 4)
23          self.assertEqual(levenshtein('Victor Hugo', 'Hugo Victor'), 0)
diff --git a/utils/distances.py b/utils/distances.py
@@ -13,10 +13,11 @@
24  # details.
25  #
26  # You should have received a copy of the GNU Lesser General Public License along
27  # with this program. If not, see <http://www.gnu.org/licenses/>.
28 
29 +import difflib
30  from functools import partial
31  from math import cos, sqrt, pi #Needed for geographical distance
32  try:
33      from dateutil import parser as dateparser
34      DATEUTIL_ENABLED = True
@@ -233,10 +234,22 @@
35          J(A, B) = (A \cap B)/(A \cup B)
36          d(A, B) = 1 - J(A, B)
37      """
38      return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
39 
40 +def difflib_match(stra, strb):
41 +    """ Approximate matching.
42 +    Excerpt from the difflib.SequenceMatcher documentation:
43 +    '[...] The basic algorithm predates, and is a little fancier than, an algorithm
44 +    published in the late 1980's by Ratcliff and Obershelp under the
45 +    hyperbolic name "gestalt pattern matching"[...]'
46 +
47 +    A value smaller than 0.4 means that the sequences are close matches (we
48 +    return 1 - difflib.SequenceMatcher(...).ratio()).
49 +    """
50 +    return 1.0 - difflib.SequenceMatcher(None, stra, strb).ratio()
51 +
52 
53  ###############################################################################
54  ### TEMPORAL DISTANCES ########################################################
55  ###############################################################################
56  if DATEUTIL_ENABLED: