1
1

levenshtein.hpp 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. #pragma once
  2. #include <string>
  3. #include <list>
  4. #include <limits.h>
  5. #include "jsonElement.hh"
  6. #define LEVENSHTEIN_SENSIBILITY (0.7f)
  7. float levenshteinPercent(const std::string &a, const std::string &b);
  8. template<class T> float levenshteinPercent(const std::list<T *> *a, const std::list<T *> *b);
  9. bool levenshteinStrictCompare(const char &a, const char &b);
  10. bool levenshteinStrictCompare(const JSonElement *a, const JSonElement *b);
  11. bool levenshteinCompare(const char &a, const char &b);
  12. bool levenshteinCompare(const JSonElement *a, const JSonElement *b);
  13. template<typename SIZE, class ITERATOR, class SUBTYPE>
  14. static SIZE **_levenshteinMatrice(const ITERATOR &aBegin, const ITERATOR &aEnd, const ITERATOR &bBegin, const ITERATOR &bEnd, const size_t lenA, const size_t lenB)
  15. {
  16. size_t i, j;
  17. SIZE **matrice = new SIZE*[lenA +1]();
  18. ITERATOR a = aBegin;
  19. ITERATOR b;
  20. matrice[0] = new SIZE[lenB +1]();
  21. for (j=0; j <= lenB; j++)
  22. matrice[0][j] = j;
  23. for (i =1; a != aEnd; ++i, ++a)
  24. {
  25. matrice[i] = new SIZE[lenB +1]();
  26. matrice[i][0] = i;
  27. b = bBegin;
  28. for (j =1; b != bEnd; ++j, ++b)
  29. matrice[i][j] = std::min(std::min(
  30. matrice[i -1][j] +1,
  31. matrice[i][j -1] +1),
  32. matrice[i -1][j -1] + ((levenshteinCompare(*a, *b) > LEVENSHTEIN_SENSIBILITY) ? 0 : 1));
  33. }
  34. return matrice;
  35. };
  36. template<typename SIZE, typename ITERATOR, class SUBTYPE>
  37. static float _levenshteinPercent(ITERATOR aBegin, ITERATOR aEnd, ITERATOR bBegin, ITERATOR bEnd, size_t lenA, size_t lenB)
  38. {
  39. const size_t maxSize = std::max(lenA, lenB);
  40. while (aBegin != aEnd && bBegin != bEnd && levenshteinCompare(*aBegin, *bBegin))
  41. {
  42. aBegin++;
  43. bBegin++;
  44. lenA--;
  45. lenB--;
  46. }
  47. if (!lenA && !lenB) return 1.f;
  48. if (!lenA) return (float) lenB / maxSize;
  49. if (!lenB) return (float) lenA / maxSize;
  50. SIZE **matrice = _levenshteinMatrice<SIZE, ITERATOR, SUBTYPE>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  51. size_t i;
  52. const SIZE result = matrice[lenA][lenB];
  53. for (i=0; i < lenA; ++i)
  54. delete[] matrice[i];
  55. delete[] matrice;
  56. return 1 - ((float)result / maxSize);
  57. };
  58. template<class T> float levenshteinPercent(const std::list<T *> *a, const std::list<T *> *b)
  59. {
  60. const size_t lenA = a->size();
  61. const size_t lenB = b->size();
  62. typename std::list<T*>::const_iterator aBegin = a->cbegin();
  63. typename std::list<T*>::const_iterator aEnd = a->cend();
  64. typename std::list<T*>::const_iterator bBegin = b->cbegin();
  65. typename std::list<T*>::const_iterator bEnd = b->cend();
  66. if (lenA < UCHAR_MAX && lenB < UCHAR_MAX)
  67. return _levenshteinPercent<unsigned char, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  68. if (lenA < USHRT_MAX && lenB < USHRT_MAX)
  69. return _levenshteinPercent<unsigned short, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  70. return _levenshteinPercent<unsigned int, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  71. }
  72. enum ePath: char
  73. {
  74. add = '+',
  75. rem = '-',
  76. mod = '!',
  77. equ = '='
  78. };
  79. template<typename SIZE, class ITERATOR, class SUBTYPE>
  80. static size_t _levenshteinShortestPath(std::list<ePath> &result, ITERATOR aBegin, ITERATOR aEnd, ITERATOR bBegin, ITERATOR bEnd, size_t lenA, size_t lenB)
  81. {
  82. const size_t initLenA = lenA;
  83. const size_t initLenB = lenB;
  84. result.clear();
  85. while (aBegin != aEnd && bBegin != bEnd && levenshteinCompare(*aBegin, *bBegin))
  86. {
  87. aBegin++;
  88. bBegin++;
  89. lenA--;
  90. lenB--;
  91. }
  92. if (!lenA && !lenB)
  93. {
  94. for (size_t i=0; i < initLenA; ++i)
  95. result.push_back(ePath::equ);
  96. return 0;
  97. }
  98. else if (!lenA)
  99. {
  100. size_t i;
  101. for (i=0; i < initLenB - lenB; ++i)
  102. result.push_back(ePath::equ);
  103. for (; i < initLenB; ++i)
  104. result.push_back(ePath::rem);
  105. return lenB;
  106. }
  107. else if (!lenB)
  108. {
  109. size_t i;
  110. for (i=0; i < initLenA - lenA; ++i)
  111. result.push_back(ePath::equ);
  112. for (; i < initLenA; ++i)
  113. result.push_back(ePath::add);
  114. return lenA;
  115. }
  116. SIZE **matrice = _levenshteinMatrice<SIZE, ITERATOR, SUBTYPE>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  117. size_t i = lenA;
  118. size_t j = lenB;
  119. const size_t levenDist = matrice[i][j];
  120. while (i || j)
  121. {
  122. if (i && (!j || matrice[i][j] > matrice[i-1][j]))
  123. {
  124. result.push_front(ePath::add);
  125. --i;
  126. }
  127. else if (j && (!i || matrice[i][j] > matrice[i][j -1]))
  128. {
  129. result.push_front(ePath::rem);
  130. --j;
  131. }
  132. else if (i && j)
  133. {
  134. result.push_front(matrice[i][j] == matrice[i-1][j-1] ? ePath::equ : ePath::mod);
  135. --i;
  136. --j;
  137. }
  138. else if (i)
  139. {
  140. result.push_front(ePath::add);
  141. --i;
  142. }
  143. else if (j)
  144. {
  145. result.push_front(ePath::rem);
  146. --j;
  147. }
  148. }
  149. for (i = initLenA - lenA; i; --i)
  150. result.push_front(ePath::equ);
  151. // Clean matrice
  152. for (i=0; i < lenA +1; ++i)
  153. delete[] matrice[i];
  154. delete[] matrice;
  155. return levenDist;
  156. };
  157. template<class T>
  158. size_t levenshteinShortestPath(std::list<ePath> &result, const std::list<T*> *a, const std::list<T *> *b)
  159. {
  160. const size_t lenA = a->size();
  161. const size_t lenB = b->size();
  162. if (lenA < UCHAR_MAX && lenB < UCHAR_MAX)
  163. return _levenshteinShortestPath<unsigned char, typename std::list<T *>::const_iterator, T *>(result, a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  164. if (lenA < USHRT_MAX && lenB < USHRT_MAX)
  165. return _levenshteinShortestPath<unsigned short, typename std::list<T *>::const_iterator, T *>(result, a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  166. return _levenshteinShortestPath<unsigned int, typename std::list<T *>::const_iterator, T *>(result, a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  167. }