levenshtein.hpp 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. #pragma once
  2. #include <string>
  3. #include <list>
  4. #include <limits.h>
  5. #include "jsonElement.hh"
  6. #include "levenshteinCache.hh"
  7. #define LEVENSHTEIN_SENSIBILITY (0.7f)
  8. float levenshteinPercent(const std::string &a, const std::string &b);
  9. template<class T> float levenshteinPercent(const std::list<T *> *a, const std::list<T *> *b);
  10. bool levenshteinStrictCompare(const char &a, const char &b);
  11. bool levenshteinStrictCompare(const JSonElement *a, const JSonElement *b);
  12. bool levenshteinCompare(const char &a, const char &b);
  13. bool levenshteinCompare(const JSonElement *a, const JSonElement *b);
  14. template<typename SIZE, class ITERATOR, class SUBTYPE>
  15. static SIZE **_levenshteinMatrice(const ITERATOR &aBegin, const ITERATOR &aEnd, const ITERATOR &bBegin, const ITERATOR &bEnd, const size_t lenA, const size_t lenB)
  16. {
  17. size_t i, j;
  18. SIZE **matrice = new SIZE*[lenA +1]();
  19. ITERATOR a = aBegin;
  20. ITERATOR b;
  21. matrice[0] = new SIZE[lenB +1]();
  22. for (j=0; j <= lenB; j++)
  23. matrice[0][j] = j;
  24. for (i =1; a != aEnd; ++i, ++a)
  25. {
  26. matrice[i] = new SIZE[lenB +1]();
  27. matrice[i][0] = i;
  28. b = bBegin;
  29. for (j =1; b != bEnd; ++j, ++b)
  30. matrice[i][j] = std::min(std::min(
  31. matrice[i -1][j] +1,
  32. matrice[i][j -1] +1),
  33. matrice[i -1][j -1] + ((levenshteinCompare(*a, *b) > LEVENSHTEIN_SENSIBILITY) ? 0 : 1));
  34. }
  35. return matrice;
  36. };
  37. template<typename SIZE, typename ITERATOR, class SUBTYPE>
  38. static float _levenshteinPercent(ITERATOR aBegin, ITERATOR aEnd, ITERATOR bBegin, ITERATOR bEnd, size_t lenA, size_t lenB)
  39. {
  40. const size_t maxSize = std::max(lenA, lenB);
  41. while (aBegin != aEnd && bBegin != bEnd && levenshteinCompare(*aBegin, *bBegin))
  42. {
  43. aBegin++;
  44. bBegin++;
  45. lenA--;
  46. lenB--;
  47. }
  48. if (!lenA && !lenB) return 1.f;
  49. if (!lenA) return (float) lenB / maxSize;
  50. if (!lenB) return (float) lenA / maxSize;
  51. SIZE **matrice = _levenshteinMatrice<SIZE, ITERATOR, SUBTYPE>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  52. size_t i;
  53. const SIZE result = matrice[lenA][lenB];
  54. for (i=0; i < lenA; ++i)
  55. delete[] matrice[i];
  56. delete[] matrice;
  57. return 1 - ((float)result / maxSize);
  58. };
  59. template<class T> float levenshteinPercent(const std::list<T *> *a, const std::list<T *> *b)
  60. {
  61. const size_t lenA = a->size();
  62. const size_t lenB = b->size();
  63. typename std::list<T*>::const_iterator aBegin = a->cbegin();
  64. typename std::list<T*>::const_iterator aEnd = a->cend();
  65. typename std::list<T*>::const_iterator bBegin = b->cbegin();
  66. typename std::list<T*>::const_iterator bEnd = b->cend();
  67. if (lenA < UCHAR_MAX && lenB < UCHAR_MAX)
  68. return _levenshteinPercent<unsigned char, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  69. if (lenA < USHRT_MAX && lenB < USHRT_MAX)
  70. return _levenshteinPercent<unsigned short, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  71. return _levenshteinPercent<unsigned int, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  72. }
  73. template<typename SIZE, class ITERATOR, class SUBTYPE>
  74. static size_t _levenshteinShortestPath(std::list<eLevenshteinOperator> &result, ITERATOR aBegin, ITERATOR aEnd, ITERATOR bBegin, ITERATOR bEnd, size_t lenA, size_t lenB)
  75. {
  76. const size_t initLenA = lenA;
  77. const size_t initLenB = lenB;
  78. result.clear();
  79. while (aBegin != aEnd && bBegin != bEnd && levenshteinCompare(*aBegin, *bBegin))
  80. {
  81. aBegin++;
  82. bBegin++;
  83. lenA--;
  84. lenB--;
  85. }
  86. if (!lenA && !lenB)
  87. {
  88. for (size_t i=0; i < initLenA; ++i)
  89. result.push_back(eLevenshteinOperator::equ);
  90. return 0;
  91. }
  92. else if (!lenA)
  93. {
  94. size_t i;
  95. for (i=0; i < initLenB - lenB; ++i)
  96. result.push_back(eLevenshteinOperator::equ);
  97. for (; i < initLenB; ++i)
  98. result.push_back(eLevenshteinOperator::rem);
  99. return lenB;
  100. }
  101. else if (!lenB)
  102. {
  103. size_t i;
  104. for (i=0; i < initLenA - lenA; ++i)
  105. result.push_back(eLevenshteinOperator::equ);
  106. for (; i < initLenA; ++i)
  107. result.push_back(eLevenshteinOperator::add);
  108. return lenA;
  109. }
  110. SIZE **matrice = _levenshteinMatrice<SIZE, ITERATOR, SUBTYPE>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  111. size_t i = lenA;
  112. size_t j = lenB;
  113. const size_t levenDist = matrice[i][j];
  114. while (i || j)
  115. {
  116. if (i && (!j || matrice[i][j] > matrice[i-1][j]))
  117. {
  118. result.push_front(eLevenshteinOperator::add);
  119. --i;
  120. }
  121. else if (j && (!i || matrice[i][j] > matrice[i][j -1]))
  122. {
  123. result.push_front(eLevenshteinOperator::rem);
  124. --j;
  125. }
  126. else if (i && j)
  127. {
  128. result.push_front(matrice[i][j] == matrice[i-1][j-1] ? eLevenshteinOperator::equ : eLevenshteinOperator::mod);
  129. --i;
  130. --j;
  131. }
  132. else if (i)
  133. {
  134. result.push_front(eLevenshteinOperator::add);
  135. --i;
  136. }
  137. else if (j)
  138. {
  139. result.push_front(eLevenshteinOperator::rem);
  140. --j;
  141. }
  142. }
  143. for (i = initLenA - lenA; i; --i)
  144. result.push_front(eLevenshteinOperator::equ);
  145. //TODO
  146. LevenshteinCache<JSonElement *>::instance();
  147. // Clean matrice
  148. for (i=0; i < lenA +1; ++i)
  149. delete[] matrice[i];
  150. delete[] matrice;
  151. return levenDist;
  152. };
  153. template<class T>
  154. size_t levenshteinShortestPath(std::list<eLevenshteinOperator> &result, const std::list<T*> *a, const std::list<T *> *b)
  155. {
  156. const size_t lenA = a->size();
  157. const size_t lenB = b->size();
  158. if (lenA < UCHAR_MAX && lenB < UCHAR_MAX)
  159. return _levenshteinShortestPath<unsigned char, typename std::list<T *>::const_iterator, T *>(result, a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  160. if (lenA < USHRT_MAX && lenB < USHRT_MAX)
  161. return _levenshteinShortestPath<unsigned short, typename std::list<T *>::const_iterator, T *>(result, a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  162. return _levenshteinShortestPath<unsigned int, typename std::list<T *>::const_iterator, T *>(result, a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  163. }