levenshtein.hh 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. #pragma once
  2. #include <string>
  3. #include <list>
  4. #include <limits.h>
  5. #include "jsonElement.hh"
  6. #include <iostream>
  7. #define LEVENSHTEIN_SENSIBILITY (0.7f)
  8. float levenshteinPercent(const std::string &a, const std::string &b);
  9. template<class T> float levenshteinPercent(const std::list<T *> *a, const std::list<T *> *b);
  10. bool levenshteinStrictCompare(const char &a, const char &b);
  11. bool levenshteinStrictCompare(const JSonElement *a, const JSonElement *b);
  12. bool levenshteinCompare(const char &a, const char &b);
  13. bool levenshteinCompare(const JSonElement *a, const JSonElement *b);
  14. template<typename SIZE, class ITERATOR, class SUBTYPE>
  15. static SIZE **_levenshteinMatrice(const ITERATOR &aBegin, const ITERATOR &aEnd, const ITERATOR &bBegin, const ITERATOR &bEnd, const size_t lenA, const size_t lenB)
  16. {
  17. size_t i, j;
  18. SIZE **matrice = new SIZE*[lenA +1]();
  19. ITERATOR a = aBegin;
  20. ITERATOR b;
  21. matrice[0] = new SIZE[lenB +1]();
  22. for (j=0; j <= lenB; j++)
  23. matrice[0][j] = j;
  24. for (i =1; a != aEnd; ++i, ++a)
  25. {
  26. matrice[i] = new SIZE[lenB +1]();
  27. matrice[i][0] = i;
  28. b = bBegin;
  29. for (j =1; b != bEnd; ++j, ++b)
  30. matrice[i][j] = std::min(std::min(
  31. matrice[i -1][j] +1,
  32. matrice[i][j -1] +1),
  33. matrice[i -1][j -1] + ((levenshteinCompare(*a, *b) > LEVENSHTEIN_SENSIBILITY) ? 0 : 1));
  34. }
  35. std::cerr << "<------" << std::endl;
  36. for (size_t i=0; i <= lenA; ++i)
  37. {
  38. for (size_t j=0; j <= lenB; ++j)
  39. std::cerr << (size_t) matrice[i][j] << "\t";
  40. std::cerr << std::endl;
  41. }
  42. return matrice;
  43. };
  44. template<typename SIZE, typename ITERATOR, class SUBTYPE>
  45. static float _levenshteinPercent(ITERATOR aBegin, ITERATOR aEnd, ITERATOR bBegin, ITERATOR bEnd, size_t lenA, size_t lenB)
  46. {
  47. const size_t maxSize = std::max(lenA, lenB);
  48. while (aBegin != aEnd && bBegin != bEnd && levenshteinCompare(*aBegin, *bBegin))
  49. {
  50. aBegin++;
  51. bBegin++;
  52. lenA--;
  53. lenB--;
  54. }
  55. if (!lenA && !lenB) return 1.f;
  56. if (!lenA) return (float) lenB / maxSize;
  57. if (!lenB) return (float) lenA / maxSize;
  58. SIZE **matrice = _levenshteinMatrice<SIZE, ITERATOR, SUBTYPE>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  59. size_t i;
  60. const SIZE result = matrice[lenA][lenB];
  61. for (i=0; i < lenA; ++i)
  62. delete[] matrice[i];
  63. delete[] matrice;
  64. return 1 - ((float)result / maxSize);
  65. };
  66. template<class T> float levenshteinPercent(const std::list<T *> *a, const std::list<T *> *b)
  67. {
  68. const size_t lenA = a->size();
  69. const size_t lenB = b->size();
  70. typename std::list<T*>::const_iterator aBegin = a->cbegin();
  71. typename std::list<T*>::const_iterator aEnd = a->cend();
  72. typename std::list<T*>::const_iterator bBegin = b->cbegin();
  73. typename std::list<T*>::const_iterator bEnd = b->cend();
  74. if (lenA < UCHAR_MAX && lenB < UCHAR_MAX)
  75. return _levenshteinPercent<unsigned char, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  76. if (lenA < USHRT_MAX && lenB < USHRT_MAX)
  77. return _levenshteinPercent<unsigned short, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  78. return _levenshteinPercent<unsigned int, typename std::list<T *>::const_iterator, T *>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  79. }
  80. enum ePath: char
  81. {
  82. add = '+',
  83. rem = '-',
  84. mod = '!',
  85. equ = '='
  86. };
  87. template<typename SIZE, class ITERATOR, class SUBTYPE>
  88. static std::list<ePath> _levenshteinShortestPath(ITERATOR aBegin, ITERATOR aEnd, ITERATOR bBegin, ITERATOR bEnd, size_t lenA, size_t lenB)
  89. {
  90. std::list<ePath> result(std::max(lenA, lenB));
  91. while (aBegin != aEnd && bBegin != bEnd && levenshteinCompare(*aBegin, *bBegin))
  92. {
  93. aBegin++;
  94. bBegin++;
  95. lenA--;
  96. lenB--;
  97. }
  98. if (!lenA && !lenB)
  99. ; //TODO create deque<ePath>(std::max(lenA, lenB) populated with '='
  100. else if (!lenA)
  101. ; //TODO create deque<ePath>(std::max(lenA, lenB) populated with '=' then '-' and return it
  102. else if (!lenB)
  103. ; //TODO create deque<ePath>(std::max(lenA, lenB) populated with '=' then '+' and return it
  104. SIZE **matrice = _levenshteinMatrice<SIZE, ITERATOR, SUBTYPE>(aBegin, aEnd, bBegin, bEnd, lenA, lenB);
  105. size_t i;
  106. //TODO find shortest path
  107. // - goto bottom right
  108. // - go back to top left going decrement ONLY (or =, if not possible)
  109. // Clean matrice
  110. for (i=0; i < lenA; ++i)
  111. delete[] matrice[i];
  112. delete[] matrice;
  113. return result;
  114. };
  115. template<class T>
  116. std::list<ePath> levenshteinShortestPath(const std::list<T*> *a, const std::list<T *> *b)
  117. {
  118. const size_t lenA = a->size();
  119. const size_t lenB = b->size();
  120. if (lenA < UCHAR_MAX && lenB < UCHAR_MAX)
  121. return _levenshteinShortestPath<unsigned char, typename std::list<T *>::const_iterator, T *>(a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  122. if (lenA < USHRT_MAX && lenB < USHRT_MAX)
  123. return _levenshteinShortestPath<unsigned short, typename std::list<T *>::const_iterator, T *>(a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  124. return _levenshteinShortestPath<unsigned int, typename std::list<T *>::const_iterator, T *>(a->cbegin(), a->cend(), b->cbegin(), b->cend(), lenA, lenB);
  125. }