From f6d3169bfc7d7297b4eb21d5e6845ce5e58baddd Mon Sep 17 00:00:00 2001
From: "\"Michael Hauser-Raspe\""
Date: Sat, 25 Jun 2022 00:41:32 +0100
Subject: [PATCH] Create fuzzy scoring and fuzzy sort.

---
 inc/utils.h |   9 ++-
 src/utils.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/wofi.c  |  60 ++++++++----------
 3 files changed, 234 insertions(+), 37 deletions(-)

diff --git a/inc/utils.h b/inc/utils.h
index f644a4a..f995835 100644
--- a/inc/utils.h
+++ b/inc/utils.h
@@ -22,6 +22,11 @@
 
 #include
 
+typedef double score_t;
+#define SCORE_MAX INFINITY
+#define SCORE_MIN -INFINITY
+#define MATCH_FUZZY_MAX_LEN 256
+
 time_t utils_get_time_millis(void);
 
 void utils_sleep_millis(time_t millis);
@@ -34,6 +39,8 @@ size_t utils_min3(size_t n1, size_t n2, size_t n3);
 
 size_t utils_distance(const char* haystack, const char* needle);
 
-void utils_mkdir(char* path, mode_t mode);
+score_t utils_fuzzy_score(const char *haystack, const char *needle);
+
+void utils_mkdir(char *path, mode_t mode);
 
 #endif
diff --git a/src/utils.c b/src/utils.c
index c34bfe0..edc2f50 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -17,13 +17,15 @@
 
 #include
 
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-#include
 
 #include
 #include
@@ -109,6 +111,204 @@ size_t utils_distance(const char* haystack, const char* needle) {
 	return arr[str1_len][str2_len];
 }
 
+// leading gap
+#define SCORE_GAP_LEADING -0.005
+// trailing gap
+#define SCORE_GAP_TRAILING -0.005
+// gap in the middle
+#define SCORE_GAP_INNER -0.01
+// we matched the characters consecutively
+#define SCORE_MATCH_CONSECUTIVE 1.0
+// we got a consecutive match, but insensitive is on
+// and we didn't match the case.
+#define SCORE_MATCH_NOT_MATCH_CASE 0.9
+// we are matching after a slash
+#define SCORE_MATCH_SLASH 0.9
+// we are matching after a space, dash or underscore
+#define SCORE_MATCH_WORD 0.8
+// we are matching a camel case letter
+#define SCORE_MATCH_CAPITAL 0.7
+// we are matching after a dot
+#define SCORE_MATCH_DOT 0.6
+
+#define SWAP(x, y, T) \
+	do { \
+		T SWAP = x; \
+		x = y; \
+		y = SWAP; \
+	} while (0)
+
+// evaluates its arguments twice: only pass side-effect free expressions
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+
+// Fill match_bonus[i] with the bonus awarded when a needle character is
+// matched at haystack[i]. Only alphanumeric characters that start a "word"
+// get a bonus: at the start of the string or after '/', after '-', '_' or
+// ' ', an upper case letter following a lower case one, or after '.'.
+// match_bonus must have room for strlen(haystack) entries.
+static void precompute_bonus(const char *haystack, score_t *match_bonus) {
+	/* Which positions are beginning of words */
+	int m = strlen(haystack);
+	char last_ch = '\0';
+	for (int i = 0; i < m; i++) {
+		char ch = haystack[i];
+
+		score_t score = 0;
+		// <ctype.h> functions need an unsigned char value: a plain
+		// char holding a non-ASCII byte may be negative, which is
+		// undefined behaviour.
+		if (isalnum((unsigned char) ch)) {
+			if (!last_ch || last_ch == '/') {
+				score = SCORE_MATCH_SLASH;
+			} else if (last_ch == '-' || last_ch == '_' ||
+				   last_ch == ' ') {
+				score = SCORE_MATCH_WORD;
+			} else if (last_ch >= 'a' && last_ch <= 'z' &&
+				   ch >= 'A' && ch <= 'Z') {
+				/* CamelCase */
+				score = SCORE_MATCH_CAPITAL;
+			} else if (last_ch == '.') {
+				score = SCORE_MATCH_DOT;
+			}
+		}
+
+		match_bonus[i] = score;
+		last_ch = ch;
+	}
+}
+
+// Compare two characters, ignoring case when insensitive is set.
+static inline bool match_with_case(char a, char b, bool insensitive) {
+	if(insensitive) {
+		// cast first: tolower() is undefined for negative char values
+		return tolower((unsigned char) a) == tolower((unsigned char) b);
+	} else {
+		return a == b;
+	}
+}
+
+// Compute row `row` of the D and M matrices (the scores for needle[row])
+// into curr_D / curr_M, reading the previous row from last_D / last_M.
+// n and m are the needle and haystack lengths, match_bonus is the table
+// filled in by precompute_bonus(). Characters are always compared
+// case-insensitively; an exact-case consecutive match scores higher.
+static inline void match_row(int row, score_t* curr_D, score_t* curr_M,
+		const score_t* last_D, const score_t * last_M,
+		const char* needle, const char* haystack, int n, int m, score_t* match_bonus) {
+	int i = row;
+
+	score_t prev_score = SCORE_MIN;
+	score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER;
+
+	for (int j = 0; j < m; j++) {
+		if (match_with_case(needle[i], haystack[j], true)) {
+			score_t score = SCORE_MIN;
+			if (!i) {
+				// first needle character: only the leading gap
+				// penalty and the position bonus apply
+				score = (j * SCORE_GAP_LEADING) + match_bonus[j];
+			} else if (j) { /* i > 0 && j > 0*/
+				// we definitely match case insensitively already so if
+				// our character isn't the same then we have a
+				// different case
+				score_t consecutive_bonus = needle[i] == haystack[j] ? SCORE_MATCH_CONSECUTIVE : SCORE_MATCH_NOT_MATCH_CASE;
+
+				score = max(last_M[j - 1] + match_bonus[j],
+						/* consecutive match, doesn't stack
+						   with match_bonus */
+						last_D[j - 1] + consecutive_bonus);
+			}
+			curr_D[j] = score;
+			curr_M[j] = prev_score = max(score, prev_score + gap_score);
+		} else {
+			curr_D[j] = SCORE_MIN;
+			curr_M[j] = prev_score = prev_score + gap_score;
+		}
+	}
+}
+
+// Fuzzy matching scoring. Adapted from
+// https://github.com/jhawthorn/fzy/blob/master/src/match.c and
+// https://github.com/jhawthorn/fzy/blob/master/ALGORITHM.md
+// For a fuzzy match string needle being searched for in haystack we provide a
+// number score for how well we match.
+// We create two matrices of size needle_len (n) by haystack_len (m).
+// The first matrix is the score matrix. Each position (i,j) within this matrix
+// consists of the score that corresponds to the score that would be generated
+// by matching the first i characters of the needle with the first j
+// characters of the haystack. Gaps have a fixed penalty for having a gap along
+// with a linear penalty for gap size (cf. Gotoh's algorithm).
+// Matches give a positive score, with a slight weight given to matches after
+// certain special characters (i.e. the first character after a `/` will be
+// "almost" consecutive but lower than an actual consecutive match).
+// Our second matrix is our diagonal matrix where we store the best match
+// that ends at a match. This allows us to calculate our gap penalties alongside
+// our consecutive match scores.
+// In addition, since we only rely on the current, and previous row of the
+// matrices and we only want to compute the score, we only store those scores
+// and reuse the previous rows (rather than storing the entire (n*m) matrix).
+// In addition we've simplified some of the algorithm compared to fzy to
+// improve legibility. (Can reimplement lookup tables later if wanted.)
+// Also, the reference algorithm does not take into account case sensitivity
+// which has been implemented here.
+//
+// Returns SCORE_MIN for an empty needle, for a needle longer than the
+// haystack and for a haystack longer than MATCH_FUZZY_MAX_LEN, SCORE_MAX
+// when both strings have the same length, and the best score otherwise.
+score_t utils_fuzzy_score(const char* haystack, const char* needle) {
+	if(!*needle)
+		return SCORE_MIN;
+
+	int n = strlen(needle);
+	int m = strlen(haystack);
+
+	if(m > MATCH_FUZZY_MAX_LEN || n > m) {
+		/*
+		 * Unreasonably large candidate: return no score
+		 * If it is a valid match it will still be returned, it will
+		 * just be ranked below any reasonably sized candidates
+		 */
+		return SCORE_MIN;
+	} else if(n == m) {
+		/* This function is only meant to be called with a haystack
+		 * which matches needle, so if the lengths of the strings are
+		 * equal the strings themselves must also be equal (ignoring case).
+		 */
+		return SCORE_MAX;
+	}
+
+	/*
+	 * D[][] Stores the best score for this position ending with a match.
+	 * M[][] Stores the best possible score at this position.
+	 */
+	score_t D[2][MATCH_FUZZY_MAX_LEN], M[2][MATCH_FUZZY_MAX_LEN];
+
+	/*
+	 * Only set up the bonus table once the length checks have passed: m is
+	 * now known to be non-zero and at most MATCH_FUZZY_MAX_LEN, so a fixed
+	 * size buffer is enough and an empty or huge haystack never sizes a
+	 * stack array.
+	 */
+	score_t match_bonus[MATCH_FUZZY_MAX_LEN];
+	precompute_bonus(haystack, match_bonus);
+
+	score_t* last_D, *last_M;
+	score_t* curr_D, *curr_M;
+
+	last_D = D[0];
+	last_M = M[0];
+	curr_D = D[1];
+	curr_M = M[1];
+
+	for (int i = 0; i < n; i++) {
+		match_row(i, curr_D, curr_M, last_D, last_M, needle, haystack, n, m, match_bonus);
+
+		SWAP(curr_D, last_D, score_t *);
+		SWAP(curr_M, last_M, score_t *);
+	}
+
+	return last_M[m - 1];
+}
+
 void utils_mkdir(char* path, mode_t mode) {
 	if(access(path, F_OK) != 0) {
 		char* tmp = strdup(path);
diff --git a/src/wofi.c b/src/wofi.c
index bc83270..0ec0ef3 100644
--- a/src/wofi.c
+++ b/src/wofi.c
@@ -1111,43 +1111,33 @@ static gboolean do_multi_filter(GtkFlowBoxChild* row, gpointer data) {
 	return ret;
 }
 
-static gint fuzzy_sort(const gchar* text1, const gchar* text2) {
-	char* _filter = strdup(filter);
-	size_t len = strlen(_filter);
-
-	char* t1 = strdup(text1);
-	size_t t1l = strlen(t1);
-
-	char* t2 = strdup(text2);
-	size_t t2l = strlen(t2);
-
-	if(insensitive) {
-		for(size_t count = 0; count < len; ++count) {
-			char chr = _filter[count];
-			if(isalpha(chr)) {
-				_filter[count] = tolower(chr);
-			}
-		}
-		for(size_t count = 0; count < t1l; ++count) {
-			char chr = t1[count];
-			if(isalpha(chr)) {
-				t1[count] = tolower(chr);
-			}
-		}
-		for(size_t count = 0; count < t2l; ++count) {
-			char chr = t2[count];
-			if(isalpha(chr)) {
-				t2[count] = tolower(chr);
-			}
+static gint fuzzy_sort(const gchar *text1, const gchar *text2) {
+	gboolean match1 = do_fuzzy_strcomp(filter, text1);
+	gboolean match2 = do_fuzzy_strcomp(filter, text2);
+	// both texts match the filter: rank them by fuzzy score
+	if(match1 && match2) {
+		score_t dist1 = utils_fuzzy_score(text1, filter);
+		score_t dist2 = utils_fuzzy_score(text2, filter);
+		if (dist1 == dist2) {
+			// same same
+			return 0;
+		} else if (dist1 > dist2) { // highest score wins.
+			// text1 goes first
+			return -1;
+		} else {
+			// text2 goes first
+			return 1;
 		}
+	} else if(match1) {
+		// text1 goes first
+		return -1;
+	} else if(match2) {
+		// text2 goes first
+		return 1;
+	} else {
+		// same same.
+		return 0;
 	}
-
-	size_t dist1 = utils_distance(t1, _filter);
-	size_t dist2 = utils_distance(t2, _filter);
-	free(_filter);
-	free(t1);
-	free(t2);
-	return dist1 - dist2;
 }
 
 // we sort based on how early in the string all the matches are.