Create fuzzy scoring and fuzzy sort.
This commit is contained in:
parent
a554ae22f3
commit
f6d3169bfc
@ -22,6 +22,11 @@
|
|||||||
|
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
|
||||||
|
typedef double score_t;
|
||||||
|
#define SCORE_MAX INFINITY
|
||||||
|
#define SCORE_MIN -INFINITY
|
||||||
|
#define MATCH_FUZZY_MAX_LEN 256
|
||||||
|
|
||||||
time_t utils_get_time_millis(void);
|
time_t utils_get_time_millis(void);
|
||||||
|
|
||||||
void utils_sleep_millis(time_t millis);
|
void utils_sleep_millis(time_t millis);
|
||||||
@ -34,6 +39,8 @@ size_t utils_min3(size_t n1, size_t n2, size_t n3);
|
|||||||
|
|
||||||
size_t utils_distance(const char* haystack, const char* needle);
|
size_t utils_distance(const char* haystack, const char* needle);
|
||||||
|
|
||||||
void utils_mkdir(char* path, mode_t mode);
|
score_t utils_fuzzy_score(const char *haystack, const char *needle);
|
||||||
|
|
||||||
|
void utils_mkdir(char *path, mode_t mode);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
177
src/utils.c
177
src/utils.c
@ -17,13 +17,15 @@
|
|||||||
|
|
||||||
#include <utils.h>
|
#include <utils.h>
|
||||||
|
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <libgen.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <libgen.h>
|
|
||||||
|
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
@ -109,6 +111,179 @@ size_t utils_distance(const char* haystack, const char* needle) {
|
|||||||
return arr[str1_len][str2_len];
|
return arr[str1_len][str2_len];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// leading gap
|
||||||
|
#define SCORE_GAP_LEADING -0.005
|
||||||
|
// trailing gap
|
||||||
|
#define SCORE_GAP_TRAILING -0.005
|
||||||
|
// gap in the middle
|
||||||
|
#define SCORE_GAP_INNER -0.01
|
||||||
|
// we matched the characters consecutively
|
||||||
|
#define SCORE_MATCH_CONSECUTIVE 1.0
|
||||||
|
// we got a consecutive match, but insensitive is on
|
||||||
|
// and we didn't match the case.
|
||||||
|
#define SCORE_MATCH_NOT_MATCH_CASE 0.9
|
||||||
|
// we are matching after a slash
|
||||||
|
#define SCORE_MATCH_SLASH 0.9
|
||||||
|
// we are matching after a space, dash or underscore
|
||||||
|
#define SCORE_MATCH_WORD 0.8
|
||||||
|
// we are matching a camel case letter
|
||||||
|
#define SCORE_MATCH_CAPITAL 0.7
|
||||||
|
// we are matching after a dot
|
||||||
|
#define SCORE_MATCH_DOT 0.6
|
||||||
|
|
||||||
|
/*
 * Exchanges the values held by the two lvalues x and y, both of type T.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 * x and y are each evaluated twice; do not pass expressions with side
 * effects.
 */
#define SWAP(x, y, T) \
	do { \
		T swap_tmp_ = (x); \
		(x) = (y); \
		(y) = swap_tmp_; \
	} while (0)
|
||||||
|
|
||||||
|
// Yields a when a > b, otherwise b (so b is returned when the two compare
// equal or unordered). Function-like macro: the selected argument is
// evaluated twice, so do not pass expressions with side effects.
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
||||||
|
|
||||||
|
/*
 * Computes, for every position of haystack, the bonus granted when a needle
 * character matches at that position. The bonus depends on the character
 * that precedes the position:
 *   - start of string or '/'            -> SCORE_MATCH_SLASH
 *   - '-', '_' or ' '                   -> SCORE_MATCH_WORD
 *   - lower-case letter before a capital -> SCORE_MATCH_CAPITAL (CamelCase)
 *   - '.'                               -> SCORE_MATCH_DOT
 * Non-alphanumeric positions, and positions that match none of the rules,
 * get a bonus of 0.
 *
 * haystack:    NUL-terminated string to analyse.
 * match_bonus: output array; must have room for strlen(haystack) entries.
 */
static void precompute_bonus(const char *haystack, score_t *match_bonus) {
	char last_ch = '\0';

	for (size_t i = 0; haystack[i] != '\0'; i++) {
		const char ch = haystack[i];
		score_t score = 0;

		/* <ctype.h> functions are only defined for values representable
		 * as unsigned char (or EOF); plain char may be negative. */
		if (isalnum((unsigned char) ch)) {
			if (!last_ch || last_ch == '/') {
				score = SCORE_MATCH_SLASH;
			} else if (last_ch == '-' || last_ch == '_' ||
					last_ch == ' ') {
				score = SCORE_MATCH_WORD;
			} else if (last_ch >= 'a' && last_ch <= 'z' &&
					ch >= 'A' && ch <= 'Z') {
				/* CamelCase */
				score = SCORE_MATCH_CAPITAL;
			} else if (last_ch == '.') {
				score = SCORE_MATCH_DOT;
			}
		}

		match_bonus[i] = score;
		last_ch = ch;
	}
}
|
||||||
|
|
||||||
|
/*
 * Compares two characters for equality.
 *
 * a, b:        characters to compare.
 * insensitive: when true the comparison ignores case (via tolower);
 *              when false the characters must be identical.
 *
 * Returns true when the characters are considered equal.
 */
static inline bool match_with_case(char a, char b, bool insensitive) {
	if(!insensitive) {
		return a == b;
	}
	/* tolower() is only defined for values representable as unsigned char
	 * (or EOF); plain char may be negative for non-ASCII bytes. */
	return tolower((unsigned char) a) == tolower((unsigned char) b);
}
|
||||||
|
|
||||||
|
/*
 * Fills one row of the scoring matrices for needle character `row`.
 *
 * row:            index of the needle character being processed.
 * curr_D, curr_M: output rows (m entries each). curr_D[j] is the best score
 *                 for a match ending exactly at haystack[j]; curr_M[j] is the
 *                 best score using haystack[0..j].
 * last_D, last_M: the rows produced for needle character row - 1; they are
 *                 not read when row is 0.
 * needle, haystack: the strings being compared (case-insensitively).
 * n, m:           lengths of needle and haystack.
 * match_bonus:    per-position bonuses for haystack (m entries).
 */
static inline void match_row(int row, score_t* curr_D, score_t* curr_M,
		const score_t* last_D, const score_t * last_M,
		const char* needle, const char* haystack, int n, int m, score_t* match_bonus) {
	const char needle_ch = needle[row];
	/* gaps after the final needle character are cheaper than inner gaps */
	const score_t gap_penalty = (row == n - 1) ? SCORE_GAP_TRAILING : SCORE_GAP_INNER;
	score_t running_best = SCORE_MIN;

	for (int col = 0; col < m; col++) {
		const char hay_ch = haystack[col];

		if (!match_with_case(needle_ch, hay_ch, true)) {
			/* no match here: only the gap-extended score carries over */
			curr_D[col] = SCORE_MIN;
			running_best = running_best + gap_penalty;
			curr_M[col] = running_best;
			continue;
		}

		score_t ending_here = SCORE_MIN;
		if (row == 0) {
			/* first needle character: pay for the leading gap and
			 * collect the positional bonus */
			ending_here = (col * SCORE_GAP_LEADING) + match_bonus[col];
		} else if (col > 0) {
			/* the characters already match ignoring case, so a
			 * byte-level difference means only the case differs */
			score_t run_bonus = (needle_ch == hay_ch) ? SCORE_MATCH_CONSECUTIVE : SCORE_MATCH_NOT_MATCH_CASE;
			score_t after_gap = last_M[col - 1] + match_bonus[col];
			/* consecutive match, doesn't stack with match_bonus */
			score_t after_run = last_D[col - 1] + run_bonus;
			ending_here = max(after_gap, after_run);
		}

		curr_D[col] = ending_here;
		running_best = max(ending_here, running_best + gap_penalty);
		curr_M[col] = running_best;
	}
}
|
||||||
|
|
||||||
|
// Fuzzy matching scoring. Adapted from
|
||||||
|
// https://github.com/jhawthorn/fzy/blob/master/src/match.c and
|
||||||
|
// https://github.com/jhawthorn/fzy/blob/master/ALGORITHM.md
|
||||||
|
// For a fuzzy match string needle being searched for in haystack we provide a
|
||||||
|
// number score for how well we match.
|
||||||
|
// We create two matrices of size needle_len (n) by haystack_len (m).
|
||||||
|
// The first matrix is the score matrix. Each position (i,j) within this matrix
|
||||||
|
// consists of the score that corresponds to the score that would be generated
|
||||||
|
// by matching the first i characters of the needle with the first j
|
||||||
|
// characters of the haystack. Gaps have a fixed penalty for having a gap along
|
||||||
|
// with a linear penalty for gap size (c.f. gotoh's algorithm).
|
||||||
|
// matches give a positive score, with a slight weight given to matches after
|
||||||
|
// certain special characters (i.e. the first character after a `/` will be
|
||||||
|
// "almost" consecutive but lower than an actual consecutive match).
|
||||||
|
// Our second matrix is our diagonal matrix where we store the best match
|
||||||
|
// that ends at a match. This allows us to calculate our gap penalties alongside
|
||||||
|
// our consecutive match scores.
|
||||||
|
// In addition, since we only rely on the current, and previous row of the
|
||||||
|
// matrices and we only want to compute the score, we only store those scores
|
||||||
|
// and reuse the previous rows (rather than storing the entire (n*m) matrix).
|
||||||
|
// In addition we've simplified some of the algorithm compared to fzy to
|
||||||
|
// improve legibility. (Can reimplement lookup tables later if wanted.)
|
||||||
|
// Also, the reference algorithm does not take into account case sensitivity
|
||||||
|
// which has been implemented here.
|
||||||
|
|
||||||
|
|
||||||
|
score_t utils_fuzzy_score(const char* haystack, const char* needle) {
|
||||||
|
if(!*needle)
|
||||||
|
return SCORE_MIN;
|
||||||
|
|
||||||
|
int n = strlen(needle);
|
||||||
|
int m = strlen(haystack);
|
||||||
|
score_t match_bonus[m];
|
||||||
|
precompute_bonus(haystack, match_bonus);
|
||||||
|
|
||||||
|
if(m > MATCH_FUZZY_MAX_LEN || n > m) {
|
||||||
|
/*
|
||||||
|
* Unreasonably large candidate: return no score
|
||||||
|
* If it is a valid match it will still be returned, it will
|
||||||
|
* just be ranked below any reasonably sized candidates
|
||||||
|
*/
|
||||||
|
return SCORE_MIN;
|
||||||
|
} else if(n == m) {
|
||||||
|
/* Since this method can only be called with a haystack which
|
||||||
|
* matches needle. If the lengths of the strings are equal the
|
||||||
|
* strings themselves must also be equal (ignoring case).
|
||||||
|
*/
|
||||||
|
return SCORE_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* D[][] Stores the best score for this position ending with a match.
|
||||||
|
* M[][] Stores the best possible score at this position.
|
||||||
|
*/
|
||||||
|
score_t D[2][MATCH_FUZZY_MAX_LEN], M[2][MATCH_FUZZY_MAX_LEN];
|
||||||
|
|
||||||
|
score_t* last_D, *last_M;
|
||||||
|
score_t* curr_D, *curr_M;
|
||||||
|
|
||||||
|
last_D = D[0];
|
||||||
|
last_M = M[0];
|
||||||
|
curr_D = D[1];
|
||||||
|
curr_M = M[1];
|
||||||
|
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
match_row(i, curr_D, curr_M, last_D, last_M, needle, haystack, n, m, match_bonus);
|
||||||
|
|
||||||
|
SWAP(curr_D, last_D, score_t *);
|
||||||
|
SWAP(curr_M, last_M, score_t *);
|
||||||
|
}
|
||||||
|
|
||||||
|
return last_M[m - 1];
|
||||||
|
}
|
||||||
|
|
||||||
void utils_mkdir(char* path, mode_t mode) {
|
void utils_mkdir(char* path, mode_t mode) {
|
||||||
if(access(path, F_OK) != 0) {
|
if(access(path, F_OK) != 0) {
|
||||||
char* tmp = strdup(path);
|
char* tmp = strdup(path);
|
||||||
|
60
src/wofi.c
60
src/wofi.c
@ -1111,43 +1111,33 @@ static gboolean do_multi_filter(GtkFlowBoxChild* row, gpointer data) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static gint fuzzy_sort(const gchar* text1, const gchar* text2) {
|
static gint fuzzy_sort(const gchar *text1, const gchar *text2) {
|
||||||
char* _filter = strdup(filter);
|
gboolean match1 = do_fuzzy_strcomp(filter, text1);
|
||||||
size_t len = strlen(_filter);
|
gboolean match2 = do_fuzzy_strcomp(filter, text2);
|
||||||
|
// both filters match do fuzzy scoring
|
||||||
char* t1 = strdup(text1);
|
if(match1 && match2) {
|
||||||
size_t t1l = strlen(t1);
|
score_t dist1 = utils_fuzzy_score(text1, filter);
|
||||||
|
score_t dist2 = utils_fuzzy_score(text2, filter);
|
||||||
char* t2 = strdup(text2);
|
if (dist1 == dist2) {
|
||||||
size_t t2l = strlen(t2);
|
// same same
|
||||||
|
return 0;
|
||||||
if(insensitive) {
|
} else if (dist1 > dist2) { // highest score wins.
|
||||||
for(size_t count = 0; count < len; ++count) {
|
// text1 goes first
|
||||||
char chr = _filter[count];
|
return -1;
|
||||||
if(isalpha(chr)) {
|
} else {
|
||||||
_filter[count] = tolower(chr);
|
// text2 goes first
|
||||||
}
|
return 1;
|
||||||
}
|
|
||||||
for(size_t count = 0; count < t1l; ++count) {
|
|
||||||
char chr = t1[count];
|
|
||||||
if(isalpha(chr)) {
|
|
||||||
t1[count] = tolower(chr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(size_t count = 0; count < t2l; ++count) {
|
|
||||||
char chr = t2[count];
|
|
||||||
if(isalpha(chr)) {
|
|
||||||
t2[count] = tolower(chr);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
} else if(match1) {
|
||||||
|
// text1 goes first
|
||||||
|
return -1;
|
||||||
|
} else if(match2) {
|
||||||
|
// text2 goes first
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
// same same.
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t dist1 = utils_distance(t1, _filter);
|
|
||||||
size_t dist2 = utils_distance(t2, _filter);
|
|
||||||
free(_filter);
|
|
||||||
free(t1);
|
|
||||||
free(t2);
|
|
||||||
return dist1 - dist2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// we sort based on how early in the string all the matches are.
|
// we sort based on how early in the string all the matches are.
|
||||||
|
Loading…
Reference in New Issue
Block a user