nlsplit

split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit bc6dbd4791c68e9460556665759b210a944c9401
parent a17b7c76a1711b656324f9839b757d9c5db030d8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 23:59:13 +0200

add NON_ALPHA_SCORE

Diffstat:
nlsplit.c | 5++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nlsplit.c b/nlsplit.c
@@ -41,6 +41,8 @@ int size;
 #define NEWLINE_SCORE 2
 /* whitespace is a last-resort split */
 #define WHITESPACE_SCORE 1
+/* non-alpha is a last-resort split */
+#define NON_ALPHA_SCORE 0.5
 
 #define MAX(a, b) (((a) > (b)) ? (a) : (b))
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -118,7 +120,6 @@ int push(point *points, int hd, int *tl, float confidence, long position, long o
   return (*tl = push2(points, hd, *tl, confidence + old, position));
 }
 
-
 int split() {
   /* perform the split */
 
@@ -237,6 +238,8 @@ int split() {
       word_has_uppercase = 0;
     if (c >= 'A' && c <= 'Z')
       word_has_uppercase = 1;
+    if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z'))
+      current += NON_ALPHA_SCORE;
     // TODO more punct
     if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')')
       last_punct = c;