nlsplit

split natural language text in chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

nlsplit.c (10375B)


      1 /* nlsplit by a3nm (2011) */
      2 
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <assert.h>
      6 
      7 const char help[] =
      8   "Split natural language text from stdin in pieces less than SIZE bytes.\n"
      9   "If MINCONFIDENCE is set, always cut above this confidence threshold.\n";
     10 
     11 /* above this confidence value, we always split */
     12 float min_confidence = -1;
     13 
     14 /* maximal size of pieces */
     15 int size;
     16 
     17 /* for tab-space equivalence in indentation counts */
     18 #define TAB_LEN 2
     19 
     20 /* score of the last split */
     21 #define EOF_SCORE 999999
     22 /* score per \n, for multiple \n's */
     23 #define MULTIPLE_LINE_SCORE 1000
     24 /* if file contains lines longer than this, then one \n is a safer split */
     25 #define LONG_LINES_THRESHOLD 100
     26 /* if first word of line fits on previous line with this margin, safer split */
     27 #define DELTA_THRESHOLD 10
     28 /* if indentation changes, safer split */
     29 #define INDENT_CHANGE_SCORE 18
     30 /* punctuations where split is safe */
     31 #define FINAL_PUNCT_SCORE 20
     32 /* punctuations where split is less safe */
     33 #define SEMIFINAL_PUNCT_SCORE 18
     34 /* punctuations where split isn't really safe */
     35 #define OTHER_PUNCT_SCORE 10
     36 /* whitespace and case can help punctuation splits up to this limit */
     37 #define MAX_PUNCT_SITUATION_SCORE 3
     38 /* one \n with next line starting by non-lowercase is a safer split */
     39 #define NEWLINE_WITH_NON_LOWERCASE_SCORE 3
     40 /* \n is a last-resort split */
     41 #define NEWLINE_SCORE 2
     42 /* whitespace is a last-resort split */
     43 #define WHITESPACE_SCORE 1
     44 /* non-alpha is a last-resort split */
     45 #define NON_ALPHA_SCORE 0.5
     46 
     47 #define E_SYNTAX 1
     48 #define E_MEMORY 2
     49 
     50 #define MAX(a, b) (((a) > (b)) ? (a) : (b))
     51 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
     52 #define MID(l, r, s) ((((l) + ((r) + ((l) > (r) ? (s) : 0)))/2) % (s))
     53 
     54 /* a possible split point */
     55 typedef struct point {
     56   float confidence;
     57   long position;
     58 } point;
     59 
     60 void usage(char** argv) {
     61   /* show usage and exit */
     62   fprintf(stderr, "Usage: %s SIZE [MINCONFIDENCE]\n", argv[0]);
     63   fprintf(stderr, help);
     64   exit(E_SYNTAX);
     65 }
     66 
     67 float punct_score(char c) {
     68   /* get score of a punctuation */
     69   if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE;
     70   if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE;
     71   return OTHER_PUNCT_SCORE;
     72 }
     73 
     74 float get(point *points, int hd, int tl, long position) {
     75   /* binary search of position in circular buffer (points, hd, tl) */
     76   /* return the position of the best candidate */
     77 
     78   if (hd == tl) return hd;
     79   int m = MID(hd, tl, size);
     80   if (points[m].position == position)
     81     return m;
     82   if (points[m].position > position)
     83     return get(points, hd, m, position);
     84   else
     85     return get(points, (m+1) % size, tl, position);
     86 }
     87 
     88 int push2(point *points, int hd, int tl, float confidence, long position) {
     89   /* insert (confidence, position) in circular buffer (points, hd, tl) */
     90   /* buffer is kept sorted, existing elements below confidence are erased) */
     91   /* insertion position is found by binary search on confidence */
     92   /* return new value for tl */
     93 
     94   assert(hd < size);
     95   assert(tl < size);
     96   if (hd == tl) {
     97     /* insert at tl */
     98     points[tl].position = position;
     99     points[tl].confidence = confidence;
    100     /* increment otl */
    101     return (tl + 1) % size;
    102   }
    103   int m = MID(hd, tl, size);
    104   /* in case of tie, drop elements */
    105   if (points[m].confidence <= confidence)
    106     return push2(points, hd, m, confidence, position);
    107   else
    108     return push2(points, (m+1) % size, tl, confidence, position);
    109 }
    110 
    111 int push(point *points, int hd, int *tl, float confidence, long position,
    112     long offset) {
    113   /* insert (confidence, position) in circular buffer (points, hd, tl) */
    114   /* if position is already in buffer, insert sum of confidence values */
    115   /* perform update of tl, return new value of tl */
    116   /* buffer is sorted by decreasing confidence, and increasing position */
    117   /* except in rare cases where we insert at an old position */
    118   /* which could make us underestimate slightly confidence for some splits */
    119 
    120   assert(hd < size);
    121   assert(*tl < size);
    122 
    123   /* refuse insert of positions before offset */
    124   if (position <= offset) return *tl;
    125   int old_pos = get(points, hd, *tl, position);
    126   float old = points[old_pos].position == position ?
    127       points[old_pos].confidence : 0;
    128   return (*tl = push2(points, hd, *tl, confidence + old, position));
    129 }
    130 
    131 int split() {
    132   /* perform the split */
    133 
    134   /* counter of pieces */
    135   int npiece = 0;
    136 
    137   /* circular buffer for the current piece */
    138   char *piece;
    139   /* offset is the absolute offset relative to BOF */
    140   /* pos is the offset within the piece */
    141   int offset = 0, pos = 0;
    142 
    143   /* circular sorted buffer for the current split points */
    144   /* this is kept sorted by decreasing confidence and increasing position */
    145   point *points;
    146   /* head and tail of the buffer */
    147   int hd = 0, tl = 0;
    148 
    149   /* current and last character */
    150   char c = 0, last = 0;
    151   /* for eof detection */
    152   int c_int;
    153 
    154   /* confidence for split at current position */
    155   float current = 0;
    156 
    157   /* number of successive newlines, indentation of current and last line */
    158   int n_newlines = 0, indent = 0, last_indent = 0;
    159 
    160   /* length of current and last line, maximal line length */
    161   int l_line = 0, l_last_line = 0, max_l_line = 0;
    162 
    163   /* length of first word, number of words on line, uppercase in current word */
    164   int l_first_word = 0, n_words = 0, word_has_uppercase = 0;
    165 
    166   /* last punctuation seen, number of whitespace characters after punctuation */
    167   int last_punct = 0, whitespace_after_punct = 0;
    168   
    169   int i;
    170 
    171   /* allocate memory */
    172   piece = malloc(size * sizeof(char));
    173   points = malloc(size * sizeof(point));
    174   if (!piece || !points) {
    175     perror("malloc");
    176     return E_MEMORY;
    177   }
    178 
    179   /* read file, break after splitting at EOF */
    180   /* do not break when reading EOF, because we must output last chunk */
    181   while (1) {
    182     /* read char */
    183     last = c;
    184     c_int = getchar();
    185     c = c_int;
    186 
    187     /* cut if we have to */
    188     assert(pos <= size);
    189     if (c_int == EOF || pos == size ||
    190         (hd != tl && min_confidence > 0 &&
    191           points[hd].confidence >= min_confidence)) {
    192       /* pop old entries */
    193       while (hd != tl && points[hd].position <= offset)
    194         hd = (hd + 1) % size;
    195       if (hd == tl) {
    196         /* we have no points, we must create one */
    197         push(points, hd, &tl, 0, offset + pos, offset);
    198         assert(points[hd].position > offset);
    199       }
    200       assert(points[hd].position > offset);
    201       if (c_int == EOF) {
    202         /* special EOF cut */
    203         points[hd].confidence = EOF_SCORE;
    204         points[hd].position = offset + pos;
    205       }
    206       printf("-- chunk %d length %ld confidence %f\n",
    207           npiece, points[hd].position - offset, points[hd].confidence);
    208       /* output the data */
    209       for (i = offset; i < points[hd].position ; i++)
    210         putchar(piece[i % size]);
    211       putchar('\n');
    212       /* update offset and pos */
    213       pos = (offset + pos) - points[hd].position;
    214       assert(pos < size);
    215       offset = points[hd].position;
    216       /* pop the point */
    217       hd = (hd + 1) % size;
    218       /* increment piece counter */
    219       npiece++;
    220     }
    221 
    222     /* break if we must */
    223     if (c_int == EOF) break;
    224 
    225     /* add char */
    226     piece[(offset + (pos++)) % size] = c;
    227 
    228     /* produce split points */
    229     current = 0;
    230     if (c == '\n' || c == ' ' || c == '\t')
    231       if (last_punct) whitespace_after_punct++;
    232     if (c == '\n') {
    233       n_words = 0;
    234       l_first_word = 0;
    235       last_indent = indent;
    236       indent = 0;
    237       l_last_line = l_line;
    238       l_line = 0;
    239       max_l_line = MAX(max_l_line, l_line);
    240       n_newlines++;
    241       if (n_newlines > 1) {
    242         current += MULTIPLE_LINE_SCORE * n_newlines;
    243       } else {
    244         if (max_l_line > LONG_LINES_THRESHOLD)
    245           current += (max_l_line - LONG_LINES_THRESHOLD);
    246         current += NEWLINE_SCORE;
    247       }
    248     } else {
    249       n_newlines = 0;
    250       l_line++;
    251       if (c == ' ' || c == '\t') {
    252         current += WHITESPACE_SCORE;
    253         if (l_first_word == 0) {
    254           /* we are still in indentation */
    255           indent += c == '\t' ? 2 : 1;
    256         } else {
    257           if (!n_words) {
    258             /* we have just read the first word */
    259             int delta = max_l_line - l_last_line - l_first_word - 1;
    260             if (delta > DELTA_THRESHOLD) {
    261               /* first word of current line would fit on previous line */
    262               push(points, hd, &tl, delta, offset + pos - l_line - 1, offset);
    263             }
    264           }
    265           n_words += 1;
    266         }
    267       } else {
    268         if (last_punct) {
    269           /* punctuation followed by much whitespace and an uppercase letter */
    270           /* and having a preceding word without uppercase is safer */
    271           int situation_score = MIN(MAX_PUNCT_SITUATION_SCORE,
    272               whitespace_after_punct +
    273               ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1));
    274           push(points, hd, &tl, situation_score + punct_score(last_punct),
    275               offset + pos - whitespace_after_punct, offset);
    276           whitespace_after_punct = 0;
    277           last_punct = 0;
    278         }
    279 
    280         if (indent != last_indent)
    281           push(points, hd, &tl, INDENT_CHANGE_SCORE + abs(indent -
    282                 last_indent), offset + pos - l_line, offset);
    283         if (!n_words && !l_first_word)
    284           /* first char of the line */
    285           if (!(c >= 'a' && c <= 'z'))
    286               push(points, hd, &tl, NEWLINE_WITH_NON_LOWERCASE_SCORE -
    287                   NEWLINE_SCORE, offset + pos - l_line, offset);
    288         if (n_words == 0)
    289           l_first_word++;
    290         if (c >= 'a' && c <= 'z' &&
    291             (last == ' ' || last == '\t' || last == '\n'))
    292           word_has_uppercase = 0;
    293         if (c >= 'A' && c <= 'Z')
    294           word_has_uppercase = 1;
    295         if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z'))
    296           current += NON_ALPHA_SCORE;
    297         // TODO more punct
    298         if (c == '!' || c == '.' || c == ',' || c == ';' || c == ':' || c == '?'
    299             || c == '(' || c == ')')
    300           last_punct = c;
    301       }
    302     }
    303 
    304     /* push point if we have one */
    305     if (current > 0) {
    306       push(points, hd, &tl, current, offset + pos, offset);
    307     }
    308   }
    309 
    310   free(piece);
    311   free(points);
    312 
    313   return 0;
    314 }
    315 
    316 int main(int argc, char **argv) {
    317 
    318   if (argc < 2 || argc > 3) usage(argv);
    319   size = atoi(argv[1]);
    320   if (size <= 0) usage(argv);
    321   if (argc == 3) min_confidence = atoi(argv[2]);
    322   if (!min_confidence) usage(argv);
    323 
    324   return split(size);
    325 }
    326