nlsplit

Split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit c3b66e815db167596142143c5e51401a1def64e3
parent 4ef6175f70791c20d6d6e43ef60b36cd61b19dd0
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun,  9 Oct 2011 19:41:34 +0200

testing

Diffstat:
nlsplit_test.c | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 62 insertions(+), 0 deletions(-)

/* nlsplit_test for nlsplit by a3nm (2011)
 *
 * File: nlsplit_test.c
 *
 * Reads the output of nlsplit on stdin, checks every piece header and copies
 * the payload of each piece to stdout.
 *
 * Input is a sequence of records of the form
 *
 *   -- piece <int> length <int> confidence <float>
 *   <one separator byte><length payload bytes><one separator byte>
 *
 * Header invariants are enforced with assert(), so this tool must be built
 * without NDEBUG for the checks to be active.
 */

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

const char help[] =
  "Check and collect together output of nlsplit from stdin to stdout.\n"
  "SIZE is the maximal size of a chunk, in bytes.\n";

/* maximal size of pieces, in bytes (taken from the SIZE argument) */
int size;

/* process exit codes */
#define E_SYNTAX 1 /* bad command line */
#define E_MEMORY 2 /* reserved: nothing is allocated any more */
#define E_IO 3     /* truncated input or failed write */

/* Print the usage line and the help text to stderr, then exit with E_SYNTAX.
 * argv[0] is used as the program name. Never returns. */
void usage(char **argv) {
  fprintf(stderr, "Usage: %s SIZE\n", argv[0]);
  /* fputs rather than fprintf: help is data, not a format string */
  fputs(help, stderr);
  exit(E_SYNTAX);
}

/* Parse s as a strictly positive decimal number that fits in an int.
 * Returns 1 and stores the value in *out on success; returns 0 (leaving *out
 * untouched) on empty input, trailing garbage, overflow or a value <= 0. */
static int parse_size(const char *s, int *out) {
  char *end;
  long value;

  errno = 0;
  value = strtol(s, &end, 10);
  if (end == s || *end != '\0' || errno == ERANGE)
    return 0;
  if (value <= 0 || value > INT_MAX)
    return 0;

  *out = (int) value;
  return 1;
}

/* Entry point: expects exactly one argument, SIZE.
 * Returns 0 when all of stdin was consumed as well-formed pieces, E_IO when a
 * piece is cut short by end of input or stdout cannot be written; aborts via
 * assert() when a header violates an invariant or stdin ends on something
 * that is not a piece header. */
int main(int argc, char **argv) {
  int last_piece = -1;
  int piece, length;
  float confidence;

  if (argc != 2) usage(argv);
  if (!parse_size(argv[1], &size)) usage(argv);

  while (scanf("-- piece %d length %d confidence %f",
               &piece, &length, &confidence) == 3) {
    /* pieces are numbered consecutively starting from 0 */
    assert(piece == last_piece + 1);
    assert(length > 0);
    assert(length <= size);
    assert(confidence >= 0);

    getchar(); /* separator between header and payload (newline) */

    for (int i = 0; i < length; i++) {
      int c = getchar();
      if (c == EOF) {
        /* without this check the missing bytes would be emitted as
         * (unsigned char) EOF and the run would still succeed */
        fprintf(stderr, "%s: piece %d ends after %d of %d bytes\n",
                argv[0], piece, i, length);
        return E_IO;
      }
      putchar(c);
    }

    /* separator after the payload (newline); its result is deliberately
     * ignored so that a final piece with no trailing separator is accepted */
    getchar();
    last_piece = piece;
  }

  /* the loop must have stopped because the input is exhausted, not because
   * of a malformed header or a read error */
  assert(feof(stdin));

  if (fflush(stdout) == EOF || ferror(stdout)) {
    perror("stdout");
    return E_IO;
  }

  return 0;
}