utf8expr

expr(1) for UTF-8
git clone git://r-36.net/utf8expr
Log | Files | Refs | LICENSE

commit 4d85a682220a55d83e8c6460808329e72becca36
Author: Christoph Lohmann <20h@r-36.net>
Date:   Mon, 21 May 2012 17:35:22 +0200

Initial commit.

Diffstat:
LICENSE | 21+++++++++++++++++++++
Makefile | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arg.h | 41+++++++++++++++++++++++++++++++++++++++++
config.mk | 23+++++++++++++++++++++++
utf8expr.1 | 41+++++++++++++++++++++++++++++++++++++++++
utf8expr.c | 173+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 355 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -0,0 +1,21 @@ +MIT/X Consortium License + +© 2012 Christoph Lohmann <20h@r-36.net> + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile @@ -0,0 +1,56 @@ +# utf8expr – expr(1) for utf8 +# See LICENSE file for copyright and license details. 
+ +include config.mk + +SRC = ${NAME}.c +OBJ = ${SRC:.c=.o} + +all: options ${NAME} + +options: + @echo ${NAME} build options: + @echo "CFLAGS = ${CFLAGS}" + @echo "LDFLAGS = ${LDFLAGS}" + @echo "CC = ${CC}" + +.c.o: + @echo CC $< + @${CC} -c ${CFLAGS} $< + +${OBJ}: config.mk + +${NAME}: ${OBJ} + @echo CC -o $@ + @${CC} -o $@ ${OBJ} ${LDFLAGS} + +clean: + @echo cleaning + @rm -f ${NAME} ${OBJ} ${NAME}-${VERSION}.tar.gz + +dist: clean + @echo creating dist tarball + @mkdir -p ${NAME}-${VERSION} + @cp -R LICENSE Makefile config.mk \ + ${SRC} ${NAME}.8 *.h ${NAME}-${VERSION} + @tar -cf ${NAME}-${VERSION}.tar ${NAME}-${VERSION} + @gzip ${NAME}-${VERSION}.tar + @rm -rf ${NAME}-${VERSION} + +install: all + @echo installing executable file to ${DESTDIR}${PREFIX}/bin + @mkdir -p ${DESTDIR}${PREFIX}/bin + @cp -f ${NAME} ${DESTDIR}${PREFIX}/bin + @chmod 755 ${DESTDIR}${PREFIX}/bin/${NAME} + @echo installing manual page to ${DESTDIR}${MANPREFIX}/man1 + @mkdir -p ${DESTDIR}${MANPREFIX}/man1 + @cp -f ${NAME}.1 ${DESTDIR}${MANPREFIX}/man1 + @chmod 644 ${DESTDIR}${MANPREFIX}/man1/${NAME}.1 + +uninstall: + @echo removing executable file from ${DESTDIR}${PREFIX}/bin + @rm -f ${DESTDIR}${PREFIX}/bin/${NAME} + @echo removing manual page from ${DESTDIR}${PREFIX}/man1 + @rm -f ${DESTDIR}${MANPREFIX}/man1/${NAME}.1 + +.PHONY: all options clean dist install uninstall diff --git a/arg.h b/arg.h @@ -0,0 +1,41 @@ +/* + * Copy me if you can. 
+ * by 20h + */ + +#ifndef __ARG_H__ +#define __ARG_H__ + +extern char *argv0; + +#define USED(x) ((void)(x)) + +#define ARGBEGIN for (argv0 = *argv, argv++, argc--;\ + argv[0] && argv[0][1]\ + && argv[0][0] == '-';\ + argc--, argv++) {\ + char _argc;\ + char **_argv;\ + if (argv[0][1] == '-' && argv[0][2] == '\0') {\ + argv++;\ + argc--;\ + break;\ + }\ + for (argv[0]++, _argv = argv; argv[0][0];\ + argv[0]++) {\ + if (_argv != argv)\ + break;\ + _argc = argv[0][0];\ + switch (_argc) + +#define ARGEND }\ + USED(_argc);\ + }\ + USED(argv);\ + USED(argc); + +#define EARGF(x) ((argv[1] == NULL)? ((x), abort(), (char *)0) :\ + (argc--, argv++, argv[0])) + +#endif + diff --git a/config.mk b/config.mk @@ -0,0 +1,23 @@ +# utf8expr metadata +NAME = utf8expr +VERSION = 0.8 + +# Customize below to fit your system + +# paths +PREFIX = /usr/local +MANPREFIX = ${PREFIX}/share/man + +# includes and libs +INCS = -I. -I/usr/include +LIBS = -L/usr/lib -lc + +# flags +CPPFLAGS = -DVERSION=\"${VERSION}\" +CFLAGS = -g -std=c99 -pedantic -Wall -O0 ${INCS} ${CPPFLAGS} +LDFLAGS = -static -g ${LIBS} +#LDFLAGS = -s ${LIBS} + +# compiler and linker +CC = cc + diff --git a/utf8expr.1 b/utf8expr.1 @@ -0,0 +1,41 @@ +.Dd May 21, 2012 +.Dt UTF8EXPR 1 +.Os +. +.Sh NAME +.Nm utf8expr +.Nd evaluate UTF-8 expressions +. +.Sh SYNOPSIS +.Nm +.Bk -words +EXPRESSION +. +.Sh DESCRIPTION +.Bd -filled +.Nm +will evaluate a subset of the expr(1) syntax while taking care +of UTF-8 characters. +.Ed +. +.Sh EXPRESSIONS +.Pp +.Bl -tag -width ".Fl test Ao Ar string Ac" +. +.It substr STRING POS LENGTH +substring of STRING, POS counted from 1 +. +.It index STRING CHARS +index in STRING where any CHARS is found, or 0 +. +.It length STRING +length of STRING +. +.Sh AUTHORS +See the LICENSE file for the authors of this software. +. +.Sh LICENSE +.Nm +is released under the MIT/X Consortium License. +. + diff --git a/utf8expr.c b/utf8expr.c @@ -0,0 +1,173 @@ +/* + * Copy me if you can. 
+ * by 20h + */ + +#include <unistd.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <libgen.h> + +#include "arg.h" + +char *argv0; + +/* + * Idea taken from: + * http://canonical.org/~kragen/strlen-utf8.html + */ +size_t +utf8strlen(char *s) +{ + size_t i; + + i = 0; + for (; s[0]; s++) { + if ((s[0] & 0xc0) != 0x80) + i++; + } + + return i; +} + +char * +utf8strchr(char *s, char *c) +{ + size_t j, cl; + + cl = strlen(c); + if (cl == 0) + return NULL; + + for (j = 0; ; s++) { + if (j > 6) + return NULL; + j++; + + if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') { + if (cl == j) { + if (!memcmp(&s[-j], c, cl)) + return &s[-j]; + } + j = 0; + + if (s[0] == '\0') + break; + } + } + + return NULL; +} + +char * +utf8substr(char *s, size_t pos, size_t *length) +{ + size_t i, j, rl; + char *ret; + + if (*length < 1) + return NULL; + + ret = NULL; + rl = 0; + for (i = 0, j = 0; *length > 0; s++) { + if (j > 6) + return NULL; + j++; + + if (ret != NULL) + rl++; + + if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') { + if (i >= pos) { + if (ret == NULL) { + ret = &s[-j]; + rl = j; + } + (*length)--; + } + i++; + j = 0; + + if (s[0] == '\0') + break; + } + } + + *length = rl; + return ret; +} + +size_t +utf8index(char *s, char *chars) +{ + size_t i, j; + char c[7]; + + j = 0; + for (i = 0; ; s++) { + if (j > 6) + return 0; + j++; + + if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') { + memset(c, 0, sizeof(c)); + memmove(c, &s[-j], j); + if (utf8strchr(chars, c)) + return i; + i++; + j = 0; + + if (s[0] == '\0') + break; + } + } + + return 0; +} + +void +usage(void) +{ + fprintf(stderr, "usage: %s [substr|index|length] str [args ...]\n", + basename(argv0)); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + char *s; + size_t len; + + argv0 = argv[0]; + + if (argc < 3) + usage(); + + switch(argv[1][0]) { + case 'i': + if (argc < 4) + usage(); + printf("%ld\n", utf8index(argv[2], argv[3])); + break; + case 'l': + printf("%ld\n", utf8strlen(argv[2])); + break; + 
case 's': + if (argc < 5) + usage(); + len = atoi(argv[4]); + s = utf8substr(argv[2], atoi(argv[3]), &len); + if (s == NULL) + return -1; + printf("%.*s\n", (int)len, s); + break; + default: + usage(); + }; + + return 0; +} +