utf8expr

expr(1) for UTF-8
git clone git://r-36.net/utf8expr
Log | Files | Refs | LICENSE

utf8expr.c (2190B)


      1 /*
      2  * Copy me if you can.
      3  * by 20h
      4  */
      5 
      6 #include <unistd.h>
      7 #include <string.h>
      8 #include <stdlib.h>
      9 #include <stdio.h>
     10 #include <libgen.h>
     11 
     12 #include "arg.h"
     13 
     14 char *argv0; /* program name as invoked: set from argv[0] in main(), printed by usage() */
     15 
     16 /*
     17  * Idea taken from:
     18  *	http://canonical.org/~kragen/strlen-utf8.html
     19  */
     20 size_t
     21 utf8strlen(char *s)
     22 {
     23 	size_t i;
     24 
     25 	i = 0;
     26 	for (; s[0]; s++) {
     27 		if ((s[0] & 0xc0) != 0x80)
     28 			i++;
     29 	}
     30 
     31 	return i;
     32 }
     33 
     34 char *
     35 utf8strchr(char *s, char *c)
     36 {
     37 	size_t j, cl;
     38 
     39 	cl = strlen(c);
     40 	if (cl == 0)
     41 		return NULL;
     42 
     43 	for (j = 0; ; s++) {
     44 		if (j > 6)
     45 			return NULL;
     46 		j++;
     47 
     48 		if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') {
     49 			if (cl == j) {
     50 				if (!memcmp(&s[-j], c, cl))
     51 					return &s[-j];
     52 			}
     53 			j = 0;
     54 
     55 			if (s[0] == '\0')
     56 				break;
     57 		}
     58 	}
     59 
     60 	return NULL;
     61 }
     62 
     63 char *
     64 utf8substr(char *s, size_t pos, size_t *length)
     65 {
     66 	size_t i, j, rl;
     67 	char *ret;
     68 
     69 	if (*length < 1)
     70 		return NULL;
     71 
     72 	ret = NULL;
     73 	rl = 0;
     74 	for (i = 0, j = 0; *length > 0; s++) {
     75 		if (j > 6)
     76 			return NULL;
     77 		j++;
     78 
     79 		if (ret != NULL)
     80 			rl++;
     81 
     82 		if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') {
     83 			if (i >= pos) {
     84 				if (ret == NULL) {
     85 					ret = &s[-j];
     86 					rl = j;
     87 				}
     88 				(*length)--;
     89 			}
     90 			i++;
     91 			j = 0;
     92 
     93 			if (s[0] == '\0')
     94 				break;
     95 		}
     96 	}
     97 
     98 	*length = rl;
     99 	return ret;
    100 }
    101 
    102 size_t
    103 utf8index(char *s, char *chars)
    104 {
    105 	size_t i, j;
    106 	char c[7];
    107 
    108 	j = 0;
    109 	for (i = 0; ; s++) {
    110 		if (j > 6)
    111 			return 0;
    112 		j++;
    113 
    114 		if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') {
    115 			memset(c, 0, sizeof(c));
    116 			memmove(c, &s[-j], j);
    117 			if (utf8strchr(chars, c))
    118 				return i;
    119 			i++;
    120 			j = 0;
    121 
    122 			if (s[0] == '\0')
    123 				break;
    124 		}
    125 	}
    126 
    127 	return 0;
    128 }
    129 
    130 void
    131 usage(void)
    132 {
    133 	fprintf(stderr, "usage: %s [substr|index|length] str [args ...]\n",
    134 			basename(argv0));
    135 	exit(1);
    136 }
    137 
    138 int
    139 main(int argc, char *argv[])
    140 {
    141 	char *s;
    142 	size_t len;
    143 
    144 	argv0 = argv[0];
    145 
    146 	if (argc < 3)
    147 		usage();
    148 
    149 	switch(argv[1][0]) {
    150 	case 'i':
    151 		if (argc < 4)
    152 			usage();
    153 		printf("%ld\n", utf8index(argv[2], argv[3]));
    154 		break;
    155 	case 'l':
    156 		printf("%ld\n", utf8strlen(argv[2]));
    157 		break;
    158 	case 's':
    159 		if (argc < 5)
    160 			usage();
    161 		len = atoi(argv[4]);
    162 		s = utf8substr(argv[2], atoi(argv[3]), &len);
    163 		if (s == NULL)
    164 			return -1;
    165 		printf("%.*s\n", (int)len, s);
    166 		break;
    167 	default:
    168 		usage();
    169 	};
    170 
    171 	return 0;
    172 }
    173