2 * ModSecurity for Apache 2.x, http://www.modsecurity.org/
3 * Copyright (c) 2004-2009 Breach Security, Inc. (http://www.breach.com/)
5 * This product is released under the terms of the General Public Licence,
6 * version 2 (GPLv2). Please refer to the file LICENSE (included with this
7 * distribution) which contains the complete text of the licence.
9 * There are special exceptions to the terms and conditions of the GPL
10 * as it is applied to this software. View the full text of the exception in
11 * file MODSECURITY_LICENSING_EXCEPTION in the directory of this software
14 * If any of the files related to licensing are missing or if you have any
15 * other questions related to licensing please contact Breach Security, Inc.
16 * directly using the email address support@breach.com.
20 /* Aho-Corasick Matching */
26 #include "utf8tables.h"
29 #define acmp_utf8_char_t long
31 #define utf8_lcase(a) apr_tolower(a)
34 #include <apr_tables.h>
40 *******************************************************************************
41 *******************************************************************************
42 * Data structures for acmp parser
48 typedef struct acmp_node_t acmp_node_t;
49 typedef struct acmp_btree_node_t acmp_btree_node_t;
51 acmp_utf8_char_t letter;
53 acmp_callback_t callback;
63 acmp_btree_node_t *btree;
71 struct acmp_btree_node_t {
72 acmp_utf8_char_t letter;
73 acmp_btree_node_t *left;
74 acmp_btree_node_t *right;
79 * Data related to parser, not to individual nodes
85 int is_case_sensitive;
86 apr_pool_t *parent_pool;
90 apr_size_t longest_entry;
92 acmp_node_t *root_node;
94 const char *data_start;
99 apr_size_t *bp_buffer;
100 apr_size_t bp_buff_len;
102 acmp_node_t *active_node;
104 apr_size_t u8buff_len;
105 apr_size_t hit_count;
106 int is_failtree_done;
113 *******************************************************************************
114 *******************************************************************************
115 * Functions for UTF-8 support
120 * Returns length of utf-8 sequence based on its first byte
122 static int utf8_seq_len(const char *first_byte) {
123 return utf8_seq_lengths[(unsigned int)(unsigned char)first_byte[0]];
127 * Returns length of utf8-encoded text
129 static size_t utf8_strlen(const char *str) {
133 c += utf8_seq_len(c);
140 * Returns ucs code for given utf-8 sequence
142 static acmp_utf8_char_t utf8_decodechar(const char *str) {
143 int len = utf8_seq_len(str);
144 acmp_utf8_char_t ch = 0;
146 case 6: ch += (unsigned char)*str++; ch <<= 6;
147 case 5: ch += (unsigned char)*str++; ch <<= 6;
148 case 4: ch += (unsigned char)*str++; ch <<= 6;
149 case 3: ch += (unsigned char)*str++; ch <<= 6;
150 case 2: ch += (unsigned char)*str++; ch <<= 6;
151 case 1: ch += (unsigned char)*str++;
153 ch -= utf8_offsets[len - 1];
158 * Returns lowercase for given unicode character. Searches through
159 * utf8_lcase_map table, if it doesn't find the code assumes
160 * it doesn't have a lowercase variant and returns code itself.
162 static long utf8_lcase(acmp_utf8_char_t ucs_code) {
163 long mid, left, right;
165 right = UTF8_LCASEMAP_LEN * 2 + 1;
167 while (left <= right) {
168 mid = (left + right) >> 1;
169 mid -= (mid % 2); mid++;
170 if (ucs_code > utf8_lcase_map[mid])
172 else if (ucs_code < utf8_lcase_map[mid])
174 else if (ucs_code == utf8_lcase_map[mid])
175 return utf8_lcase_map[mid - 1];
182 *******************************************************************************
183 *******************************************************************************
184 * Code for local / static utility functions
188 * Returns length of given string for parser's encoding
190 static size_t acmp_strlen(ACMP *parser, const char *str) {
192 return (parser->is_utf8 == 0) ? strlen(str) : utf8_strlen(str);
199 * Turns string to array of ucs values, depending on parser's encoding
200 * str - string to convert, doesn't have to be NULL-terminated
201 * ucs_chars - where to write ucs values
202 * len - length of input string
204 static void acmp_strtoucs(ACMP *parser, const char *str, acmp_utf8_char_t *ucs_chars, int len) {
209 if (parser->is_utf8) {
210 for (i = 0; i < len; i++) {
211 *(ucs_chars++) = utf8_decodechar(c);
212 c += utf8_seq_len(c);
217 for (i = 0; i < len; i++) {
218 *(ucs_chars++) = *(c++);
224 * Returns node with given letter, or null if not found
226 static acmp_node_t *acmp_child_for_code(acmp_node_t *parent_node, acmp_utf8_char_t ucs_code) {
227 acmp_node_t *node = parent_node->child;
228 if (node == NULL) return NULL;
230 if (node->letter == ucs_code) return node;
231 node = node->sibling;
232 if (node == NULL) return NULL;
237 * Adds node to parent node, if it is not already there
239 static void acmp_add_node_to_parent(acmp_node_t *parent, acmp_node_t *child) {
240 acmp_node_t *node = NULL;
242 child->parent = parent;
243 if (parent->child == NULL) {
244 parent->child = child;
248 node = parent->child;
250 if (node == child) return;
251 if (node->sibling == NULL) {
252 node->sibling = child;
255 node = node->sibling;
260 * Copies values from one node to another, without child/sibling/fail pointers
261 * and without state variables.
263 static void acmp_clone_node_no_state(acmp_node_t *from, acmp_node_t *to) {
264 memcpy(to, from, sizeof(acmp_node_t));
272 * Copies sibling nodes and child node for from given "from" node to "to" node.
273 * Both nodes must already exist.
275 static void acmp_copy_nodes_recursive(acmp_node_t *from, acmp_node_t *to, apr_pool_t *pool) {
276 acmp_node_t *old_node = from->child, *new_node, *nn2;
277 if (old_node == NULL) return;
278 nn2 = apr_pcalloc(pool, sizeof(acmp_node_t));
279 /* ENH: Check alloc succeded */
280 acmp_clone_node_no_state(old_node, nn2);
283 acmp_copy_nodes_recursive(from->child, to->child, pool);
286 old_node = old_node->sibling;
287 if (old_node == NULL) break;
288 new_node = apr_pcalloc(pool, sizeof(acmp_node_t));
289 /* ENH: Check alloc succeded */
290 acmp_clone_node_no_state(old_node, new_node);
291 new_node->parent = to;
292 nn2->sibling = new_node;
294 acmp_copy_nodes_recursive(old_node, new_node, pool);
298 static inline acmp_node_t *acmp_btree_find(acmp_node_t *node, acmp_utf8_char_t letter) {
299 acmp_btree_node_t *bnode = node->btree;
301 if (bnode == NULL) return NULL;
302 if (bnode->letter == letter) return bnode->node;
303 if (bnode->letter > letter) {
306 bnode = bnode->right;
314 static inline acmp_node_t *acmp_goto(acmp_node_t *node, acmp_utf8_char_t letter) {
315 return acmp_btree_find(node, letter);
319 * Connects each node with its first fail node that is end of a phrase.
321 static void acmp_connect_other_matches(ACMP *parser, acmp_node_t *node) {
322 acmp_node_t *child, *om;
324 for (child = node->child; child != NULL; child = child->sibling) {
325 if (child->fail == NULL) continue;
326 for (om = child->fail; om != parser->root_node; om = om->fail) {
334 /* Go recursively through children of this node that have a child node */
335 for(child = node->child; child != NULL; child = child->sibling) {
336 if (child->child != NULL) acmp_connect_other_matches(parser, child);
341 * Adds leaves to binary tree, working from sorted array of keyword tree nodes
343 static void acmp_add_btree_leaves(acmp_btree_node_t *node, acmp_node_t *nodes[],
344 int pos, int lb, int rb, apr_pool_t *pool) {
346 int left = 0, right = 0;
347 if ((pos - lb) > 1) {
348 left = lb + (pos - lb) / 2;
349 node->left = apr_pcalloc(pool, sizeof(acmp_btree_node_t));
350 /* ENH: Check alloc succeded */
351 node->left->node = nodes[left];
352 node->left->letter = nodes[left]->letter;
354 fprintf(stderr, "%lc ->left %lc\n", (wint_t)node->node->letter, (wint_t)node->left->node->letter);
357 if ((rb - pos) > 1) {
358 right = pos + (rb - pos) / 2;
359 node->right = apr_pcalloc(pool, sizeof(acmp_btree_node_t));
360 /* ENH: Check alloc succeded */
361 node->right->node = nodes[right];
362 node->right->letter = nodes[right]->letter;
364 fprintf(stderr, "%lc ->right %lc\n", (wint_t)node->node->letter, (wint_t)node->right->node->letter);
367 if (node->right != NULL) {
368 acmp_add_btree_leaves(node->right, nodes, right, pos, rb, pool);
370 if (node->left != NULL) {
371 acmp_add_btree_leaves(node->left, nodes, left, lb, pos, pool);
376 * Builds balanced binary tree from children nodes of given node.
378 static void acmp_build_binary_tree(ACMP *parser, acmp_node_t *node) {
379 apr_size_t count, i, j;
380 acmp_node_t *child = node->child;
384 /* Build an array big enough */
385 for (count = 0; child != NULL; child = child->sibling) count++;
386 nodes = apr_pcalloc(parser->pool, count * sizeof(acmp_node_t *));
387 /* ENH: Check alloc succeded */
389 /* ENH: Combine this in the loop below - we do not need two loops */
391 for (i = 0; i < count; i++) {
393 child = child->sibling;
396 /* We have array with all children of the node and number of those children
398 for (i = 0; i < count - 1; i++)
399 for (j = i + 1; j < count; j++) {
402 if (nodes[i]->letter < nodes[j]->letter) continue;
408 node->btree = apr_pcalloc(parser->pool, sizeof(acmp_btree_node_t));
409 /* ENH: Check alloc succeded */
411 node->btree->node = nodes[pos];
412 node->btree->letter = nodes[pos]->letter;
413 acmp_add_btree_leaves(node->btree, nodes, pos, -1, count, parser->pool);
414 for (i = 0; i < count; i++) {
415 if (nodes[i]->child != NULL) acmp_build_binary_tree(parser, nodes[i]);
420 * Constructs fail paths on keyword trie
422 static apr_status_t acmp_connect_fail_branches(ACMP *parser) {
423 /* Already connected ? */
424 acmp_node_t *child, *node, *goto_node;
425 apr_array_header_t *arr, *arr2, *tmp;
427 if (parser->is_failtree_done != 0) return APR_SUCCESS;
429 parser->root_node->text = "";
430 arr = apr_array_make(parser->pool, 32, sizeof(acmp_node_t *));
431 arr2 = apr_array_make(parser->pool, 32, sizeof(acmp_node_t *));
433 parser->root_node->fail = parser->root_node;
435 /* All first-level children will fail back to root node */
436 for (child = parser->root_node->child; child != NULL; child = child->sibling) {
437 child->fail = parser->root_node;
438 *(acmp_node_t **)apr_array_push(arr) = child;
440 fprintf(stderr, "fail direction: *%s* => *%s*\n", child->text, child->fail->text);
445 while (apr_is_empty_array(arr) == 0) {
446 node = *(acmp_node_t **)apr_array_pop(arr);
447 node->fail = parser->root_node;
448 if (node->parent != parser->root_node) {
449 goto_node = acmp_child_for_code(node->parent->fail, node->letter);
450 node->fail = (goto_node != NULL) ? goto_node : parser->root_node;
453 fprintf(stderr, "fail direction: *%s* => *%s*\n", node->text, node->fail->text);
456 while (child != NULL) {
457 *(acmp_node_t **)apr_array_push(arr2) = child;
458 child = child->sibling;
461 if (apr_is_empty_array(arr2) != 0) break;
467 acmp_connect_other_matches(parser, parser->root_node);
468 if (parser->root_node->child != NULL) acmp_build_binary_tree(parser, parser->root_node);
469 parser->is_failtree_done = 1;
474 * Clears hit count of each node, called from acmp_reset()
476 static void acmp_clear_hit_count_recursive(acmp_node_t *node) {
477 for (; node != NULL; node = node->sibling) {
479 if (node->child != NULL) acmp_clear_hit_count_recursive(node->child);
484 * Called when a match is found
486 static void acmp_found(ACMP *parser, acmp_node_t *node) {
487 if (node->callback) {
488 node->callback(parser, node->callback_data,
489 parser->bp_buffer[(parser->char_pos - node->depth - 1) % parser->bp_buff_len],
490 parser->char_pos - node->depth - 1);
497 *******************************************************************************
498 *******************************************************************************
499 * Code for functions from header file
504 * flags - OR-ed values of ACMP_FLAG constants
505 * pool - apr_pool to use as parent pool, can be set to NULL
507 ACMP *acmp_create(int flags, apr_pool_t *pool) {
512 rc = apr_pool_create(&p, pool);
513 if (rc != APR_SUCCESS) return NULL;
515 parser = apr_pcalloc(p, sizeof(ACMP));
516 /* ENH: Check alloc succeded */
518 parser->parent_pool = pool;
520 parser->is_utf8 = (flags & ACMP_FLAG_UTF8) == 0 ? 0 : 1;
522 parser->is_case_sensitive = (flags & ACMP_FLAG_CASE_SENSITIVE) == 0 ? 0 : 1;
523 parser->root_node = apr_pcalloc(p, sizeof(acmp_node_t));
524 /* ENH: Check alloc succeded */
529 * Destroys previously created parser
531 void acmp_destroy(ACMP *parser) {
533 * All data is kept in parser's pool (including parser struct itself), so
534 * destroying the pool will destroy everything
536 apr_pool_destroy(parser->pool);
540 * Creates parser with same options and same patterns
541 * parser - ACMP parser to duplicate
542 * pool - parent pool to use, if left as NULL original parser's parent pool is used
544 ACMP *acmp_duplicate(ACMP *parser, apr_pool_t *pool) {
549 if (pool == NULL) pool = parser->parent_pool;
550 rc = apr_pool_create(&p, pool);
551 if (rc != APR_SUCCESS) return NULL;
553 new_parser = apr_pcalloc(p, sizeof(ACMP));
554 /* ENH: Check alloc succeded */
555 new_parser->pool = p;
556 new_parser->parent_pool = pool;
558 new_parser->is_utf8 = parser->is_utf8;
560 new_parser->is_case_sensitive = parser->is_case_sensitive;
561 new_parser->root_node = apr_pcalloc(p, sizeof(acmp_node_t));
562 /* ENH: Check alloc succeded */
563 new_parser->dict_count = parser->dict_count;
564 new_parser->longest_entry = parser->longest_entry;
565 acmp_copy_nodes_recursive(parser->root_node, new_parser->root_node, new_parser->pool);
566 acmp_prepare(new_parser);
571 * Creates fail tree and initializes buffer
573 apr_status_t acmp_prepare(ACMP *parser) {
576 if (parser->bp_buff_len < parser->longest_entry) {
577 parser->bp_buff_len = parser->longest_entry * 2;
578 parser->bp_buffer = apr_pcalloc(parser->pool, sizeof(apr_size_t) * parser->bp_buff_len);
579 /* ENH: Check alloc succeded */
582 st = acmp_connect_fail_branches(parser);
583 parser->active_node = parser->root_node;
584 if (st != APR_SUCCESS) return st;
585 parser->is_active = 1;
590 * Adds pattern to parser
591 * parser - ACMP parser
592 * pattern - string with pattern to match
593 * callback - Optional, pointer to an acmp_callback_t function
594 * data - pointer to data that will be passed to callback function, only used if callback
596 * len - Length of pattern in characters, if zero string length is used.
598 apr_status_t acmp_add_pattern(ACMP *parser, const char *pattern,
599 acmp_callback_t callback, void *data, apr_size_t len)
602 acmp_utf8_char_t *ucs_chars;
603 acmp_node_t *parent, *child;
605 if (parser->is_active != 0) return APR_EGENERAL;
607 length = (len == 0) ? acmp_strlen(parser, pattern) : len;
608 ucs_chars = apr_pcalloc(parser->pool, length * sizeof(acmp_utf8_char_t));
609 /* ENH: Check alloc succeded */
611 parent = parser->root_node;
612 acmp_strtoucs(parser, pattern, ucs_chars, length);
614 for (i = 0; i < length; i++) {
615 acmp_utf8_char_t letter = ucs_chars[i];
616 if (parser->is_case_sensitive == 0) {
617 letter = utf8_lcase(letter);
619 child = acmp_child_for_code(parent, letter);
621 child = apr_pcalloc(parser->pool, sizeof(acmp_node_t));
622 /* ENH: Check alloc succeded */
624 child->letter = letter;
626 child->text = apr_pcalloc(parser->pool, strlen(pattern) + 2);
627 /* ENH: Check alloc succeded */
628 for (j = 0; j <= i; j++) child->text[j] = pattern[j];
630 if (i == length - 1) {
631 if (child->is_last == 0) {
632 parser->dict_count++;
634 child->pattern = apr_pcalloc(parser->pool, strlen(pattern) + 2);
635 /* ENH: Check alloc succeded */
636 strcpy(child->pattern, pattern);
638 child->callback = callback;
639 child->callback_data = data;
641 acmp_add_node_to_parent(parent, child);
644 if (length > parser->longest_entry) parser->longest_entry = length;
645 parser->is_failtree_done = 0;
651 * Called to process incoming data stream
652 * data - ptr to incoming data
653 * len - size of data in bytes
655 apr_status_t acmp_process(ACMP *parser, const char *data, apr_size_t len) {
656 acmp_node_t *node, *go_to;
658 apr_size_t seq_length;
662 if (parser->is_failtree_done == 0) acmp_prepare(parser);
664 node = parser->active_node;
668 acmp_utf8_char_t letter;
670 parser->bp_buffer[parser->char_pos % parser->bp_buff_len] = parser->byte_pos;
672 if (parser->is_utf8) {
673 if (parser->u8buff_len > 0) {
674 /* Resuming partial utf-8 sequence */
675 seq_length = utf8_seq_len(parser->u8_buff);
677 parser->u8_buff[parser->u8buff_len++] = *data++;
678 if (parser->u8buff_len == seq_length) {
679 parser->u8buff_len = 0;
680 letter = utf8_decodechar(parser->u8_buff);
681 parser->byte_pos += seq_length;
687 /* not resuming partial sequence, reading from the stream */
688 seq_length = utf8_seq_len(data);
689 if ((data + seq_length) > end) {
690 while (data < end) parser->u8_buff[parser->u8buff_len++] = *data++;
693 letter = utf8_decodechar(data);
695 parser->byte_pos += seq_length;
706 if (parser->is_case_sensitive == 0) letter = utf8_lcase(letter);
709 while (go_to == NULL) {
710 acmp_node_t *n2 = acmp_goto(node, letter);
711 go_to = acmp_child_for_code(node, letter);
713 n2 = acmp_goto(node, letter);
716 if (go_to->is_last) {
717 acmp_found(parser, go_to);
720 if (node == parser->root_node) break;
721 if (go_to == NULL) node = node->fail;
723 if (go_to != NULL) node = go_to;
725 /* We need to collect other nodes that are last letters of phrase. These
726 * will be fail node of current node if it has is_last flag set, and
727 * fail node of that node, recursively down to root node.
730 if (go_to != parser->root_node) {
731 for (go_to = go_to->o_match; go_to != NULL; go_to = go_to->o_match) {
732 acmp_found(parser, go_to);
736 parser->active_node = node;
737 return parser->hit_count > 0 ? 1 : 0;
741 * Resets the state of parser so you can start using it with new set of data.
743 * No need to clear buffer since it will be re-initialized at first run of
746 void acmp_reset(ACMP *parser) {
747 parser->is_active = 0;
748 parser->byte_pos = 0;
749 parser->char_pos = 0;
750 parser->hit_count = 0;
751 parser->u8buff_len = 0;
752 acmp_clear_hit_count_recursive(parser->root_node);
756 * Creates an ACMPT struct that will use parser's tree, without duplicating its data
758 ACMPT *acmp_duplicate_quick(ACMP *parser, apr_pool_t *pool) {
759 apr_pool_t *p = (pool != NULL) ? pool : parser->pool;
760 ACMPT *dup = apr_pcalloc(p, sizeof(ACMPT));
761 /* ENH: Check alloc succeded */
762 dup->parser = parser;
767 * Process the data using ACMPT to keep state, and ACMPT's parser to keep the tree
769 apr_status_t acmp_process_quick(ACMPT *acmpt, const char **match, const char *data, apr_size_t len) {
771 acmp_node_t *node, *go_to;
774 if (acmpt->parser->is_failtree_done == 0) {
775 acmp_prepare(acmpt->parser);
778 parser = acmpt->parser;
779 if (acmpt->ptr == NULL) acmpt->ptr = parser->root_node;
784 acmp_utf8_char_t letter = (unsigned char)*data++;
786 if (parser->is_case_sensitive == 0) letter = utf8_lcase(letter);
789 while (go_to == NULL) {
790 go_to = acmp_goto(node, letter);
792 if (go_to->is_last) {
793 *match = go_to->text;
797 if (node == parser->root_node) break;
798 if (go_to == NULL) node = node->fail;
800 if (go_to != NULL) node = go_to;
802 /* If node has o_match, then we found a pattern */
803 if (node->o_match != NULL) {