#include "parser.h"

#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Maximum number of characters kept for a single word by _auto_readw(). */
enum { AUTO_WORD_MAX = 30 };

/*
 * compile_or_exit(re, pattern)
 * Compiles `pattern` as a case-insensitive POSIX extended regex into `re`.
 * On failure prints the regerror() message to stderr and exits with status 2.
 */
static void compile_or_exit(regex_t *re, const char *pattern)
{
    int rc = regcomp(re, pattern, REG_ICASE | REG_EXTENDED);

    if (rc != 0) {
        char message[300];

        regerror(rc, re, message, sizeof message);
        fprintf(stderr, "REGEX compilation error: %s\n", message);
        exit(2);
    }
}

/*
 * strip_elements(content, open_pattern, close_pattern)
 * Removes, in place, every span of `content` that starts at a match of
 * `open_pattern` and ends at the first following match of `close_pattern`.
 * An opening match with no closing match after it is left untouched.
 * Returns the length of `content` after removal (0 if `content` is NULL).
 *
 * POSIX regexes have no non-greedy quantifier, so a single "open.*close"
 * pattern would swallow everything between the first opening tag and the
 * LAST closing tag; searching for the closing tag separately avoids that.
 */
static size_t strip_elements(char *content, const char *open_pattern,
                             const char *close_pattern)
{
    regex_t open_re;
    regex_t close_re;
    regmatch_t open_match;
    regmatch_t close_match;
    char *cursor = content;

    if (content == NULL) {
        return 0;
    }

    compile_or_exit(&open_re, open_pattern);
    compile_or_exit(&close_re, close_pattern);

    while (regexec(&open_re, cursor, 1, &open_match, 0) == 0) {
        char *start = cursor + open_match.rm_so;
        char *inner = cursor + open_match.rm_eo;
        char *tail;

        if (regexec(&close_re, inner, 1, &close_match, 0) != 0) {
            break; /* unterminated element: nothing more to remove */
        }

        /* Slide the remainder (including its NUL) over the removed span. */
        tail = inner + close_match.rm_eo;
        memmove(start, tail, strlen(tail) + 1);

        /* Resume scanning where the removed element used to start. */
        cursor = start;
    }

    regfree(&close_re);
    regfree(&open_re);

    return strlen(content);
}

/*
 * cleanSource(content)
 * Removes every <script ...>...</script> element from `content`, in place.
 * Returns the new length of `content`.
 * Exits with status 2 if a regex fails to compile.
 */
size_t cleanSource(char* content)
{
    return strip_elements(content, "<script[ >]", "</script>");
}

/*
 * cleanSourceStyle(content)
 * Removes every <style ...>...</style> element from `content`, in place.
 * Returns the new length of `content`.
 * Exits with status 2 if a regex fails to compile.
 */
size_t cleanSourceStyle(char* content)
{
    return strip_elements(content, "<style[ >]", "</style>");
}

/* _auto_next(page)
 * Advances page->offset past every character for which empty_c() is true
 * and returns the first character that is not (0 at the end of the content).
 * The returned character itself is not consumed.
 */
char _auto_next(parser_t page)
{
    char c = page->content[page->offset];

    while (c != 0 && empty_c(c)) {
        c = page->content[++page->offset];
    }

    return c;
}

/* _auto_readw(page)
 * Reads one word starting at page->offset: characters are consumed until the
 * end of the content, a character for which empty_c() is true, or '<'.
 * Only the first AUTO_WORD_MAX characters are stored; the rest of the word is
 * consumed but dropped.
 * Returns a NUL-terminated, malloc()ed string that the caller must free(),
 * or NULL if the allocation fails (page->offset is then left unchanged).
 */
char* _auto_readw(parser_t page)
{
    char *word = malloc(AUTO_WORD_MAX + 1);
    size_t len = 0;
    char c;

    if (word == NULL) {
        return NULL;
    }

    /* Always hand back a valid string, even when no character is read. */
    word[0] = 0;

    c = page->content[page->offset];
    while (c != 0 && !empty_c(c) && c != '<') {
        if (len < AUTO_WORD_MAX) {
            word[len++] = c;
            word[len] = 0;
        }
        c = page->content[++page->offset];
    }

    return word;
}

/* _auto_cond(page, sql, pid)
 * Walks the page content from page->offset to its end. Tags ('<' up to the
 * matching '>') are skipped; every other word longer than 3 characters is
 * inserted into the `keywords` table together with `pid`.
 * Both values are escaped with mysql_real_escape_string() because the words
 * come straight from the page text. A failed insert is reported on stderr
 * and the walk continues; an allocation failure stops the walk.
 */
void _auto_cond(parser_t page, MYSQL* sql, char* pid)
{
    char query[300];
    char word_escaped[2 * AUTO_WORD_MAX + 1]; /* worst case: every byte escaped */
    char *pid_escaped;
    size_t pid_len;
    char c;

    pid_len = strlen(pid);
    pid_escaped = malloc(2 * pid_len + 1);
    if (pid_escaped == NULL) {
        fprintf(stderr, "_auto_cond: out of memory\n");
        return;
    }
    mysql_real_escape_string(sql, pid_escaped, pid, (unsigned long)pid_len);

    while ((c = _auto_next(page)) != 0) {
        if (c == '<') {
            /* Next token is a tag: skip it, but never run past the NUL. */
            while (page->content[page->offset] != 0 &&
                   page->content[page->offset] != '>') {
                page->offset++;
            }
            if (page->content[page->offset] == '>') {
                page->offset++;
            }
        } else {
            /* Next token is a word. */
            char *word = _auto_readw(page);
            size_t word_len;

            if (word == NULL) {
                fprintf(stderr, "_auto_cond: out of memory\n");
                break;
            }

            word_len = strlen(word);
            if (word_len > 3) {
                int written;

                mysql_real_escape_string(sql, word_escaped, word,
                                         (unsigned long)word_len);
                written = snprintf(query, sizeof query,
                                   "INSERT INTO keywords (name, page) VALUES (\"%s\", \"%s\")",
                                   word_escaped, pid_escaped);

                /* Never send a truncated statement to the server. */
                if (written > 0 && (size_t)written < sizeof query) {
                    if (mysql_query(sql, query) != 0) {
                        fprintf(stderr, "keyword insert failed: %s\n",
                                mysql_error(sql));
                    }
                } else {
                    fprintf(stderr, "keyword insert skipped: query too long\n");
                }
            }
            free(word);
        }
    }

    free(pid_escaped);
}

/* parse(Content, sql, pid)
 * Allocates a parser over `Content` (which is referenced, not copied), with
 * offset 0 and empty `url` and `mot` stacks, then indexes its keywords
 * through _auto_cond().
 * Returns the newly allocated parser, or NULL if the allocation fails.
 * NOTE(review): the matching release routine is not visible in this file;
 * confirm how callers are expected to free the returned parser.
 */
parser_t parse(char* Content, MYSQL* sql, char* pid)
{
    parser_t page = malloc(PARSER_SIZE);

    if (page == NULL) {
        return NULL;
    }

    page->content = Content;
    page->offset = 0;
    page->url = empty_stack();
    page->mot = empty_stack();

    _auto_cond(page, sql, pid);

    return page;
}