Bontiv-Sourceer source code viewer
Root | Help
./web-crawler/src/parser.c
#include "parser.h"
#include <regex.h>

/* cleanSource(content)
 * Removes every "<script ...>...</script>" span from content, in place,
 * and returns the length of the resulting string.
 *
 * content must be a writable NUL-terminated string; NULL yields 0.
 * The process exits with status 2 if the pattern cannot be compiled.
 *
 * NOTE(review): POSIX regexes return the longest match and '.' also
 * matches newlines here (REG_NEWLINE is not set), so with several script
 * elements everything from the first opening tag to the last closing tag
 * is removed, markup in between included. Confirm this is acceptable.
 */
size_t cleanSource(char* content)
{
  regex_t reg;
  regmatch_t match;
  char error[300];
  int rc;

  if (content == NULL)
    return 0;

  rc = regcomp(&reg, "<script[ >].*</script>", REG_ICASE | REG_EXTENDED);
  if (rc != 0)
    {
      regerror(rc, &reg, error, sizeof error);
      fprintf(stderr, "REGEX compilation error: %s\n", error);
      exit(2);
    }

  /* The pattern has no sub-groups, so only the overall match exists. */
  while (regexec(&reg, content, 1, &match, 0) == 0)
    {
      char* tail = content + match.rm_eo;

      /* Source and destination overlap; the +1 carries the final NUL. */
      memmove(content + match.rm_so, tail, strlen(tail) + 1);
    }

  regfree(&reg);
  return strlen(content);
}

/* cleanSourceStyle(content)
 * Removes every "<style ...>...</style>" span from content, in place,
 * and returns the length of the resulting string.
 *
 * content must be a writable NUL-terminated string; NULL yields 0.
 * The process exits with status 2 if the pattern cannot be compiled.
 *
 * NOTE(review): POSIX regexes return the longest match and '.' also
 * matches newlines here (REG_NEWLINE is not set), so with several style
 * elements everything from the first opening tag to the last closing tag
 * is removed, markup in between included. Confirm this is acceptable.
 */
size_t cleanSourceStyle(char* content)
{
  regex_t reg;
  regmatch_t match;
  char error[300];
  int rc;

  if (content == NULL)
    return 0;

  rc = regcomp(&reg, "<style[ >].*</style>", REG_ICASE | REG_EXTENDED);
  if (rc != 0)
    {
      regerror(rc, &reg, error, sizeof error);
      fprintf(stderr, "REGEX compilation error: %s\n", error);
      exit(2);
    }

  /* The pattern has no sub-groups, so only the overall match exists. */
  while (regexec(&reg, content, 1, &match, 0) == 0)
    {
      char* tail = content + match.rm_eo;

      /* Source and destination overlap; the +1 carries the final NUL. */
      memmove(content + match.rm_so, tail, strlen(tail) + 1);
    }

  regfree(&reg);
  return strlen(content);
}

/* _auto_next(parser)
 * Moves page->offset forward past every character for which empty_c()
 * is true and returns the character found at the new offset.
 * Returns 0 when the end of the content string is reached.
 */
char _auto_next(parser_t page)
{
  for (;;)
    {
      char current = page->content[page->offset];

      /* Stop on the terminator or on the first non-skippable character. */
      if (current == 0 || !empty_c(current))
        return current;

      page->offset++;
    }
}

/* _auto_readw(parser)
 * Reads the word that starts at page->offset: characters are consumed
 * until the end of the content, a character for which empty_c() is
 * true, or a '<'. page->offset is left on that stopping character.
 *
 * Returns a freshly allocated NUL-terminated string holding at most the
 * first 30 characters of the word (longer words are consumed entirely
 * but truncated). The string is empty when no character was read.
 * The caller must free() the result. Returns NULL if the allocation
 * fails; the word is still consumed in that case.
 */
char* _auto_readw(parser_t page)
{
  enum { WORD_MAX = 30 };
  char c;
  char* mot;
  size_t i;

  /* calloc zero-fills the buffer, so the result is always terminated,
     even when the loop below copies nothing. */
  mot = calloc(WORD_MAX + 1, 1);
  i = 0;

  c = page->content[page->offset];
  while (c != 0 && !empty_c(c) && c != '<')
    {
      if (mot != NULL && i < WORD_MAX)
        {
          mot[i] = c;
          i++;
        }
      c = page->content[++page->offset];
    }

  return mot;
}

/* _auto_cond(parser, sql, pid)
 * Walks the page content from page->offset to its end. Tags ("<...>")
 * are skipped; every other word longer than 3 characters is inserted
 * into the keywords table together with pid.
 *
 * Words come from the crawled page, so they are escaped before being
 * placed in the SQL text. pid is inserted as given.
 * NOTE(review): pid is assumed to be a trusted, caller-generated value
 * -- confirm, or escape it as well.
 * Failed or oversized queries are reported on stderr and skipped.
 */
void _auto_cond(parser_t page, MYSQL* sql, char* pid)
{
  enum { WORD_MAX = 30 };
  char c;
  char *mot;
  char escaped[2 * WORD_MAX + 1];
  char query[300];

  while ((c = _auto_next(page)) != 0)
    {
      if (c == '<') /* The next token is a tag */
        {
          /* Stop at the terminator too, so an unterminated tag cannot
             push the offset past the end of the buffer. */
          while (page->content[page->offset] != 0
                 && page->content[page->offset] != '>')
            {
              page->offset++;
            }
          if (page->content[page->offset] == '>')
            page->offset++;
        }
      else /* The next token is a word */
        {
          mot = _auto_readw(page);
          if (mot != NULL && strlen(mot) > 3)
            {
              size_t len = strlen(mot);
              unsigned long esc_len;
              int written;

              /* escaped[] is sized for WORD_MAX input characters. */
              if (len > WORD_MAX)
                len = WORD_MAX;

              esc_len = mysql_real_escape_string(sql, escaped, mot,
                                                 (unsigned long)len);
              if (esc_len != (unsigned long)-1)
                {
                  written = snprintf(query, sizeof query,
                                     "INSERT INTO keywords (name, page) VALUES (\"%s\", \"%s\")",
                                     escaped, pid);
                  if (written < 0 || (size_t)written >= sizeof query)
                    fprintf(stderr, "Keyword query too long, skipped\n");
                  else if (mysql_query(sql, query) != 0)
                    fprintf(stderr, "MySQL query error: %s\n", mysql_error(sql));
                }
            }
          free(mot);
        }
    }
}

/* parse(Content, sql, pid)
 * Allocates a parser over Content, runs the keyword extraction
 * (_auto_cond) on it with the given connection and page id, and
 * returns the parser.
 *
 * Content is stored by pointer, not copied. Nothing in this function
 * frees the returned parser.
 * Returns NULL if the parser cannot be allocated.
 */
parser_t parse(char* Content, MYSQL* sql, char* pid)
{
  parser_t page;

  page = malloc(PARSER_SIZE);
  if (page == NULL)
    return NULL;

  page->content = Content;
  page->offset = 0;
  page->url = empty_stack();
  page->mot = empty_stack();

  _auto_cond(page, sql, pid);

  /* The original fell off the end of a non-void function. */
  return page;
}
Presented with Bontiv-Sourceer