Bontiv-Sourceer source code viewer
Root | Help
./web-crawler/src/link.c
#include "link.h"
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/**
 * Classify a link by the way it starts.
 *
 * @param link  NUL-terminated link text.
 * @return AbsoluteLink when the text begins with "http://" or "https://"
 *         (case-sensitive comparison), InternalLink when it begins with
 *         '/', RelativeLink for everything else.
 */
LinkType WhatLink(char* link)
{
	/* A link carrying its own scheme needs no base to be resolved. */
	if (!strncmp(link, "http://", 7))
		return AbsoluteLink;
	if (!strncmp(link, "https://", 8))
		return AbsoluteLink;

	/* A leading slash is the only thing separating the two remaining kinds. */
	return (link[0] == '/') ? InternalLink : RelativeLink;
}

/**
 * Turn a link found in a page into an absolute URL, using the URL of the
 * page it was found in.
 *
 * - An absolute link (as classified by WhatLink) is returned untouched.
 * - An internal link ("/path") is prefixed with the scheme and host of base.
 * - A relative link ("page.html") is prefixed with the scheme, host and
 *   directory part of base; a '/' is inserted when base has no path at all
 *   (e.g. "http://host").
 *
 * @param url   Link to translate. It MUST point to memory obtained from
 *              malloc/realloc: the buffer may be reallocated, so the caller
 *              must stop using the pointer it passed in and use the return
 *              value instead.
 * @param base  URL of the page the link was found in.
 * @return The translated URL (to be released with free()). When base holds
 *         no http(s) URL, or when memory runs out, url is returned
 *         unchanged after a message has been written to stderr.
 *
 * The process exits with status 2 if the internal regular expression cannot
 * be compiled.
 */
char* translateUrl(char* url, const char* base)
{
	regex_t reg;
	regmatch_t matched[3];	/* [0] whole match, [1] scheme + host, [2] directory */
	char error[300];
	char* grown;
	const char* prefix;
	size_t prefix_len;
	size_t url_len;
	size_t need_slash = 0;
	int rc;	/* deliberately not called "errno": that name is reserved by <errno.h> */
	LinkType type = WhatLink(url);

	if (type == AbsoluteLink)
		return url;

	/* Group 1: scheme and host. Group 2 (optional): path up to the last '/'. */
	rc = regcomp(&reg, "(https?://[^/?#]*)([^?#]*/)?", REG_ICASE | REG_EXTENDED);
	if (rc != 0)
	{
		regerror(rc, &reg, error, sizeof error);
		fprintf(stderr, "REGEX compilation error: %s\n", error);
		exit(2);
	}

	rc = regexec(&reg, base, 3, matched, 0);
	regfree(&reg);	/* the offsets in matched[] stay valid after this */
	if (rc != 0)
	{
		fprintf(stderr, "Base %s non matched.\n", base);
		return url;
	}

	/* The pattern is not anchored, so the match may start after offset 0. */
	prefix = base + matched[0].rm_so;
	if (type == InternalLink)
	{
		prefix_len = (size_t)(matched[1].rm_eo - matched[0].rm_so);
	}
	else
	{
		/* The whole match ends after the directory part when there is one,
		 * after the host otherwise. */
		prefix_len = (size_t)(matched[0].rm_eo - matched[0].rm_so);
		if (matched[2].rm_so == -1)
			need_slash = 1;	/* base has no path: supply the separator */
	}

	url_len = strlen(url);
	grown = realloc(url, prefix_len + need_slash + url_len + 1);
	if (grown == NULL)
	{
		/* url is still valid when realloc fails; hand it back as it is. */
		fprintf(stderr, "Out of memory while translating %s\n", url);
		return url;
	}

	/* Shift the link (terminator included) to the right, then put the
	 * prefix in front of it. The regions overlap, hence memmove. */
	memmove(grown + prefix_len + need_slash, grown, url_len + 1);
	memcpy(grown, prefix, prefix_len);
	if (need_slash)
		grown[prefix_len] = '/';

	return grown;
}

/* Statement used to record one discovered URL; %s receives the escaped URL. */
#define FND_URLS_INSERT_FMT "INSERT INTO page (url) VALUES(\"%s\")"

/*
 * Translate one discovered link against base and insert it into the page
 * table. start/size delimit the raw link text (not NUL-terminated).
 * Failures are reported on stderr and the link is skipped.
 */
static void store_url(const char* start, size_t size, const char* base, MYSQL* sql)
{
  char* url = NULL;
  char* escaped = NULL;
  char* query = NULL;
  size_t url_len;
  size_t query_size;
  unsigned long escaped_len;

  url = malloc(size + 1);
  if (url == NULL)
    goto oom;
  memcpy(url, start, size);
  url[size] = '\0';

  /* translateUrl may reallocate the buffer: only its return value is valid. */
  url = translateUrl(url, base);

  /* The URL comes from untrusted page content, so it is escaped before it
   * is placed in the statement. The escaped form needs at most twice the
   * input length plus the terminator. */
  url_len = strlen(url);
  escaped = malloc(2 * url_len + 1);
  if (escaped == NULL)
    goto oom;
  escaped_len = mysql_real_escape_string(sql, escaped, url, (unsigned long)url_len);
  if (escaped_len == (unsigned long)-1)
    {
      fprintf(stderr, "Cannot escape URL %s\n", url);
      goto done;
    }

  /* sizeof counts the terminator and the two characters of "%s", so the
   * buffer is always large enough whatever the URL length. */
  query_size = sizeof(FND_URLS_INSERT_FMT) + escaped_len;
  query = malloc(query_size);
  if (query == NULL)
    goto oom;
  snprintf(query, query_size, FND_URLS_INSERT_FMT, escaped);

  if (mysql_query(sql, query) != 0)
    fprintf(stderr, "Cannot store URL %s: %s\n", url, mysql_error(sql));
  goto done;

oom:
  fprintf(stderr, "Out of memory while storing a URL\n");
done:
  free(query);
  free(escaped);
  free(url);
}

/**
 * Find every href="..." attribute in content, translate its value against
 * base with translateUrl and insert the result into the page table.
 *
 * @param content  NUL-terminated text to scan; it is only read.
 * @param base     URL of the page the text comes from.
 * @param sql      MySQL handle the INSERT statements are sent through.
 *
 * Empty href values are skipped. The process exits with status 2 if the
 * regular expression cannot be compiled.
 */
void fnd_urls(char* content, char* base, MYSQL* sql)
{
  regex_t reg;
  regmatch_t matched[2];	/* [0] whole attribute, [1] the URL between the quotes */
  char error[300];
  int rc;	/* deliberately not called "errno": that name is reserved by <errno.h> */

  rc = regcomp(&reg, "href=\"([^\"]*)\"", REG_ICASE | REG_EXTENDED);
  if (rc != 0)
    {
      regerror(rc, &reg, error, sizeof error);
      fprintf(stderr, "REGEX compilation error: %s\n", error);
      exit(2);
    }

  while (regexec(&reg, content, 2, matched, 0) == 0)
    {
      size_t size = (size_t)(matched[1].rm_eo - matched[1].rm_so);

      /* Here is a discovered URL. */
      if (size > 0)
        store_url(content + matched[1].rm_so, size, base, sql);

      /* Always step past the whole attribute, even for an empty href, so
       * that the scan cannot find the same match again. */
      content += matched[0].rm_eo;
    }

  regfree(&reg);
}
Presented with Bontiv-Sourceer