#include "link.h" #include <regex.h> #include <string.h> #include <stdio.h> #include <stdlib.h> LinkType WhatLink(char* link) { if (strncmp(link, "http://", 7) == 0 || strncmp(link, "https://", 8) == 0) return AbsoluteLink; else if (link[0] == '/') return InternalLink; else return RelativeLink; } char* translateUrl(char* url, const char* base) { LinkType type = WhatLink(url); regex_t reg; int nmatch, errno; char error[300]; int end, size; regmatch_t* matched; if (type == AbsoluteLink) return url; if (errno = regcomp(®, "(http://[^/?]*)([^?#]*/)?", REG_ICASE | REG_EXTENDED)) { regerror(errno, ®, error, 300); fprintf(stderr, "REGEX compilation error: %s\n", error); exit(2); } nmatch = reg.re_nsub + 1; matched = calloc(nmatch, sizeof(regmatch_t)); errno = regexec(®, base, nmatch, matched, 0); if (errno == REG_NOMATCH) { fprintf(stderr, "Base %s non matched.\n", base); free(matched); return url; } if (type == InternalLink) end = matched[1].rm_eo; else end = matched[3].rm_eo; size = strlen(url); url = realloc(url, size + end + 1); url[size + end] = 0; while (--size >= 0) { url[size + end] = url[size]; } memcpy(url, base, end); free(matched); return url; } void fnd_urls(char* content, char* base, MYSQL* sql) { regex_t reg; int errno; int nmatch; int i; char error[300]; char query[300]; regmatch_t* matched; char* ctr; char* ctrde; size_t size; if (errno = regcomp(®, "href=\"([^\"]*)\"", REG_ICASE | REG_EXTENDED)) { regerror(errno, ®, error, 300); fprintf(stderr, "REGEX compilation error: %s\n", error); exit(2); } nmatch = reg.re_nsub + 1; matched = calloc(nmatch, sizeof(regmatch_t)); errno = 0; while (!errno) { errno = regexec(®, content, nmatch, matched, 0); if (errno == 0) { size = matched[1].rm_eo - matched[1].rm_so; if (size) { /* Voila l'url decouverte */ ctr = malloc(size + 1); strncpy(ctr, content + matched[1].rm_so, size); ctr[size] = 0; ctr = translateUrl(ctr, base); sprintf(query, "INSERT INTO page (url) VALUES(\"%s\")", ctr); mysql_query(sql, query); free(ctr); content += matched[1].rm_eo; } } } free(matched); regfree(®); }