/**
* Originally Written by Tim Macinta 1997 for their
* book "Guide to Search Engines" in Java.
*
* Distributed under the GNU Public License
* (a copy of which is enclosed with the source).
*
* This LinkExtractor can extract URLs from HTML files.
*
* modified by Xiannong Meng to fix the finite state machine
* to recognize urls containing white spaces.
* April 2005
*
* modified by Xiannong Meng to convert the program into C.
* December 2012
*
* modified by Xiannong Meng to remove extra functionality
* so we can see the bare-minimum of the function.
* January 2013.
*/
/*
* A simple finite-state-machine to recognize a url
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Size limit shared by the extractor and its driver: extractURLs() skips any
 * collected tag whose length is not below MAXSTRLEN-1, and main() allocates
 * MAXSTRLEN + 1 bytes for each result buffer.
 */
#define MAXSTRLEN 256
/* Forward declaration: analyze() is defined after extractURLs(), which calls it. */
char * analyze(char* param);
/*
 * Passes the text of one collected anchor tag to analyze() and appends the
 * result to retval, advancing *count. Tags that contain no complete "..."
 * pair (for example <a name=top>) are ignored, because analyze() works on a
 * double-quoted value.
 */
static void store_anchor_url(char *tag, char *retval[], int *count) {
    const char *open_quote = strchr(tag, '"');
    char *url;

    if (open_quote == NULL || strchr(open_quote + 1, '"') == NULL) {
        return;
    }

    url = analyze(tag);
    if (url == NULL) {
        return;
    }

    strcpy(retval[*count], url);
    (*count)++;
    free(url); /* analyze() hands back heap memory that the caller owns */
}

/*
 * Scans an HTML page with a small finite state machine and collects the
 * quoted attribute value of every <a ...> tag.
 *
 * Input:
 *   pageSrc  NUL-terminated HTML text; it is only read.
 *   retval   array of caller-allocated buffers. No capacity is passed in, so
 *            the caller must supply one buffer for every anchor the page can
 *            contain, each with room for at least MAXSTRLEN bytes.
 *   count    in/out: index of the first free slot of retval on entry,
 *            index one past the last filled slot on return.
 *
 * Output:
 *   retval[i] holds the string analyze() produced for the i-th accepted tag.
 *
 * Only tags whose name is exactly "a" or "A" are considered. The first
 * whitespace character after the tag name is kept; every other whitespace
 * character inside the tag is dropped, so a URL written with embedded blanks
 * is stored without them. Tags of MAXSTRLEN-1 or more collected characters
 * are skipped. A NULL argument or an allocation failure makes the function
 * return without touching retval or count.
 */
void extractURLs( char* pageSrc, char*retval[], int *count) {
    enum {
        ST_TEXT,      /* outside any tag */
        ST_TAG_OPEN,  /* just consumed '<' */
        ST_TAG_SKIP,  /* inside a tag that is not an anchor */
        ST_A_NAME,    /* consumed "<a"; the tag name may still continue */
        ST_A_GAP,     /* whitespace between the name and the first attribute */
        ST_A_ATTRS    /* collecting attribute text up to '>' */
    } state = ST_TEXT;
    size_t len;
    size_t i;
    size_t c = 0;     /* number of characters currently held in sb */
    char *sb;         /* tag buffer; never needs more than the page length */

    if (pageSrc == NULL || retval == NULL || count == NULL) {
        return;
    }

    len = strlen(pageSrc);
    sb = malloc(len + 1);
    if (sb == NULL) {
        return;
    }

    for (i = 0; i < len; i++) {
        char ch = pageSrc[i];
        /* <ctype.h> functions require a value representable as unsigned char */
        int blank = isspace((unsigned char)ch);

        switch (state) {
        case ST_TEXT:
            if (ch == '<') {
                state = ST_TAG_OPEN;
            }
            break;

        case ST_TAG_OPEN:
            if (ch == 'a' || ch == 'A') {
                sb[c++] = ch;
                state = ST_A_NAME;
            } else if (ch == '>') {
                state = ST_TEXT;      /* empty tag "<>" */
            } else {
                state = ST_TAG_SKIP;  /* some other tag, e.g. <p> or </a> */
            }
            break;

        case ST_TAG_SKIP:
            if (ch == '>') {
                state = ST_TEXT;
            }
            break;

        case ST_A_NAME:
            if (blank) {
                sb[c++] = ch;
                state = ST_A_GAP;
            } else {
                /*
                 * "<a>" carries no attributes; any other character means the
                 * tag name is longer than "a" (e.g. <abbr>), so it is not an
                 * anchor after all.
                 */
                c = 0;
                state = (ch == '>') ? ST_TEXT : ST_TAG_SKIP;
            }
            break;

        case ST_A_GAP:
            if (blank) {
                break;
            }
            state = ST_A_ATTRS;
            /* fallthrough: the first non-blank character is attribute text */

        case ST_A_ATTRS:
            if (ch == '>') {
                sb[c] = '\0';
                if (c < (size_t)(MAXSTRLEN - 1)) { /* skip the extreme long urls */
                    store_anchor_url(sb, retval, count);
                }
                c = 0;
                state = ST_TEXT;
            } else if (!blank) {
                sb[c++] = ch;
            }
            break;

        default:
            break;
        }
    }

    free(sb);
}
/*
 * Extracts the first double-quoted value from the text of one tag.
 *
 * param: NUL-terminated tag content, expected to look like
 *            a href="a url"
 *        It is read but not modified.
 *
 * Returns a newly malloc'ed, NUL-terminated string that the caller must
 * free(). When param contains a complete "..." pair, the result is that
 * value converted to lower case, including both surrounding double quotes.
 * When no complete pair is present the result is the empty string.
 * Returns NULL only if param is NULL or the allocation fails.
 *
 * Two trace lines (the input and the result) are written to stdout.
 *
 * NOTE(review): lower-casing also changes case-sensitive parts of a URL
 * (path, query); it is kept here because the existing output depends on it.
 */
char * analyze(char* param) {
    const char *start;
    const char *end;
    size_t span = 0;  /* number of bytes copied into out */
    size_t i;
    char *out;

    if (param == NULL) {
        return NULL;
    }

    /* The extracted value is a substring of param, so this always suffices. */
    out = malloc(strlen(param) + 1);
    if (out == NULL) {
        return NULL;
    }

    printf("in analyze src : %s\n", param);

    start = strchr(param, '"');
    end = (start != NULL) ? strchr(start + 1, '"') : NULL;
    if (end != NULL) {
        span = (size_t)(end - start) + 1; /* keep both quote characters */
        for (i = 0; i < span; i++) {
            /* tolower() needs a value representable as unsigned char */
            out[i] = (char)tolower((unsigned char)start[i]);
        }
    }
    out[span] = '\0';

    printf("in analyze ret : %s\n", out);
    return out;
}
/*
 * Test driver: allocates a fixed set of result buffers, runs extractURLs()
 * on one sample string and prints every extracted entry between '|' marks.
 *
 * Returns 0 on success, EXIT_FAILURE if a result buffer cannot be allocated.
 */
int main(int argc, char*argv[]) {
    enum { MAX_URLS = 5 };  /* number of result buffers handed to extractURLs() */
    char *msgbuf[MAX_URLS];
    int count = 0;          /* filled in by extractURLs() */
    int i;

    (void)argc;
    (void)argv;

    for (i = 0; i < MAX_URLS; i ++) {
        msgbuf[i] = malloc(MAXSTRLEN + 1);
        if (msgbuf[i] == NULL) {
            fprintf(stderr, "out of memory\n");
            while (i > 0) {
                free(msgbuf[--i]);
            }
            return EXIT_FAILURE;
        }
    }

    /* the following are testing urls */
    /*
     * NOTE(review): none of these samples contains an anchor tag, so
     * extractURLs() reports nothing for them; presumably the HTML markup
     * was lost from the literals. Confirm against the original inputs.
     */
    char * url = "anchor text";
    // char * url = "anchor";
    // char * url = "anchor ";
    // char * url = "anchor";
    // char * url = "some other text anchore after ancore ";

    extractURLs( url, msgbuf, &count );
    for (i = 0; i < count; i ++) {
        printf("extracted url |%s|\n", msgbuf[i]);
    }

    for (i = 0; i < MAX_URLS; i ++) {
        free(msgbuf[i]);
    }
    return 0;
}