/** 
 *  Originally Written by Tim Macinta 1997 for their     
 *  book "Guide to Search Engines" in Java.              
 *
 *  Distributed under the GNU Public License
 *       (a copy of which is enclosed with the source).   
 *                                                        
 
 *  This LinkExtractor can extract URLs from HTML files.  
 *
 *  modified by Xiannong Meng to fix the finite state machine
 *  to recognize urls containing white spaces.            
 *  April 2005                                            
 *
 *  modified by Xiannong Meng to convert the program into C.
 *  December 2012                                         
 *
 *  modified by Xiannong Meng to remove extra functionality
 *  so we can see the bare-minimum of the function.
 *  January 2013.
 */
/*
 * A simple finite-state-machine to recognize a url
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Size limit shared by the extractor and its driver: extractURLs() skips
 * any tag whose collected text is not shorter than MAXSTRLEN-1 characters,
 * and main() allocates MAXSTRLEN + 1 bytes for every result slot.
 */
#define MAXSTRLEN 256
/* Forward declaration; analyze() is defined after extractURLs(), which calls it. */
char * analyze(char* param);
/*
 * Hands the text of one anchor tag to analyze() and appends the URL it
 * returns to retval[*count], advancing *count.  Tags that do not contain a
 * complete double-quoted value are ignored.
 */
static void record_anchor_url(char *tag, char *retval[], int *count) {
  char *quote = strchr(tag, '"');
  char *url;

  /* analyze() needs both an opening and a closing quote to work with */
  if (quote == NULL || strchr(quote + 1, '"') == NULL) {
    return;
  }

  url = analyze(tag);
  if (url == NULL) {
    return;
  }

  strcpy(retval[*count], url);
  (*count)++;
  free(url);   /* analyze() returns a heap copy that this caller owns */
}

/*
 * Scans an HTML page with a small finite state machine and collects one
 * URL for every anchor tag it finds.
 *
 * Input:
 *   pageSrc  NUL-terminated HTML text; it is only read.
 *   retval   array of caller-allocated buffers that receive the URLs.
 *            The number of slots is not passed in, so the caller must
 *            provide one slot per anchor in the page, each at least
 *            MAXSTRLEN bytes long.
 *   count    in/out: index of the next free slot of retval on entry,
 *            one past the last slot written on return.
 *
 * A tag is treated as an anchor only when '<' is followed by 'a' or 'A'
 * and then by white space.  Its text up to the closing '>' is gathered
 * with all further white space removed (so URLs containing blanks are
 * compacted) and passed to analyze().  Every other tag, including "</a>",
 * is skipped up to its '>'.  Tags whose gathered text is not shorter than
 * MAXSTRLEN-1 characters are dropped.
 *
 * If a pointer argument is NULL or the work buffer cannot be allocated,
 * the function returns without storing anything.
 */
void extractURLs( char* pageSrc, char*retval[], int *count) {
  enum {
    ST_TEXT,      /* outside of any tag */
    ST_TAG_OPEN,  /* '<' seen, tag name not started yet */
    ST_NAME_A,    /* tag name starts with 'a'; white space confirms an anchor */
    ST_GAP,       /* white space between the 'a' and the attributes */
    ST_ATTRS,     /* gathering attribute text until '>' */
    ST_SKIP_TAG   /* inside some other tag; ignore everything up to '>' */
  } state = ST_TEXT;
  size_t len;
  size_t i;
  size_t c = 0;   /* number of characters gathered in sb */
  char *sb;       /* text of the tag being gathered; never longer than the page */

  if (pageSrc == NULL || retval == NULL || count == NULL) {
    return;
  }

  len = strlen(pageSrc);
  sb = malloc(len + 1);
  if (sb == NULL) {
    return;
  }

  for (i = 0; i < len; i++) {
    char ch = pageSrc[i];
    /* <ctype.h> functions require a value representable as unsigned char */
    int is_space = isspace((unsigned char) ch);

    switch (state) {
    case ST_TEXT:
      if (ch == '<') {
        state = ST_TAG_OPEN;
      }
      break;

    case ST_TAG_OPEN:
      if (ch == 'a' || ch == 'A') {
        sb[0] = ch;        /* start a fresh tag buffer */
        c = 1;
        state = ST_NAME_A;
      } else if (ch == '>') {
        state = ST_TEXT;   /* empty tag "<>" */
      } else if (!is_space) {
        state = ST_SKIP_TAG;
      }
      break;

    case ST_NAME_A:
      if (is_space) {
        sb[c++] = ch;
        state = ST_GAP;
      } else if (ch == '>') {
        state = ST_TEXT;       /* bare "<a>" carries no URL */
      } else {
        state = ST_SKIP_TAG;   /* a longer tag name that merely starts with 'a' */
      }
      break;

    case ST_GAP:
      if (ch == '>') {
        state = ST_TEXT;       /* "<a >" carries no URL */
      } else if (!is_space) {
        sb[c++] = ch;
        state = ST_ATTRS;
      }
      break;

    case ST_ATTRS:
      if (ch == '>') {
        sb[c] = '\0';
        if (c < MAXSTRLEN - 1) {   /* skip the extremely long urls */
          record_anchor_url(sb, retval, count);
        }
        state = ST_TEXT;
      } else if (!is_space) {
        sb[c++] = ch;
      }
      break;

    case ST_SKIP_TAG:
      if (ch == '>') {
        state = ST_TEXT;
      }
      break;
    }
  }

  free(sb);
}
/*
 * Extracts the first double-quoted value from "param", which should be the
 * contents between a '<' and a '>' in the form: a href="a url"
 *
 * Input:
 *   param  NUL-terminated tag text; it is only read.
 *
 * Output:
 *   A newly allocated, lower-cased copy of the text between the first pair
 *   of double quotes, without the quotes themselves.  The caller must
 *   free() it.  When "param" holds no complete quoted value the result is
 *   an empty string.  NULL is returned if "param" is NULL or memory cannot
 *   be allocated.
 *
 * Both the input and the result are echoed to stdout for tracing.
 *
 * NOTE(review): lower-casing is kept for backward compatibility, but it
 * also alters the case of URL paths; confirm that callers really want it.
 */
char * analyze(char* param) {
  const char *start;
  const char *end;
  size_t n = 0;    /* length of the quoted value */
  size_t i;
  char *out;

  if (param == NULL) {
    return NULL;
  }

  printf("in analyze src : %s\n", param);

  start = strchr(param, '"');
  end = (start != NULL) ? strchr(start + 1, '"') : NULL;
  if (end != NULL) {
    start++;                       /* step over the opening quote */
    n = (size_t) (end - start);
  }

  out = malloc(n + 1);
  if (out == NULL) {
    return NULL;
  }

  for (i = 0; i < n; i++) {
    /* tolower() requires a value representable as unsigned char */
    out[i] = (char) tolower((unsigned char) start[i]);
  }
  out[n] = '\0';

  printf("in analyze ret : %s\n", out);
  return out;
}
/*
 * Test driver: runs extractURLs() over one sample page and prints every
 * URL that was stored.  Returns 0 on success and EXIT_FAILURE when a
 * result buffer cannot be allocated.
 */
int main(int argc, char*argv[]) {
  enum { MAX_URLS = 5 };              /* result slots handed to extractURLs() */
  char *msgbuf[MAX_URLS] = { NULL };
  int  count = 0;                     /* number of URLs stored by extractURLs() */
  int  status = 0;
  int  i;

  /*
   * Sample page to test with.
   * NOTE(review): the sample strings in this file had lost their HTML
   * markup, so no tag was ever seen; the address below is a placeholder.
   * Other inputs worth trying: several anchors in one page, and anchors
   * surrounded by plain text.  The page must not contain more than
   * MAX_URLS anchors.
   */
  char page[] = "<a href=\"http://www.example.com/\">anchor text</a>";

  (void) argc;
  (void) argv;

  for (i = 0; i < MAX_URLS; i++) {
    msgbuf[i] = malloc(MAXSTRLEN + 1);
    if (msgbuf[i] == NULL) {
      fprintf(stderr, "out of memory\n");
      status = EXIT_FAILURE;
      break;
    }
  }

  if (status == 0) {
    extractURLs( page, msgbuf, &count );
    for (i = 0; i < count; i++) {
      printf("extracted url |%s|\n", msgbuf[i]);
    }
  }

  for (i = 0; i < MAX_URLS; i++) {
    free(msgbuf[i]);   /* free(NULL) is a no-op for slots never allocated */
  }
  return status;
}