%{
/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
/*
 * Extract hypertext links from an HTML file.
 * The links understood are
 *    <a href=
 *    <frame src=
 *    <base href=
 *    <meta url=
 *    <area href=
 */
/* Head */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include <string.h>

#include <uri.h>
#include <salloc.h>

#include <hrefparse.h>

/*
 * Token values returned by the scanner rules to hrefparse():
 * T_HREF for a link, T_BASE for the value of a BASE tag.
 */
#define T_HREF 200
#define T_BASE 201

/*
 * Set to 1 by the scanner when a frame tag is matched,
 * reset by hrefparse_init().
 */
static int in_frame = 0;
/*
 * Hold the href string (the cleaned link of the last matched rule)
 */
static char* href = 0;
/* 
 * Total size of the buffer pointed to by href
 */
static int href_size = 0;

/*
 * When not null, debug traces are printed on stderr.
 */
static int verbose = 0;

/*
 * Fill the href buffer when a link has been detected.
 * has_quote is not null if the matched value is quoted.
 */
static void fill_href(int has_quote);
%}

%option caseless noyywrap 8bit prefix="href" full outfile="lex.yy.c" nounput

%x COMMENT_TAG FRAME_TAG ADDRESS_TAG BASE_TAG META_TAG AREA_TAG

OPTWS		[[:blank:]\n\r]*

%%
<INITIAL>{
	"<"{OPTWS}a BEGIN(ADDRESS_TAG); /* anchor; no delimiter is required after the a, so any tag name starting with a enters this state unless a longer rule below wins */
	"<"{OPTWS}frame {
	  /* Remember that a frame tag was seen; hrefparse reports it through w_info. */
	  in_frame = 1;
	  BEGIN(FRAME_TAG);
	}
	"<"{OPTWS}base BEGIN(BASE_TAG); /* look for the href of a BASE tag */
	"<"{OPTWS}meta BEGIN(META_TAG); /* look for url= inside a META tag */
	"<"{OPTWS}area BEGIN(AREA_TAG); /* longer match than the anchor rule, so it takes precedence */
	"<!--" BEGIN(COMMENT_TAG); /* links inside comments are ignored */
	.	; /* anything else outside of the tags above is discarded */
	[\n\r]	;
}

<FRAME_TAG>{
	src{OPTWS}={OPTWS}\"[^>"]*[>"]|src{OPTWS}={OPTWS}'[^>']*[>']	{
                /*
                 * Quoted src value. The match ends on the closing quote or,
                 * when that quote is missing, on the greater-than sign that
                 * closes the tag; fill_href drops that last character.
                 */
                fill_href(1);
		return T_HREF;
	}
	src{OPTWS}={OPTWS}[^"' >]+	{
                /* Unquoted src value, up to the next quote, space or greater-than sign. */
                fill_href(0);
		return T_HREF;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	
	.	
}

<META_TAG>{
	url{OPTWS}={OPTWS}\"[^>"]*[>"]|url{OPTWS}={OPTWS}'[^>']*[>']	{
                /*
                 * Quoted url value found anywhere inside the META tag. The
                 * last matched character is the closing quote or, when it is
                 * missing, the greater-than sign; fill_href drops it.
                 */
                fill_href(1);
		return T_HREF;
	}
	url{OPTWS}={OPTWS}[^"' >]+	{
                /* Unquoted url value, up to the next quote, space or greater-than sign. */
                fill_href(0);
		return T_HREF;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	
	.	
}

<ADDRESS_TAG>{
	href{OPTWS}={OPTWS}\"[^>"]*[>"]|href{OPTWS}={OPTWS}\'[^'>]*[>']	{
                /*
                 * Quoted href value of an anchor. The last matched character
                 * is the closing quote or, when it is missing, the
                 * greater-than sign; fill_href drops it.
                 */
                fill_href(1);
		return T_HREF;
	}
	href{OPTWS}={OPTWS}[^"' >]+	{
                /* Unquoted href value, up to the next quote, space or greater-than sign. */
                fill_href(0);
		return T_HREF;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	
	.	
}

<AREA_TAG>{
	href{OPTWS}={OPTWS}\"[^>"]*[>"]|href{OPTWS}={OPTWS}\'[^'>]*[>']	{
                /*
                 * Quoted href value of an AREA tag. The last matched
                 * character is the closing quote or, when it is missing, the
                 * greater-than sign; fill_href drops it.
                 */
                fill_href(1);
		return T_HREF;
	}
	href{OPTWS}={OPTWS}[^"' >]+	{
                /* Unquoted href value, up to the next quote, space or greater-than sign. */
                fill_href(0);
		return T_HREF;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	
	.	
}

<BASE_TAG>{
	href{OPTWS}={OPTWS}\"[^>"]*[>"]|href{OPTWS}={OPTWS}\'[^>']*[>']	{
                /*
                 * Quoted href value of the BASE tag. The last matched
                 * character is the closing quote or, when it is missing, the
                 * greater-than sign; fill_href drops it.
                 */
                fill_href(1);
		return T_BASE;
	}
	href{OPTWS}={OPTWS}[^"' >]+	{
                /*
                 * Unquoted href value, handled like in the anchor and AREA
                 * tags so that a BASE tag without quotes is not ignored.
                 */
                fill_href(0);
		return T_BASE;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	
	.	
}

<COMMENT_TAG>{
        "-->" BEGIN(INITIAL); /* regular end of comment */
        "<html>" BEGIN(INITIAL); /* NOTE(review): these three tags also end the comment state, presumably to recover from a comment that is never closed - confirm */
        "<head>" BEGIN(INITIAL);
        "<title>" BEGIN(INITIAL);
	[[:blank:]\n\r]+ ; /* comment content is discarded */
	. ;
}

<<EOF>> {
        /*
         * End of input in any start condition: go back to the default
         * start condition so that the next document is not scanned as if
         * it began inside a tag, then stop the scanner (hreflex returns 0).
         */
        BEGIN(INITIAL);
        yyterminate();
}

%%

/*
 * Fill href with the link detected by the parser.
 *
 * hreftext/hrefleng hold the text matched by the current rule, that is
 * the attribute name, the = sign and the value (for instance href="url").
 * The text is cleaned in place and the bare link is stored in href as a
 * null terminated string.
 *
 *   has_quote  not null if the value is enclosed in " or ' (the last
 *              character may also be > when the closing quote is missing),
 *              null if the value is not quoted.
 *
 * The cleaning steps are, in order: translate &#NNN; sequences, skip the
 * opening delimiter, strip surrounding white space, turn http:/dir/file
 * into /dir/file and remove newlines.
 *
 * Exits the process if the delimiter cannot be found (status 2) or if
 * nothing is left after it (status 3).
 */
static void fill_href(int has_quote)
{
  char* start;
  int length;

  /*
   * Realloc href if necessary. The cleaned link is never longer than
   * the matched text.
   */
  static_alloc(&href, &href_size, hrefleng + 1);
  if(verbose) fprintf(stderr, "fill_href: hreftext = %.*s, leng = %d\n", hrefleng, hreftext, hrefleng);

  /* Translate &#199; sequences, in place. */
  {
    int from;
    int to;
    for(from = 0, to = 0; from < hrefleng;) {
      if(hreftext[from] == '&' &&
         (from + 1 < hrefleng && hreftext[from + 1] == '#')) {
        /* At most three digits are read, value is at most 999. */
        int value = 0;
        int i;
        if(verbose) fprintf(stderr, "found %.6s\n", &hreftext[from]);
        for(i = 0;
            from + 2 + i < hrefleng && i < 3 &&
            hreftext[from + 2 + i] >= '0' && hreftext[from + 2 + i] <= '9';
            i++) {
          value = value * 10 + (hreftext[from + 2 + i] - '0');
        }
        /* The trailing ; is optional. */
        if(from + 2 + i < hrefleng && hreftext[from + 2 + i] == ';')
          i++;
        from += 2 + i;
        if(verbose) fprintf(stderr, "value = %c\n", value);
        hreftext[to++] = (char)value;
      } else {
        hreftext[to++] = hreftext[from++];
      }
    }
    hrefleng = to;
    /*
     * Terminate the (possibly shortened) text so that the string
     * functions and the white space skipping below never look at the
     * stale characters left behind by the translation.
     */
    hreftext[hrefleng] = '\0';
    if(verbose) fprintf(stderr, "fill_href: translated hreftext = %.*s, leng = %d\n", hrefleng, hreftext, hrefleng);
  }

  /*
   * Find the delimiter that precedes the value: the opening quote
   * (" or ') if any, the = sign otherwise.
   */
  if(has_quote) {
    /*
     * The opening quote is the first quote of either kind: a value
     * quoted with ' may contain a " and the other way around.
     */
    char* dquote = strchr(hreftext, '"');
    char* squote = strchr(hreftext, '\'');
    if(dquote == 0 || (squote != 0 && squote < dquote))
      start = squote;
    else
      start = dquote;
    if(start == 0) {
      fprintf(stderr, "fill_href: cannot find delimiter ' or \"\n");
      exit(2);
    }
  } else {
    start = strchr(hreftext, '=');
    if(start == 0) {
      fprintf(stderr, "fill_href: cannot find delimiter = \n");
      exit(2);
    }
  }

  /* Go over first delimiter. */
  start++;

  /* Strip leading spaces. */
  while(*start && isspace((unsigned char)*start))
    start++;
  length = hrefleng - (start - hreftext);
  if(has_quote) {
    /* Skip last delimiter. (can be '" or > if common <A href="url> occurs. */
    length--;
  }
  /*
   * If the size of the data after strip is negative, nothing left : error
   */
  if(length < 0) {
    fprintf(stderr, "fill_href: length < 0 for %.*s\n", hrefleng, hreftext);
    exit(3);
  }
  /* 
   * Strip trailing spaces.
   */
  while(length > 0 && isspace((unsigned char)start[length - 1]))
    length--;

  /*
   * Refine the links, clean it
   */
  {
    int i;
    int count;
    char* p = href;

    /* 
     * Auto patch the http:/dir/file -> /dir/file. 
     */
    if(length > 7 && !strncmp("http:/", start, 6) && start[6] != '/') {
      start += 5;
      length -= 5;
    }

    /*
     * Remove newlines, yes links sometimes contain newlines.
     * The number of characters to examine is taken after the patch
     * above so that the loop never reads past the end of the value.
     */
    count = length;
    for(i = 0; i < count; i++) {
      if(start[i] != '\n') {
        *p++ = start[i];
      } else {
        length--;
      }
    }
  }
  /*
   * href is a null terminated string.
   */
  href[length] = '\0';
}

/*
 * Prepare the module for a new document: clear the frame flag and
 * ask static_alloc for an href buffer of 1024 bytes.
 * Called by hrefparse() at the beginning of every run.
 */
void hrefparse_init()
{
  enum { initial_href_size = 1024 };

  in_frame = 0;
  static_alloc(&href, &href_size, initial_href_size);
}

/*
 * Entry point.
 * Extract the hypertext links of an HTML document into 'webbase_url'.
 *
 *   source              path of the HTML file if 'string_or_filename' is
 *                       HREFPARSE_SOURCE_FILENAME, otherwise the HTML
 *                       text itself as a null terminated string.
 *   string_or_filename  tells how 'source' must be interpreted.
 *   strip_relative      not used by this function.
 *   webbase_url         receives the result :
 *                         w_base_url  value of the BASE tag, if any,
 *                         w_absolute  http:// and ftp:// links, each one
 *                                     followed by a white space,
 *                         w_relative  links that have no scheme, each one
 *                                     followed by a white space,
 *                         w_info      WEBBASE_URL_INFO_* flags describing
 *                                     what was found.
 *                       Links with any other scheme are dropped.
 *   size_limit          parsing stops as soon as the cumulated length of
 *                       the absolute and relative lists is above this
 *                       value (WEBBASE_URL_INFO_TRUNCATED is then set).
 *
 * Return 0 on success, -2 if the file cannot be opened.
 */
int hrefparse(char* source, int string_or_filename, int strip_relative, webbase_url_t* webbase_url, int size_limit)
{
  int token;
  FILE* fp = 0;
  YY_BUFFER_STATE lex_buffer;
  /*
   * List (absolute or relative) selected for the current link, with
   * its allocated size and the number of characters already stored.
   */
  char** list = 0;
  int* list_size = 0;
  int* list_length = 0;
  int relative_length = 0;
  int absolute_length = 0;
  int token_found = 0;
  int too_big = 0;

  /* Allocate buffers etc. */
  hrefparse_init();

  /*
   * Open input
   */
  if(string_or_filename == HREFPARSE_SOURCE_FILENAME) {
    fp = fopen(source, "r");
    if(fp == 0) {
      fprintf(stderr, "hrefparse: cannot open %s for reading ", source);
      perror("");
      return -2;
    }
    /*
     * Arrange the parser to read the file
     */
    lex_buffer = yy_create_buffer(fp, YY_BUF_SIZE);
  } else {
    lex_buffer = yy_scan_string(source);
  }
  yy_switch_to_buffer(lex_buffer);
  /*
   * A previous run that stopped before the end of its input (size
   * limit reached) leaves the scanner inside a tag: always start in
   * the default start condition.
   */
  BEGIN(INITIAL);

  /*
   * Call the lex parser that returns when a link is found.
   * too_big is tested first so that no token is scanned and then
   * thrown away once the limit has been reached.
   */
  while(!too_big && (token = hreflex())) {
    int href_length = strlen(href);
    /* NOTE(review): ownership of href_tmp is not visible here; it is never freed - confirm against uri.h. */
    char* href_tmp;
    int href_tmp_length;
    token_found = 1;
    switch(token) {
    case T_BASE:
      /*
       * Replace default base with the value of the BASE tag.
       */
      static_alloc(&webbase_url->w_base_url, &webbase_url->w_base_url_length, href_length + 1);
      strcpy(webbase_url->w_base_url, href);
      webbase_url->w_info |= WEBBASE_URL_INFO_BASE;
      break;
    case T_HREF:
      /*
       * Ignore fragments alone
       */
      href_tmp = uri_cannonicalize_string(href, href_length, URI_STRING_URI_NOHASH_STYLE|URI_STRING_URI_STYLE);
      href_tmp_length = href_tmp ? strlen(href_tmp) : 0;
      if(href_tmp_length) {
        int scheme = 0;
        int allowed_scheme = 0;
        /*
         * The link has a scheme if it contains a : only preceded by
         * letters.
         */
        {
          char* colon = strchr(href_tmp, ':');
          char* p;
          if(colon) {
            scheme = 1;
            for(p = href_tmp; p < colon; p++)
              if(!isalpha((unsigned char)*p))
                scheme = 0;
          }
        }
        if(scheme &&
           (!strncasecmp("http://", href_tmp, 7) ||
            !strncasecmp("ftp://", href_tmp, 6))) {
          allowed_scheme = 1;
          list = &webbase_url->w_absolute;
          list_size = &webbase_url->w_absolute_length;
          list_length = &absolute_length;
          webbase_url->w_info |= WEBBASE_URL_INFO_ABSOLUTE;
        } else if(!scheme) {
          list = &webbase_url->w_relative;
          list_size = &webbase_url->w_relative_length;
          list_length = &relative_length;
          webbase_url->w_info |= WEBBASE_URL_INFO_RELATIVE;
        } else {
          if(verbose) fprintf(stderr, "hrefparse: %s rejected because scheme not allowed\n", href_tmp);
        }

        if(!scheme || allowed_scheme) {
          /* + 1 for white space, + 1 for the trailing null */
          static_alloc(list, list_size, *list_length + href_tmp_length + 1 + 1);
          memcpy(*list + *list_length, href_tmp, href_tmp_length);
          *list_length += href_tmp_length;
          (*list)[(*list_length)++] = ' ';
        }
      } else {
        if(verbose) fprintf(stderr, "hrefparse: %s cannot be cannonicalized\n", href);
      }
      break;
    default:
      fprintf(stderr, "hrefparse: unexpected token %d\n", token);
      break;
    }
    if(absolute_length + relative_length > size_limit) {
      fprintf(stderr, "too many hrefs for %s (-size_limit %d), truncate\n", webbase_url->w_url, size_limit);
      too_big = 1;
    }
  }

  if(too_big) webbase_url->w_info |= WEBBASE_URL_INFO_TRUNCATED;
  if(token_found) webbase_url->w_info |= WEBBASE_URL_INFO_COMPLETE;
  /*
   * Terminate the lists. A list allocated by a previous call and not
   * filled by this one becomes the empty string.
   */
  if(webbase_url->w_absolute) webbase_url->w_absolute[absolute_length] = '\0';
  if(webbase_url->w_relative) webbase_url->w_relative[relative_length] = '\0';

  yy_delete_buffer(lex_buffer);
  if(string_or_filename == HREFPARSE_SOURCE_FILENAME && fp != 0) {
    fclose(fp);
  }

  if(in_frame)
    webbase_url->w_info |= WEBBASE_URL_INFO_FRAME;
  
  return 0;
}

/*
 Local Variables: ***
 mode: C ***
 End: ***
*/
