/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <uri.h>

#include <html.h>
#include <html_href.h>
#include <html_content.h>
#include <salloc.h>

/*
 * Diagnostic level used by html_href_collect: when non zero, hrefs rejected
 * because of their scheme are reported on stderr; when greater than 1, every
 * href that passed the scheme check is reported as well.
 * Nothing in the visible part of this file changes it from 0.
 */
static int verbose = 0;

/*
 * Extract hypertext links from an HTML file
 */

/*
 * State shared between html_href and the html_href_collect callback
 * while the hrefs of one document are accumulated.
 */
typedef struct html_href_data {
  /* Record that receives the base url, the href lists and the info flags */
  webbase_url_t* webbase_url;
  /* Collection stops once absolute_length + relative_length exceeds this */
  int size_limit;
  /* Bytes used so far in webbase_url->w_relative (trailing null excluded) */
  int relative_length;
  /* Bytes used so far in webbase_url->w_absolute (trailing null excluded) */
  int absolute_length;
} html_href_data_t;

/*
 * Callback installed by html_href as walk.href_callback; handles one URI.
 *
 *   info  kind of tag the URI was found in (HTML_URI_BASE, HTML_URI_FRAME,
 *         HTML_URI_A, HTML_URI_META or HTML_URI_AREA).
 *   uri   the URI itself.
 *   data  the html_href_data_t prepared by html_href.
 *
 * A BASE URI replaces webbase_url->w_base_url. Any other accepted URI is
 * appended, followed by a white space, to webbase_url->w_relative or
 * webbase_url->w_absolute, and the matching length counter of the
 * html_href_data_t is advanced. Empty strings and absolute URIs whose
 * scheme is neither http nor ftp are skipped.
 *
 * Returns 0 if the URI string could not be built, if the accumulated
 * hrefs exceed size_limit or if info is not one of the values above;
 * returns 1 otherwise.
 */
static int html_href_collect(int info, uri_t* uri, void* data)
{
  html_href_data_t* collected = (html_href_data_t*)data;
  webbase_url_t* page = collected->webbase_url;
  char* link = 0;
  int link_size = 0;
  int link_length = 0;
  int status = 1;

  uri_string(uri, &link, &link_size, URI_STRING_URI_STYLE|URI_STRING_URI_NOHASH_STYLE);

  if(link == 0) {
    fprintf(stderr, "html_href_collect: href was not allocated\n");
    return 0;
  }

  /*
   * Ignore fragments alone (#name)
   */
  link_length = strlen(link);
  if(link_length <= 0)
    goto done;

  /*
   * Silently reject unknown schemes: a URI that is not relative
   * must be either http or ftp.
   */
  if((uri_info(uri) & URI_INFO_RELATIVE) == 0) {
    if(strcasecmp("http", uri_scheme(uri)) != 0 &&
       strcasecmp("ftp", uri_scheme(uri)) != 0) {
      if(verbose) fprintf(stderr, "html_href_collect: parsing %s, found url %s and scheme %s not allowed\n", page->w_url, link, uri_scheme(uri));
      goto done;
    }
  }

  if(verbose > 1) fprintf(stderr, "html_href_collect: %s\n", link);

  if(info == HTML_URI_BASE) {
    /*
     * Replace default base with the value of the BASE tag.
     */
    static_alloc(&page->w_base_url, &page->w_base_url_length, link_length + 1);
    strcpy(page->w_base_url, link);
    page->w_info |= WEBBASE_URL_INFO_BASE;
  } else if(info == HTML_URI_FRAME ||
            info == HTML_URI_A ||
            info == HTML_URI_META ||
            info == HTML_URI_AREA) {
    /*
     * Accumulate hypertext links
     */
    char** list;
    int* list_size;
    int* list_used;
    char* tail;

    /* A frame is recorded like any other link, plus its own flag */
    if(info == HTML_URI_FRAME)
      page->w_info |= WEBBASE_URL_INFO_FRAME;

    if(uri_info(uri) & URI_INFO_RELATIVE) {
      list = &page->w_relative;
      list_size = &page->w_relative_length;
      list_used = &collected->relative_length;
      page->w_info |= WEBBASE_URL_INFO_RELATIVE;
    } else {
      list = &page->w_absolute;
      list_size = &page->w_absolute_length;
      list_used = &collected->absolute_length;
      page->w_info |= WEBBASE_URL_INFO_ABSOLUTE;
    }

    /* + 1 for white space, + 1 for the trailing null */
    static_alloc(list, list_size, *list_used + link_length + 1 + 1);
    tail = *list + *list_used;
    memcpy(tail, link, link_length);
    tail[link_length] = ' ';
    tail[link_length + 1] = '\0';
    /* The separator is counted, the trailing null is not */
    *list_used += link_length + 1;

    /* The limit is checked after the href that crosses it was stored */
    if(collected->absolute_length + collected->relative_length > collected->size_limit) {
      fprintf(stderr, "too many hrefs for %s (-size_limit %d), truncate\n", page->w_url, collected->size_limit);
      page->w_info |= WEBBASE_URL_INFO_TRUNCATED;
      status = 0;
      goto done;
    }
  } else {
    fprintf(stderr, "unknown token %d found while parsing hrefs for %s, ignored\n", info, page->w_url);
    status = 0;
    goto done;
  }

  page->w_info |= WEBBASE_URL_INFO_COMPLETE;

 done:
  free(link);

  return status;
}

/*
 * Collect the hypertext links of the document designated by path
 * (handed to the parser as HTML_SOURCE_FILENAME) into webbase_url.
 *
 *   path         source given to the href parser, must not be null.
 *   webbase_url  record filled by html_href_collect.
 *   size_limit   bound on the accumulated length of the hrefs, see
 *                html_href_collect.
 *
 * HTML_URI_IMG is set in the parser ignore field.
 * Returns the value returned by html_href_parse.
 */
int html_href(char* path, webbase_url_t* webbase_url, int size_limit)
{
  html_href_data_t collected;
  html_href_t parser_state;

  /* Callback state: both length counters start at zero */
  memset((char*)&collected, '\0', sizeof(collected));
  collected.size_limit = size_limit;
  collected.webbase_url = webbase_url;

  html_href_reset(&parser_state);
  parser_state.href_data = (void*)&collected;
  parser_state.href_callback = html_href_collect;
  parser_state.parser.ignore = HTML_URI_IMG;
  parser_state.parser.info = HTML_SOURCE_FILENAME;
  parser_state.parser.source_length = strlen(path);
  parser_state.parser.source = path;

  return html_href_parse(&parser_state);
}

/*
 * Extract meta information on HTML file and build an excerpt
 */

/*
 * Callback installed by html_content_begin as walk.content_callback.
 *
 *   info    kind of content found (HTML_CONTENT_META, HTML_CONTENT_TITLE
 *           or HTML_CONTENT_TEXT); other values are ignored.
 *   parsed  buffers holding the content.
 *   data    the html_content_data_t prepared by html_content_begin.
 *
 * META: when buffer0 starts with "keyword" or "description" (case
 *       insensitive) and buffer1 is not empty, buffer1 is copied to
 *       w_keywords or w_description.
 * TITLE: a non empty buffer0 is copied to w_title.
 * TEXT: buffer0 is appended to w_extract until WEBBASE_EXTRACT_LENGTH
 *       bytes are stored.
 *
 * Each copy is cut at the WEBBASE_*_LENGTH constant of the field and the
 * null is written at that index at most.
 * NOTE(review): this presumes each field holds LENGTH + 1 bytes - confirm
 * against the declaration of webbase_url_t.
 *
 * Always returns 1.
 */
static int html_content_collect_begin(int info, html_content_parsed_t* parsed, void* data)
{
  html_content_data_t* state = (html_content_data_t*)data;
  webbase_url_t* page = state->webbase_url;

  if(info == HTML_CONTENT_META) {
    if(strncasecmp((char*)parsed->buffer0, "keyword", 7) == 0) {
      if(parsed->buffer1_length) {
        strncpy(page->w_keywords, (char*)parsed->buffer1, WEBBASE_KEYWORDS_LENGTH);
        /* strncpy does not terminate the string when the source is too long */
        page->w_keywords[WEBBASE_KEYWORDS_LENGTH] = '\0';
        page->w_info |= WEBBASE_URL_INFO_KEYWORDS | WEBBASE_URL_INFO_COMPLETE;
      }
    } else if(strncasecmp((char*)parsed->buffer0, "description", 11) == 0) {
      if(parsed->buffer1_length) {
        strncpy(page->w_description, (char*)parsed->buffer1, WEBBASE_DESCRIPTION_LENGTH);
        page->w_description[WEBBASE_DESCRIPTION_LENGTH] = '\0';
        page->w_info |= WEBBASE_URL_INFO_DESCRIPTION | WEBBASE_URL_INFO_COMPLETE;
      }
    }
  } else if(info == HTML_CONTENT_TITLE) {
    if(parsed->buffer0_length) {
      strncpy(page->w_title, (char*)parsed->buffer0, WEBBASE_TITLE_LENGTH);
      page->w_title[WEBBASE_TITLE_LENGTH] = '\0';
      page->w_info |= WEBBASE_URL_INFO_TITLE;
    }
  } else if(info == HTML_CONTENT_TEXT) {
    if(state->extract_length < WEBBASE_EXTRACT_LENGTH) {
      /* Copy no more than what is still free in the excerpt */
      int room = WEBBASE_EXTRACT_LENGTH - state->extract_length;
      int chunk = room < parsed->buffer0_length ? room : parsed->buffer0_length;

      memcpy(&page->w_extract[state->extract_length], parsed->buffer0, chunk);
      state->extract_length += chunk;
      page->w_extract[state->extract_length] = '\0';
      page->w_info |= WEBBASE_URL_INFO_EXTRACT;
    }
  }

  return 1;
}

/*
 * Fill the title, keywords, description and excerpt of webbase_url from
 * an HTML document held in memory.
 *
 *   buffer         the document, handed to the parser as HTML_SOURCE_STRING.
 *   buffer_length  length of buffer, passed as the parser source_length.
 *   webbase_url    record filled by html_content_collect_begin.
 *
 * Returns the value returned by html_content_parse.
 */
int html_content_begin(char* buffer, int buffer_length, webbase_url_t* webbase_url)
{
  html_content_data_t state;
  html_content_t parser_state;

  /* Callback state: every field zeroed, so the excerpt starts empty */
  memset((char*)&state, '\0', sizeof(state));
  state.webbase_url = webbase_url;

  html_content_reset(&parser_state);
  parser_state.content_data = (void*)&state;
  parser_state.content_callback = html_content_collect_begin;
  parser_state.parser.info = HTML_SOURCE_STRING;
  parser_state.parser.source_length = buffer_length;
  parser_state.parser.source = buffer;

  return html_content_parse(&parser_state);
}
