/*
 *   Copyright (C) 1997, 1998
 *   	Free Software Foundation, Inc.
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include <salloc.h>
#include <logfile.h>
#include <uri.h>

#include <getopttools.h>
#include <webbase.h>
#include <sqlutil.h>

#include <md5str.h>

/*
 * Capacity of the option table built in init(): the table is declared with
 * MAX_OPTIONS + 1 entries, and MAX_OPTIONS is also stored in the second
 * field of its last (null-named) entry.
 */
#define MAX_OPTIONS 100
/*
 * Value stored in the last field of the final, null-named entry of the
 * option table in init().
 * NOTE(review): presumably a marker interpreted by getopt_merge() -- confirm
 * against getopttools.
 */
#define APPLICATION_OPTIONS		0x8000000

/*
 * Command line settings of the program, filled by init().
 */
typedef struct crawler_params {
  /* Handle returned by webbase_alloc(); gives access to the MySQL connection. */
  webbase_t* base;
  /* --fake: repair statements are printed but not sent to the server. */
  int fake;
  /* --repair: run the repair statement associated with each failed check. */
  int repair;
  /* --keys: shorthand enabling keys_md5, keys_normalize and keys_url. */
  int keys;
  /* --keys_md5: recompute the url_md5 field. */
  int keys_md5;
  /* --keys_normalize: rewrite urls in canonical form (implies md5 and url). */
  int keys_normalize;
  /* --keys_url: feed the url_key field (table start only). */
  int keys_url;
  /* --log argument (strdup'ed copy) handed to logfile(), or null if not given. */
  char* log;
} crawler_params_t;

/* Settings collected from the command line by init(). */
static crawler_params_t params;

/* Set to 1 by the --verbose option; enables progress messages on stderr. */
static int verbose = 0;

/* Parse the command line and allocate the webbase handle. */
static void init(int argc, char** argv);
/* Run the consistency checks (and the repairs when --repair is set). */
static void checker();
/* Rebuild the url, url_md5 and url_key fields as requested by --keys*. */
static void fix_keys();
/* Release resources and exit the process. */
static void finish();

/*
 * Program entry point.
 *
 * After the command line is parsed, the program works in one of two modes:
 * key rebuilding when any of the --keys* options was given, consistency
 * checking otherwise. finish() releases the resources and calls exit(0),
 * so the trailing return only matters if finish() ever returns.
 */
int main(int argc, char** argv)
{
  int rebuild_keys;

  init(argc, argv);

  rebuild_keys = params.keys ||
                 params.keys_md5 ||
                 params.keys_normalize ||
                 params.keys_url;

  if(rebuild_keys) {
    fix_keys();
  } else {
    checker();
  }

  finish();
  return 0;
}

/*
 * Context handed to fix_one_key() through the opaque argument of
 * sql_select().
 */
typedef struct fix_key {
  /* Name of the table whose rows are being fixed. */
  char* table;
} fix_key_t;

/*
 * Allocate a query buffer of `size` bytes for fix_one_key().
 * Running out of memory is fatal, like the other unrecoverable errors
 * of fix_one_key().
 */
static char* fix_one_key_buffer(size_t size)
{
  char* buffer = (char*)malloc(size);

  if(buffer == 0) {
    fprintf(stderr, "fix_one_key: cannot allocate %lu bytes\n", (unsigned long)size);
    exit(1);
  }
  return buffer;
}

/*
 * Abort if snprintf() reported an error or had to truncate the query:
 * sending a truncated statement to the server could alter the wrong rows.
 */
static void fix_one_key_formatted(int length, size_t size)
{
  if(length < 0 || (size_t)length >= size) {
    fprintf(stderr, "fix_one_key: query does not fit in %lu bytes\n", (unsigned long)size);
    exit(1);
  }
}

/*
 * Row callback of sql_select(), called once per (rowid, url) row of the
 * table named in the fix_key_t pointed to by argp.
 *
 *  - When --keys_normalize is set the url is put in canonical form and the
 *    url field is updated if the canonical form differs.
 *  - The url_md5 field is fed with the md5 of the (canonical) url.
 *    If this update fails with a duplicate key error the record is
 *    removed; any other error is fatal.
 *  - When params.keys_url is set the url_key field is fed with the
 *    (canonical) url.
 *  - If the url cannot be put in canonical form an error message is
 *    issued and the record is left untouched.
 *
 * argp: fix_key_t* cast to char*.
 * res:  result set the row belongs to (unused).
 * row:  row[0] is the rowid, row[1] the url.
 *
 * NOTE(review): the string returned by uri_cannonicalize_string() is not
 * released here, as in the original code -- confirm who owns it.
 */
static void fix_one_key(char* argp, MYSQL_RES* res, MYSQL_ROW row)
{
  fix_key_t* arg = (fix_key_t*)argp;
  webbase_t* base = params.base;
  char* rowid = row[0];
  char* url = row[1];
  char* cannonical_url;
  char* query;
  size_t query_size;
  int length;

  (void)res;

  /* A NULL column cannot be handed to strlen() or printed with %s. */
  if(rowid == 0 || url == 0) {
    fprintf(stderr, "fix_one_key: unexpected NULL rowid or url in table %s\n", arg->table);
    return;
  }

  if(params.keys_normalize) {
    cannonical_url = uri_cannonicalize_string(url, strlen(url), URI_STRING_URI_STYLE);
  } else {
    cannonical_url = url;
  }

  if(cannonical_url == 0) {
    fprintf(stderr, "fix_one_key: url = %s at rowid = %s is malformed\n", url, rowid);
    return;
  }

  /*
   * One buffer serves every statement below. It is sized from the actual
   * inputs instead of being a fixed 256 bytes array: the quoted url is
   * assumed to be at most twice as long as the url (the same assumption
   * repair() makes) and 128 bytes cover the fixed text and the md5 string.
   * Should the assumption be wrong, the truncation check aborts instead of
   * overflowing.
   */
  query_size = strlen(arg->table) + 2 * strlen(cannonical_url) + strlen(rowid) + 128;
  query = fix_one_key_buffer(query_size);

  if(strcmp(cannonical_url, url)) {
    /*
     * URL is not canonical, update to canonical form.
     */
    length = snprintf(query, query_size, "update %s set url = '%s' where rowid = %s", arg->table, sql_quote_char_simple(cannonical_url, strlen(cannonical_url)), rowid);
    fix_one_key_formatted(length, query_size);
    smysql_query(&base->mysql, query);
    if(verbose) fprintf(stderr, "fix_one_key: cannonicalize %s to %s\n", url, cannonical_url);
  }

  /*
   * Feed the url_md5 key. mysql_query is used directly (not smysql_query)
   * because a duplicate key error is expected and handled here.
   */
  length = snprintf(query, query_size, "update %s set url_md5 = '%s' where rowid = %s", arg->table, str2md5ascii_simple(cannonical_url, strlen(cannonical_url)), rowid);
  fix_one_key_formatted(length, query_size);
  if(mysql_query(&base->mysql, query)) {
    if(mysql_errno(&base->mysql) == ER_DUP_ENTRY) {
      /*
       * This URL already exists, delete the record. There is nothing
       * left to update for this row afterwards.
       */
      length = snprintf(query, query_size, "delete from %s where rowid = %s", arg->table, rowid);
      fix_one_key_formatted(length, query_size);
      smysql_query(&base->mysql, query);
      if(verbose) fprintf(stderr, "fix_one_key: duplicate for %s removed at rowid %s\n", cannonical_url, rowid);
      free(query);
      return;
    } else {
      /*
       * Other errors are fatal.
       */
      fprintf(stderr, "%s: %s (errno = %d)\n", query, mysql_error(&base->mysql), mysql_errno(&base->mysql));
      exit(1);
    }
  }

  /*
   * Feed the url_key field. The statement used to lack the closing quote
   * after the value and was never sent to the server.
   */
  if(params.keys_url) {
    length = snprintf(query, query_size, "update %s set url_key = '%s' where rowid = %s",
		      arg->table,
		      sql_quote_char_simple(cannonical_url, strlen(cannonical_url)),
		      rowid);
    fix_one_key_formatted(length, query_size);
    smysql_query(&base->mysql, query);
  }

  free(query);
}

/*
 * Fix the keys of every record of `table`.
 *
 * When params.keys_md5 is set, url_md5 is first overwritten with rowid in
 * the whole table. The statement used to be formatted and reported as
 * "done" without ever being sent to the server; it is now executed.
 * NOTE(review): presumably this reset exists so that stale url_md5 values
 * cannot trigger spurious duplicate key errors while fix_one_key()
 * recomputes them -- confirm.
 *
 * Then fix_one_key() is applied to each (rowid, url) row, visited in
 * decreasing rowid order.
 *
 * table: name of the table to process.
 */
static void fix_keys_table(char* table)
{
  char query[256];
  webbase_t* base = params.base;
  fix_key_t arg;
  int length;

  arg.table = table;

  if(params.keys_md5) {
    if(verbose) fprintf(stderr, "fix_keys: %s ... url_md5 = rowid ... ", arg.table);
    length = snprintf(query, sizeof(query), "update %s set url_md5 = rowid", arg.table);
    if(length < 0 || (size_t)length >= sizeof(query)) {
      fprintf(stderr, "fix_keys: table name %s is too long\n", arg.table);
      exit(1);
    }
    smysql_query(&base->mysql, query);
    if(verbose) fprintf(stderr, "done\n");
  }

  length = snprintf(query, sizeof(query), "select rowid,url from %s order by rowid desc", arg.table);
  if(length < 0 || (size_t)length >= sizeof(query)) {
    fprintf(stderr, "fix_keys: table name %s is too long\n", arg.table);
    exit(1);
  }
  sql_select(&base->mysql, 0, query, fix_one_key, (char*)&arg);
}

/*
 * Apply the operations described in the fix_one_key function to each
 * record of table start, then of table url.
 *
 * Option implications resolved here:
 *   --keys            implies --keys_normalize
 *   --keys_normalize  implies --keys_md5 and --keys_url, because the url
 *                     may change and the derived fields must follow.
 *
 * NOTE(review): the previous comment stated that records are visited in
 * decreasing rowid order "so that the oldest record is kept if duplicates
 * are found"; which record actually survives depends on the url_md5 values
 * already stored -- confirm before relying on it.
 */
static void fix_keys()
{
  if(params.keys)
    params.keys_normalize = 1;

  if(params.keys_normalize) {
    params.keys_md5 = 1;
    params.keys_url = 1;
  }

  fix_keys_table("start");

  /*
   * The url table has no url_key field: switch that step off before
   * processing it, and skip the table entirely when only url_key
   * was requested.
   */
  params.keys_url = 0;
  if(params.keys_md5 || params.keys_normalize) {
    fix_keys_table("url");
  }
}

/*
 * Type of the column substituted in a repair statement:
 * INDEX_INT values are inserted as is, INDEX_CHAR values go through
 * sql_quote_char_simple() first (see repair()).
 */
#define INDEX_INT  0x01
#define INDEX_CHAR 0x02

/*
 * Context handed to repair() through the opaque argument of sql_select().
 */
typedef struct repair {
  /* INDEX_INT or INDEX_CHAR. */
  int index_type;
  /* Position, in the selected row, of the column fed to repair_query. */
  int index;
  /* printf-style statement with a single %s receiving the column value. */
  char* repair_query;
} repair_t;

/*
 * Row callback of sql_select(): build the repair statement for one faulty
 * row, print it on stdout and, unless --fake was given, execute it.
 *
 * The statement is arg->repair_query with its %s replaced by the column
 * row[arg->index], quoted for SQL when the column is of type INDEX_CHAR.
 *
 * argp: repair_t* cast to char*.
 * res:  result set the row belongs to (unused).
 * row:  the faulty row.
 *
 * A NULL column is reported and the row is skipped; the code used to
 * carry on and hand the NULL pointer to strlen().
 */
static void repair(char* argp, MYSQL_RES* res, MYSQL_ROW row) {
  repair_t* arg = (repair_t*)argp;
  /* Buffer kept and grown across calls by static_alloc(). */
  static char* query = 0;
  static int query_size = 0;
  char* value = row[arg->index];

  (void)res;

  if(value == 0) {
    fprintf(stderr, "unexpected NULL for index (repair = %s)\n", arg->repair_query);
    return;
  }

  /*
   * Room for the template plus the value, doubled to leave space for
   * SQL quoting. The two characters of the %s being replaced leave room
   * for the terminating null.
   */
  static_alloc(&query, &query_size, strlen(arg->repair_query) + strlen(value) * 2);

  sprintf(query, arg->repair_query, (arg->index_type == INDEX_INT ? value : sql_quote_char_simple(value, strlen(value))));

  printf("%s\n", query);

  if(!params.fake)
    smysql_query(&params.base->mysql, query);
}

/*
 * Run one consistency check and, if requested, the associated repair.
 *
 * The check query is first run with `title` and no row callback. When
 * that call returns non-zero and --repair is set, the same query is run
 * again with repair() as row callback.
 *
 * title:        heading passed to sql_select() for the check.
 * query:        select statement returning the faulty rows.
 * index_type:   INDEX_INT or INDEX_CHAR, type of the repaired column.
 * index:        position of that column in the selected rows.
 * repair_query: printf-style statement applied to each faulty row.
 */
static void checker_1(char* title, char* query, int index_type, int index, char* repair_query)
{
  repair_t fix;

  if(!sql_select(&params.base->mysql, title, query, 0, 0))
    return;
  if(!params.repair)
    return;

  fix.repair_query = repair_query;
  fix.index = index;
  fix.index_type = index_type;
  sql_select(&params.base->mysql, "", query, repair, (char*)&fix);
}

/*
 * Run every consistency check on the url, url_complete, start and
 * start2url tables.
 *
 * Each check is a select statement run through sql_select() with a title.
 * Checks run through checker_1() also carry a repair statement that is
 * applied to the faulty rows when --repair is set. With --repair, urls
 * containing a space are deleted before the checks (printed only when
 * --fake is set as well).
 */
static void checker()
{
  char query[10 * 1024];
  webbase_t* base = params.base;

  printf("================= consistency checks start for %s ==============\n", base->name);
  if(params.repair) {
    /* Named differently from the query buffer above, which it used to shadow. */
    char* purge_query = "delete from url where url like '% %'";
    printf("%s\n", purge_query);
    if(!params.fake) smysql_query(&base->mysql, purge_query);
  }

  /*
   * url_complete flags in url: a field of url_complete must be set if and
   * only if the matching flag is set in url.info.
   */

  /* Flag cleared in url.info although the url_complete field is set. */
#define CHECK_FLAG_UNSET(flag,field) \
  do { \
    sprintf(query, "select url.url,url.complete_rowid from url,url_complete where url.complete_rowid = url_complete.rowid and url.info & 0x%x = 0 and url_complete." #field " is not null", flag); \
    checker_1("\nURLs without " #flag " but complete_url." #field " not null\n", query, INDEX_CHAR, 0, "delete from url where url = '%s'"); \
  } while(0)
  CHECK_FLAG_UNSET(WEBBASE_URL_INFO_BASE, base_url);
  CHECK_FLAG_UNSET(WEBBASE_URL_INFO_RELATIVE, relative);
  CHECK_FLAG_UNSET(WEBBASE_URL_INFO_ABSOLUTE, absolute);
  CHECK_FLAG_UNSET(WEBBASE_URL_INFO_LOCATION, location);
#undef CHECK_FLAG_UNSET

  /* Flag set in url.info although the url_complete field is null. */
#define CHECK_FLAG_SET(flag,field) \
  do { \
    sprintf(query, "select url.url,url.complete_rowid from url,url_complete where url.complete_rowid = url_complete.rowid and url.info & 0x%x != 0 and url_complete." #field " is null", flag); \
    checker_1("\nURLs with " #flag " but complete_url." #field " is null\n", query, INDEX_CHAR, 0, "delete from url where url = '%s'"); \
  } while(0)
  CHECK_FLAG_SET(WEBBASE_URL_INFO_BASE, base_url);
  CHECK_FLAG_SET(WEBBASE_URL_INFO_RELATIVE, relative);
  CHECK_FLAG_SET(WEBBASE_URL_INFO_ABSOLUTE, absolute);
  CHECK_FLAG_SET(WEBBASE_URL_INFO_LOCATION, location);
#undef CHECK_FLAG_SET

  /* Any of these flags requires a url_complete record. */
  sprintf(query, "select url,info from url where complete_rowid = 0 and info & 0x%x\n",
	  WEBBASE_URL_INFO_BASE |
	  WEBBASE_URL_INFO_RELATIVE |
	  WEBBASE_URL_INFO_ABSOLUTE |
	  WEBBASE_URL_INFO_LOCATION);
  checker_1("\nURLS which info field imply complete_rowid not null and it is null\n", query, INDEX_CHAR, 0, "delete from url where url = '%s'");

  /*
   * start2url
   */
  checker_1("\nURLs without a reference in start2url (table url):\n",
	    "select start2url.url,url.url from url left join start2url on url.rowid = start2url.url where start2url.url is null",
	    INDEX_CHAR,
	    1,
	    "delete from url where url = '%s'");

  checker_1("\nentries in start2url referencing non existent entry in url\n",
	    "select url.rowid,start2url.start,start2url.url from start2url left join url on start2url.url = url.rowid where url.rowid is null",
	    INDEX_INT,
	    2,
	    "delete from start2url where url = %s");

  checker_1("\nentries in start2url referencing non existent entry in start\n",
	    "select start.rowid,start2url.start,start2url.url from start2url left join start on start2url.start = start.rowid where start.rowid is null",
	    INDEX_INT,
	    1,
	    "delete from start2url where start = %s");

  /* Reported only, no repair statement is associated with this check. */
  sql_select(&base->mysql,
	     "\nentries in start2url with level < 0\n",
	     "select start,url from start2url where level < 0", 0, 0);

  /*
   * url and url_complete
   */
  checker_1("\nZombie URLs additional information (table url_complete):\n",
	    "select url.rowid,url_complete.rowid from url_complete left join url on url.complete_rowid = url_complete.rowid where url.rowid is null",
	    INDEX_INT,
	    1,
	    "delete from url_complete where rowid = %s");

  /* Reported only, no repair statement is associated with this check. */
  sprintf(query, "select url_complete.rowid,url.url,url.complete_rowid from url left join url_complete on url.complete_rowid = url_complete.rowid where url.info & 0x%x != 0 and url_complete.rowid is null", WEBBASE_URL_INFO_COMPLETE);
  sql_select(&base->mysql,
	     "\nURLs with COMPLETE flag but no record in complete\n",
	     query, 0, 0);

  printf("================= consistency checks end for %s ==============\n", base->name);
}

/*
 * Release what init() acquired (the copy of the --log argument and the
 * webbase handle) and terminate the process with a success status.
 * This function does not return.
 */
void finish()
{
  /* free() ignores a null pointer, which is what log holds without --log. */
  free(params.log);
  webbase_free(params.base);
  exit(0);
}

/*
 * Parse the command line and set up the program.
 *
 * The options of this program are merged with the ones returned by
 * webbase_options(), the command line is scanned with getopt_long_only(),
 * then the webbase handle is allocated and the log file installed if
 * --log was given.
 *
 * Options that merely set a flag are handled by getopt itself. Options
 * not known here (the merged ones) are ignored in the loop.
 * NOTE(review): presumably they are interpreted by webbase_alloc(), which
 * receives the same argc, argv and option table -- confirm.
 *
 * An unknown option or an allocation failure terminates the program with
 * status 1.
 */
static void init(int argc, char** argv)
{
  static struct option long_options[MAX_OPTIONS + 1] =
  {
    /* These options set a flag. */
    {"verbose", 0, &verbose, 1},
    {"fake", 0, &params.fake, 1},
    {"keys", 0, &params.keys, 1},
    {"keys_md5", 0, &params.keys_md5, 1},
    {"keys_normalize", 0, &params.keys_normalize, 1},
    {"keys_url", 0, &params.keys_url, 1},
    {"log", 1, 0, 0},
    {"repair", 0, &params.repair, 1},
    {"verbose_sqlutil", 0, 0, 0},
    {0, MAX_OPTIONS, 0, APPLICATION_OPTIONS}
  };

  getopt_merge(long_options, webbase_options(long_options));

  /* Restart the scan from the beginning of argv. */
  optind = 0;
  for(;;) {
    /* `getopt_long' stores the option index here. */
    int option_index = 0;
    int c = getopt_long_only(argc, argv, "", long_options, &option_index);
    const char* name;

    /* Detect the end of the options. */
    if(c == -1)
      break;

    if(c != 0) {
      fprintf(stderr, "option parse error %c, 0x%x\n", c & 0xff, c);
      exit(1);
    }

    /* If this option set a flag, do nothing else now. */
    if(long_options[option_index].flag != 0)
      continue;

    name = long_options[option_index].name;
    if(!strcmp(name, "log")) {
      /* Drop the copy made by an earlier --log so that it is not leaked. */
      free(params.log);
      params.log = strdup(optarg);
      if(params.log == 0) {
	fprintf(stderr, "init: cannot allocate memory for the --log argument\n");
	exit(1);
      }
    } else if(!strcmp(name, "verbose_sqlutil")) {
      sqlutil_verbose(1);
    }
  }

  params.base = webbase_alloc(argc, argv, long_options);

  if(params.log) logfile(params.log);

  /*
   * NOTE(review): --verbose only ever stores 1 in verbose, so this dump
   * looks unreachable from the option table above -- confirm.
   */
  if(verbose > 1) getopt_dump(long_options);
}
