/*
 * Logo Search packages:
 * Sourcecode: calibre version File versions  Download package
 *
 * extra.c
 */
/*
 * extra.c - full-text search support for pychm
 *
 * Copyright (C) 2004 Rubens Ramos <rubensr@users.sourceforge.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, 
 * Boston, MA 02111-1307, USA.
 *
 * Author: Rubens Ramos <rubensr@users.sourceforge.net>
 *
 * Heavily based on work done by:
 * Pabs <pabs@zip.to> - chmdeco
 * Razvan Cojocaru <razvanco@gmx.net> - xCHM
 *
 */

#include "chm_lib.h"
#ifdef __PYTHON__
#include "Python.h"
#else
#include <stdio.h>
#define PyObject void
#endif

#include <stdlib.h>
#include <string.h>

#ifdef _MSC_VER
#include "stdint.h"
#define strcasecmp stricmp
#define strncasecmp strnicmp
#else
#include <inttypes.h>
#include <strings.h>
#endif

#if defined( _MSC_VER ) && !defined( __cplusplus )
# define inline __inline
#endif

#if defined(_WIN32) || defined(__WIN32__)
#       if defined(_MSC_VER)
#               if defined(STATIC_LINKED)
#                       define MODEXPORT(a) a
#                       define MODIMPORT(a) extern a
#               else
#                       define MODEXPORT(a) __declspec(dllexport) a
#                       define MODIMPORT(a) extern a
#               endif
#       else
#               if defined(__BORLANDC__)
#                       define MODEXPORT(a) a _export
#                       define MODIMPORT(a) a _export
#               else
#                       define MODEXPORT(a) a
#                       define MODIMPORT(a) a
#               endif
#       endif
#else
#       define MODEXPORT(a) a
#       define MODIMPORT(a) a
#endif

#define false 0
#define true 1

/* Number of bytes chm_search reads from the start of the /$FIftiMain
   object and treats as the full-text-search header. */
#define FTS_HEADER_LEN 0x32
/* Number of bytes pychm_process_wlc reads for one entry of the topics
   object it is given (chm_search passes /#TOPICS). */
#define TOPICS_ENTRY_LEN 16
/* Size of the scratch buffer used for topic titles and URLs: at most
   COMMON_BUF_LEN - 1 bytes are read into it and the final byte is kept
   as the terminating NUL. */
#define COMMON_BUF_LEN 1025

/* Frees x and resets it to NULL.  Wrapped in do/while(0) so the macro
   expands to exactly one statement and is safe in an unbraced if/else.
   x must be a pointer lvalue; it is evaluated twice. */
#define FREE(x) do { free (x); (x) = NULL; } while (0)

/*
 * Decodes a 16-bit unsigned integer stored least-significant byte first
 * in the two bytes at b.
 *
 * The function is static because a plain C99 `inline` definition does not
 * emit an external symbol, so any call the compiler chooses not to inline
 * would fail to link.
 */
static inline uint16_t 
get_uint16 (const uint8_t* b) {
  return (uint16_t) (b[0] | (b[1] << 8));
}

/*
 * Decodes a 32-bit unsigned integer stored least-significant byte first
 * in the four bytes at b.
 *
 * Every byte is widened to uint32_t before shifting: shifting a promoted
 * int left by 24 overflows (undefined behaviour) when the byte is >= 0x80.
 * Static for the same reason as the other helpers: a plain C99 `inline`
 * definition provides no external symbol to link against.
 */
static inline uint32_t 
get_uint32 (const uint8_t* b) {
  return  (uint32_t) b[0]        |
         ((uint32_t) b[1] << 8)  |
         ((uint32_t) b[2] << 16) |
         ((uint32_t) b[3] << 24);
}

/*
 * Decodes a 64-bit unsigned integer stored least-significant byte first
 * in the eight bytes at b.
 *
 * All bytes are widened to uint64_t before shifting.  Combining the low
 * four bytes as int first would sign-extend the partial result whenever
 * b[3] >= 0x80 and set the upper 32 bits of the result to ones.
 * Static because a plain C99 `inline` definition provides no external
 * symbol to link against.
 */
static inline uint64_t 
get_uint64 (const uint8_t* b) {
  uint64_t value = 0;
  int k;

  /* fold from the most significant byte down to the least */
  for (k = 7; k >= 0; --k)
    value = (value << 8) | (uint64_t) b[k];

  return value;
}

/*
 * Decodes a variable-length integer: each byte contributes its low seven
 * bits and a set top bit (0x80) means another byte follows.  The first
 * byte holds the least-significant group.
 *
 * buffer - start of the encoded value; the caller must guarantee that a
 *          terminating byte (top bit clear) lies inside the buffer.
 * length - receives the number of bytes consumed (always >= 1).
 *
 * Returns the decoded value.  Groups that would land at or above bit 64
 * are consumed but ignored.  The 7-bit group is widened to uint64_t before
 * shifting; shifting it as int loses every group after the fourth byte.
 */
static inline uint64_t 
be_encint (const unsigned char *buffer, size_t *length)
{
  uint64_t result = 0;
  unsigned int shift = 0;
  size_t used = 0;
  unsigned char c;

  do {
    c = buffer[used++];
    if (shift < 64) {
      result |= (uint64_t) (c & 0x7f) << shift;
      shift += 7;
    }
  } while (c & 0x80);

  *length = used;
  return result;
}

/*
  ffus ("find first unset") - counts the run of set bits that starts at
  bit *bit of *byte.  Bits are visited from bit 7 down to bit 0 of a byte
  and then from bit 7 of the following byte.

  byte   - first byte to examine
  bit    - in: bit index (0..7) to start at; out: bit index just past the
           unset bit that ended the run
  length - receives the number of byte boundaries crossed, including the
           one crossed when the terminating unset bit was bit 0

  Returns the number of set bits found before the first unset bit.

  The function has no notion of the buffer size: it keeps reading until it
  meets an unset bit, so the caller must guarantee that one exists.
  Static because a plain C99 `inline` definition provides no external
  symbol to link against.
*/
static inline int
ffus (unsigned char* byte, int* bit, size_t *length) {
  int bits = 0;
  *length = 0;
  
  while (*byte & (1 << *bit)) {
    if (*bit) {
      --(*bit);
    } else {
      /* bit 0 consumed: continue at the top of the next byte */
      ++byte;
      ++(*length);
      *bit = 7;
    }
    ++bits;
  }
  
  /* step over the unset bit that terminated the run */
  if (*bit) {
    --(*bit);
  } else {
    ++(*length);
    *bit = 7;
  }
  
  return bits;
}


/*
 * Decodes one bit-packed integer from a bit stream.
 *
 * The value starts with a run of set bits ended by an unset bit (counted
 * by ffus).  With `count` set bits in that prefix, the payload is
 * r + (count ? count - 1 : 0) bits long; when count is non-zero an
 * implicit 1 bit is placed just above the payload.
 *
 * byte   - byte holding the first bit of the value
 * bit    - in/out: bit index (0..7, 7 = most significant) of the next
 *          unread bit
 * s      - must be 2; any other value is rejected
 * r      - base number of payload bits
 * length - receives the number of byte boundaries crossed while decoding
 *
 * Returns the decoded value, or ~(uint64_t)0 when bit is NULL, *bit is
 * outside 0..7, s != 2, or the implicit leading bit would not fit in
 * 64 bits.  The function cannot see the end of the buffer; the caller
 * must make sure the encoded value lies inside it.
 *
 * Static because a plain C99 `inline` definition provides no external
 * symbol to link against.
 */
static inline uint64_t
sr_int(unsigned char* byte, int* bit,
       unsigned char s, unsigned char r, size_t *length)
{
  uint64_t ret = 0;
  unsigned char mask;
  int n, n_bits, num_bits, base, count;
  size_t fflen;

  *length = 0;
  
  if (!bit || *bit < 0 || *bit > 7 || s != 2)
    return ~(uint64_t)0;
  
  count = ffus(byte, bit, &fflen);
  *length += fflen;
  byte += *length;
  
  n_bits = n = r + (count ? count-1 : 0);
  
  while (n > 0) {
    /* num_bits + 1 bits are taken from the current byte, starting at
       *bit and ending at bit `base` */
    num_bits = n > *bit ? *bit : n-1;
    base = n > *bit ? 0 : *bit - (n-1);
  
    /* num_bits is always within 0..7 here, so the mask fits in a byte */
    mask = (unsigned char) ((1u << (num_bits + 1)) - 1u);
    mask <<= base;

    ret = (ret << (num_bits+1)) |
      (uint64_t)((*byte & mask) >> base);
  
    if (n > *bit) {
      /* the rest of this byte is used up; go on with the next one */
      ++byte;
      ++(*length);
      n -= *bit+1;
      *bit = 7;
    } else {
      *bit -= n;
      n = 0;
    }
  }
  
  if (count) {
    /* shifting by 64 or more is undefined; treat it as a decode error */
    if (n_bits >= 64)
      return ~(uint64_t)0;
    ret |= (uint64_t)1 << n_bits;
  }
  
  return ret;
}

            
/*
 * Walks the index nodes of the full-text-search tree, starting at
 * initial_offset, and returns the offset of the node reached after
 * descending tree_depth - 1 levels towards `text`.
 *
 * Each index node starts with a 16-bit count of unused bytes followed by
 * entries of the form
 *   word_len (1) | pos (1) | word_len - 1 suffix bytes | child offset (4)
 *   | 2 further bytes
 * A word is rebuilt by keeping the first `pos` characters of the previous
 * word and appending the suffix.  The first entry whose word is not less
 * than text (case-insensitive) selects the child to descend into.
 *
 * chmfile        - archive handle
 * text           - word being searched for
 * initial_offset - offset of the root node inside the object `ui`
 * buff_size      - size in bytes of one node
 * tree_depth     - depth of the tree; 1 means the root is already a leaf
 * ui             - unit info of the object that holds the tree
 *
 * Returns the node offset, or 0 when text is NULL, tree_depth is 0,
 * memory runs out, a node cannot be read, the node data is inconsistent,
 * or no entry leads further down.
 *
 * Static because a plain C99 `inline` definition provides no external
 * symbol to link against.
 */
static inline uint32_t
get_leaf_node_offset(struct chmFile *chmfile,
                     const char *text,
                     uint32_t initial_offset,
                     uint32_t buff_size,
                     uint16_t tree_depth,
                     struct chmUnitInfo *ui)
{
  unsigned char word_len;
  unsigned char pos;
  uint16_t free_space;
  uint32_t used;
  uint32_t test_offset = 0;
  uint32_t result = 0;
  size_t i;
  size_t suffix_len;
  size_t word_cur_len = 0;      /* strlen (word) while word != NULL */
  char *word = NULL;
  char *grown;
  unsigned char *buffer;

  /* a depth of 0 would wrap around in the pre-decrement below */
  if (NULL == text || 0 == tree_depth)
    return 0;

  buffer = (unsigned char *)malloc (buff_size);
  if (NULL == buffer)
    return 0;
  
  while (--tree_depth) {
    /* no entry selected a new child on the previous level */
    if (initial_offset == test_offset)
      goto done;
    
    test_offset = initial_offset;
    if (buff_size < sizeof(uint16_t) ||
        chm_retrieve_object (chmfile, ui, buffer, 
                             initial_offset, buff_size) == 0)
      goto done;
    
    free_space = get_uint16 (buffer);
    if (free_space > buff_size)
      goto done;
    used = buff_size - free_space;

    /* entries start right after the free-space field of every node */
    i = sizeof(uint16_t);
    
    while (i < used) {
      if (buff_size - i < 2)
        goto done;

      word_len = *(buffer + i);
      pos = *(buffer + i + 1);

      /* word_len counts one byte beyond the suffix, so 0 is invalid; the
         kept prefix cannot be longer than the previous word */
      if (word_len == 0 || pos > word_cur_len)
        goto done;
      suffix_len = (size_t)word_len - 1;

      /* the suffix and the child offset must lie inside the node */
      if (buff_size - i < 2 + suffix_len + sizeof(uint32_t))
        goto done;

      grown = (char *)realloc (word, (size_t)pos + suffix_len + 1);
      if (NULL == grown)
        goto done;
      word = grown;
      memcpy (word + pos, buffer + i + 2, suffix_len);
      word[pos + suffix_len] = 0;
      word_cur_len = strlen (word);
      
      if (strcasecmp (text, word) <= 0) {
        initial_offset = get_uint32 (buffer + i + word_len + 1);
        break;
      }
      
      i += word_len + sizeof (unsigned char) + sizeof(uint32_t) + 
        sizeof(uint16_t);
    }
  }
  
  /* an unchanged offset means the last level did not select a child */
  if (initial_offset != test_offset)
    result = initial_offset;

 done:
  free (word);
  free (buffer);

  return result;
}

/*
 * Decodes the word-location-code (WLC) block of one indexed word and
 * reports every document the word occurs in.
 *
 * For each of the wlc_count entries the function decodes a document index
 * delta, looks up the topic title and the URL of that document and then
 * skips the location codes of the entry.  In the Python build each result
 * is stored as dict[title] = url; otherwise "url ==> title" is printed.
 *
 * chmfile    - archive handle
 * wlc_count  - number of entries in the block
 * wlc_size   - size of the block in bytes
 * wlc_offset - offset of the block inside the object uimain
 * ds, dr     - s/r parameters (see sr_int) for the document index
 * cs, cr     - s/r parameters for the number of location codes
 * ls, lr     - s/r parameters for each location code
 * uimain     - object holding the WLC block
 * uitbl      - object from which 12 bytes are read at the offset found at
 *              byte 8 of the topics entry (chm_search passes /#URLTBL)
 * uistrings  - object holding the topic titles (/#STRINGS)
 * topics     - object holding the fixed-size topic entries (/#TOPICS)
 * urlstr     - object holding the URL strings (/#URLSTR)
 * dict       - Python dictionary receiving the results (Python build only)
 *
 * Returns true when all entries were processed, false on a read error,
 * an allocation failure or inconsistent data.  Results stored before a
 * failure are left in place.
 *
 * Static because a plain C99 `inline` definition provides no external
 * symbol to link against.
 */
static inline int 
pychm_process_wlc (struct chmFile *chmfile,
                   uint64_t wlc_count, uint64_t wlc_size,
                   uint32_t wlc_offset, unsigned char ds,
                   unsigned char dr, unsigned char cs,
                   unsigned char cr, unsigned char ls,
                   unsigned char lr, struct chmUnitInfo *uimain,
                   struct chmUnitInfo* uitbl,
                   struct chmUnitInfo *uistrings,
                   struct chmUnitInfo* topics,
                   struct chmUnitInfo *urlstr,
                   PyObject *dict)
{
  const uint64_t sr_error = ~(uint64_t)0;   /* failure value of sr_int */
  uint32_t stroff, urloff;
  uint64_t i, j, count, delta;
  size_t length;
  int wlc_bit = 7;
  size_t off = 0;
  uint64_t index = 0;
  int ok = false;
  unsigned char entry[TOPICS_ENTRY_LEN];
  unsigned char combuf[COMMON_BUF_LEN];
  unsigned char *buffer;
  char *url = NULL;
  char *topic = NULL;

  /* reject empty blocks and sizes that do not fit in size_t */
  if (wlc_size == 0 || (uint64_t)(size_t)wlc_size != wlc_size)
    return false;

  /* zero-filled so that a short read never exposes uninitialised bytes */
  buffer = (unsigned char *)calloc (1, (size_t)wlc_size);
  if (NULL == buffer)
    return false;

  if (chm_retrieve_object(chmfile, uimain, buffer, 
                          wlc_offset, wlc_size) == 0)
    goto done;

  for (i = 0; i < wlc_count; ++i) {
    
    /* every entry starts on a byte boundary */
    if(wlc_bit != 7) {
      ++off;
      wlc_bit = 7;
    }

    if (off >= wlc_size)
      goto done;

    /* document indices are stored as deltas to the previous entry */
    delta = sr_int(buffer + off, &wlc_bit, ds, dr, &length);
    if (delta == sr_error)
      goto done;
    index += delta;
    off += length;

    memset (entry, 0, sizeof entry);
    if(chm_retrieve_object(chmfile, topics, entry, 
                           index * TOPICS_ENTRY_LEN,
                           TOPICS_ENTRY_LEN) == 0)
      goto done;

    /* title: offset into the strings object at byte 4 of the entry */
    stroff = get_uint32 (entry + 4);

    FREE (topic);
    memset (combuf, 0, sizeof combuf);
    if (chm_retrieve_object (chmfile, uistrings, combuf, 
                             stroff, COMMON_BUF_LEN - 1) == 0) {
      topic = strdup ("Untitled in index");
    } else {
      topic = strdup ((char*)combuf);
    }
        
    /* URL: byte 8 of the entry points into uitbl, whose bytes 8..11 in
       turn point into urlstr */
    urloff = get_uint32 (entry + 8);

    memset (combuf, 0, sizeof combuf);
    if(chm_retrieve_object (chmfile, uitbl, combuf, 
                            urloff, 12) == 0)
      goto done;

    urloff = get_uint32 (combuf + 8);
    
    memset (combuf, 0, sizeof combuf);
    if (chm_retrieve_object (chmfile, urlstr, combuf, 
                             urloff + 8, COMMON_BUF_LEN - 1) == 0)
      goto done;

    FREE (url);
    url = strdup ((char*)combuf);

    if (url && topic) {
#ifdef __PYTHON__
      /* PyDict_SetItemString does not take over the reference, so the
         value has to be released here */
      PyObject *value = PyString_FromString (url);
      int stored = -1;

      if (value) {
        stored = PyDict_SetItemString (dict, topic, value);
        Py_DECREF (value);
      }
      if (stored != 0) {
        /* failure is reported through the return value only; do not
           leave a Python exception pending behind the caller's back */
        PyErr_Clear ();
        goto done;
      }
#else
      printf ("%s ==> %s\n", url, topic);
#endif
    }
        
    if (off >= wlc_size)
      goto done;

    count = sr_int (buffer + off, &wlc_bit, cs, cr, &length);
    if (count == sr_error)
      goto done;
    off += length;
    
    /* the location codes themselves are not needed, only skipped */
    for (j = 0; j < count; ++j) {
      if (off >= wlc_size)
        goto done;
      if (sr_int (buffer + off, &wlc_bit, ls, lr, &length) == sr_error)
        goto done;
      off += length;
    }
  }

  ok = true;

 done:
  free (topic);
  free (url);
  free (buffer);

  return ok;
}

/* Returns 1 when the variable-length integer starting at buf[pos] ends
   before buf[limit], 0 otherwise. */
static int
fts_encint_in_bounds (const unsigned char *buf, size_t pos, size_t limit)
{
  while (pos < limit) {
    if (!(buf[pos++] & 0x80))
      return 1;
  }
  return 0;
}

/*
 * Full-text search for `text` in the archive's /$FIftiMain index.
 *
 * The leaf node that may contain the word is located with
 * get_leaf_node_offset; its entries are then scanned.  Each leaf starts
 * with the offset of the next leaf (4 bytes), 2 further bytes and a
 * 16-bit count of unused bytes.  A leaf entry consists of
 *   word_len (1) | pos (1) | word_len - 1 suffix bytes | title flag (1)
 *   | WLC count (encint) | WLC offset (4) | 2 bytes | WLC size (encint)
 * and every matching entry is handed to pychm_process_wlc.
 *
 * chmfile     - archive handle
 * text        - word (or prefix) to look for; compared case-insensitively
 * whole_words - non-zero: only an exact match counts and the search stops
 *               at the first one; zero: every word starting with text
 *               matches and following leaves are scanned as well
 * titles_only - non-zero: entries whose title flag byte is 0 are skipped
 * dict        - passed through to pychm_process_wlc
 *
 * Returns -1 when text is NULL.  Returns false (0) when a required object
 * is missing, the header is unusable, memory runs out, no leaf is found or
 * a leaf cannot be read.  For whole_words the result of pychm_process_wlc
 * for the matching word is returned (false if none matched); otherwise
 * true is returned when at least one word started with text.  When
 * inconsistent leaf data is met the scan stops and the result so far is
 * returned.
 */
int 
chm_search (struct chmFile *chmfile,
            const char *text, int whole_words, 
            int titles_only, PyObject *dict)
{
  /* next-leaf offset + 2 bytes + free-space count */
  const size_t leaf_header_len =
    sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint16_t);
  unsigned char header[FTS_HEADER_LEN];
  unsigned char doc_index_s;
  unsigned char doc_index_r;
  unsigned char code_count_s;
  unsigned char code_count_r;
  unsigned char loc_codes_s;
  unsigned char loc_codes_r;
  unsigned char word_len, pos;
  unsigned char title;
  unsigned char *buffer = NULL;
  char *word = NULL;
  char *grown;
  size_t word_cur_len = 0;      /* strlen (word) while word != NULL */
  size_t suffix_len;
  size_t text_len;
  size_t i;
  size_t encsz;
  uint32_t node_offset;
  uint32_t next_offset;
  uint32_t node_len;
  uint32_t used;
  uint16_t tree_depth;
  uint16_t free_space;
  uint64_t wlc_count, wlc_size;
  uint32_t wlc_offset;
  struct chmUnitInfo ui, uitopics, uiurltbl, uistrings, uiurlstr;
  int partial = false;

  if (NULL == text)
    return -1;

  if (chm_resolve_object (chmfile, "/$FIftiMain", &ui) !=
      CHM_RESOLVE_SUCCESS || 
      chm_resolve_object (chmfile, "/#TOPICS", &uitopics) !=
      CHM_RESOLVE_SUCCESS ||
      chm_resolve_object (chmfile, "/#STRINGS", &uistrings) !=
      CHM_RESOLVE_SUCCESS ||
      chm_resolve_object (chmfile, "/#URLTBL", &uiurltbl) !=
      CHM_RESOLVE_SUCCESS ||
      chm_resolve_object (chmfile, "/#URLSTR", &uiurlstr) !=
      CHM_RESOLVE_SUCCESS)
    return false;

  /* zero-filled so that a short read cannot leave stale header fields */
  memset (header, 0, sizeof header);
  if(chm_retrieve_object(chmfile, &ui, header, 0, FTS_HEADER_LEN) == 0)
    return false;
  
  doc_index_s = header[0x1E];
  doc_index_r = header[0x1F];
  code_count_s = header[0x20];
  code_count_r = header[0x21];
  loc_codes_s = header[0x22];
  loc_codes_r = header[0x23];

  /* sr_int only decodes values whose s parameter is 2 */
  if(doc_index_s != 2 || code_count_s != 2 || loc_codes_s != 2)
    return false;

  node_offset = get_uint32 (header + 0x14);
  node_len = get_uint32 (header + 0x2e);
  tree_depth = get_uint16 (header + 0x18);

  /* a node must at least hold the leaf header */
  if (node_len < leaf_header_len)
    return false;

  text_len = strlen (text);
  
  node_offset = get_leaf_node_offset (chmfile, text, node_offset, node_len,
                                      tree_depth, &ui);
  if (!node_offset)
    return false;

  buffer = (unsigned char*)malloc (node_len);
  if (NULL == buffer)
    return false;
  
  for (;;) {
    if (chm_retrieve_object (chmfile, &ui, buffer, 
                             node_offset, node_len) == 0) {
      partial = false;
      goto done;
    }
    
    next_offset = get_uint32 (buffer);
    free_space = get_uint16 (buffer + 6);
    if (free_space > node_len)
      goto done;
    used = node_len - free_space;
    
    i = leaf_header_len;

    while (i < used) {
      if (node_len - i < 2)
        goto done;

      word_len = *(buffer + i);
      pos = *(buffer + i + 1);

      /* word_len counts one byte beyond the suffix, so 0 is invalid; the
         kept prefix cannot be longer than the previous word */
      if (word_len == 0 || pos > word_cur_len)
        goto done;
      suffix_len = (size_t)word_len - 1;

      /* suffix and title flag must lie inside the node */
      if (node_len - i < 2 + suffix_len + 1)
        goto done;

      grown = (char*)realloc (word, (size_t)pos + suffix_len + 1);
      if (NULL == grown)
        goto done;
      word = grown;
      memcpy (word + pos, buffer + i + 2, suffix_len);
      word[pos + suffix_len] = 0;
      word_cur_len = strlen (word);
      
      i += 2 + (size_t)word_len;
      title = *(buffer + i - 1);

      if (!fts_encint_in_bounds (buffer, i, node_len))
        goto done;
      wlc_count = be_encint (buffer + i, &encsz);
      i += encsz;
      
      if (node_len - i < sizeof(uint32_t) + sizeof(uint16_t))
        goto done;
      wlc_offset = get_uint32 (buffer + i);
      i += sizeof(uint32_t) + sizeof(uint16_t);

      if (!fts_encint_in_bounds (buffer, i, node_len))
        goto done;
      wlc_size =  be_encint (buffer + i, &encsz);
      i += encsz;
      
      if (!title && titles_only)
        continue;
      
      if (whole_words) {
        if (!strcasecmp(text, word)) {
          partial = pychm_process_wlc (chmfile, wlc_count, wlc_size, 
                                       wlc_offset, doc_index_s, 
                                       doc_index_r,code_count_s, 
                                       code_count_r, loc_codes_s, 
                                       loc_codes_r, &ui, &uiurltbl,
                                       &uistrings, &uitopics,
                                       &uiurlstr, dict);
          goto done;
        }
      } else if (!strncasecmp (word, text, text_len)) {
        partial = true;
        pychm_process_wlc (chmfile, wlc_count, wlc_size, 
                           wlc_offset, doc_index_s, 
                           doc_index_r,code_count_s, 
                           code_count_r, loc_codes_s, 
                           loc_codes_r, &ui, &uiurltbl,
                           &uistrings, &uitopics,
                           &uiurlstr, dict);
      } else if (strncasecmp (text, word, text_len) < 0) {
        /* text sorts before this word: stop scanning this leaf */
        break;
      }
    }

    /* Continue with the next leaf only for prefix searches whose last
       word still matched; the comparison is case-insensitive like the
       matching above. */
    if (whole_words || NULL == word ||
        strncasecmp (word, text, text_len) != 0)
      break;

    /* 0 ends the chain; a leaf pointing at itself would never end */
    if (next_offset == 0 || next_offset == node_offset)
      break;

    node_offset = next_offset;
  }

 done:
  free (word);
  free (buffer);

  return partial;
}

/*
 * One place where chm_get_lcid looks for the locale ID: the name of an
 * object inside the archive and the byte offset within that object at
 * which the 32-bit value is read.
 */
typedef struct {
  const char *file;
  int offset;
} Langrec;

/*
 * Candidate locations, tried in this order by chm_get_lcid.
 * NOTE(review): the last two names have no leading '/', unlike every
 * other object name used in this file -- confirm they can be resolved.
 */
Langrec lang_files[] = {
  {"/$FIftiMain",               0x7E},
  {"$WWKeywordLinks/BTree",     0x34},
  {"$WWAssociativeLinks/BTree", 0x34}
};

/* Number of entries in lang_files. */
#define LANG_FILES_SIZE (sizeof(lang_files)/sizeof(Langrec))

/*
 * Returns the locale ID (LCID) of the archive.
 *
 * The objects listed in lang_files are tried in order; for the first one
 * that resolves and can be read, the four bytes at its offset are decoded
 * as a little-endian 32-bit value, like every other multi-byte field in
 * this file, so the result does not depend on the host byte order.
 *
 * Returns the value converted to int, or -1 when none of the objects
 * could be resolved and read.
 */
int
chm_get_lcid (struct chmFile *chmfile) {
  struct chmUnitInfo ui;
  unsigned char raw[sizeof(uint32_t)];
  size_t i;

  for (i = 0; i < LANG_FILES_SIZE; i++) {
  
    if (chm_resolve_object (chmfile, lang_files[i].file, &ui) != 
        CHM_RESOLVE_SUCCESS)
      continue;

    /* zero-filled so that a short read yields defined bytes */
    memset (raw, 0, sizeof raw);

    if (chm_retrieve_object (chmfile, &ui, raw, 
                             lang_files[i].offset, sizeof raw) != 0)
      return (int) get_uint32 (raw);
  }

  return -1;
}

#ifdef __PYTHON__

/*
 * Python: is_searchable(chmfile) -> int
 *
 * Returns 1 when the archive contains every object chm_search needs
 * (/$FIftiMain, /#TOPICS, /#STRINGS, /#URLTBL and /#URLSTR), otherwise 0.
 * Raises TypeError when the argument is missing or does not wrap a
 * chmFile pointer.
 */
static PyObject *
is_searchable (PyObject *self, PyObject *args) {
  static const char *const required[] = {
    "/$FIftiMain", "/#TOPICS", "/#STRINGS", "/#URLTBL", "/#URLSTR"
  };
  struct chmFile *file;
  PyObject *obj0;
  struct chmUnitInfo ui;
  size_t k;

  if (!PyArg_ParseTuple (args, "O:is_searchable", &obj0)) {
    PyErr_SetString(PyExc_TypeError, "Expected chmfile (not CHMFile!)");
    return NULL;
  }

  file = (struct chmFile *) PyCObject_AsVoidPtr(obj0);

  /* a NULL handle must never reach chmlib */
  if (NULL == file) {
    PyErr_SetString(PyExc_TypeError, "Expected chmfile (not CHMFile!)");
    return NULL;
  }

  for (k = 0; k < sizeof(required) / sizeof(required[0]); k++) {
    if (chm_resolve_object (file, required[k], &ui) !=
        CHM_RESOLVE_SUCCESS)
      return Py_BuildValue ("i", 0);
  }

  return Py_BuildValue ("i", 1);
}

/*
 * Python: search(chmfile, text, whole_words, titles_only) -> (int, dict)
 *
 * Runs chm_search and returns a tuple of its return value and a new
 * dictionary filled with the results.  Raises TypeError for wrong
 * arguments or an object that does not wrap a chmFile pointer, and
 * MemoryError when the dictionary cannot be created.
 */
static PyObject *
search (PyObject *self, PyObject *args) {
  char *text;
  int whole_words;
  int titles_only;
  int partial;
  struct chmFile *file;
  PyObject *obj0;
  PyObject *dict;
  PyObject *result;

  if (!PyArg_ParseTuple (args, "Osii:search", &obj0, &text, 
                         &whole_words, &titles_only)) {
    PyErr_SetString(PyExc_TypeError,
                    "Expected chmfile (not CHMFile!), string, int, int");
    return NULL;
  }

  file = (struct chmFile *) PyCObject_AsVoidPtr(obj0);

  /* a NULL handle must never reach chmlib */
  if (NULL == file) {
    PyErr_SetString(PyExc_TypeError,
                    "Expected chmfile (not CHMFile!), string, int, int");
    return NULL;
  }

  dict = PyDict_New();
  if (NULL == dict) {
    PyErr_NoMemory();
    return NULL;
  }

  partial = chm_search (file, 
                        text, whole_words, titles_only, dict);

  /* the "O" format adds its own reference to dict, so the one obtained
     from PyDict_New has to be released here or the dictionary leaks */
  result = Py_BuildValue ("(iO)", partial, dict);
  Py_DECREF (dict);

  return result;
}

/*
 * Python: get_lcid(chmfile) -> int or None
 *
 * Returns the value of chm_get_lcid, or None when it reports -1 (no
 * locale ID found).  Raises TypeError when the argument is missing or
 * does not wrap a chmFile pointer.
 */
static PyObject *
get_lcid (PyObject *self, PyObject *args) {
  int code;
  struct chmFile *file;
  PyObject *obj0;

  if (!PyArg_ParseTuple (args, "O:get_lcid", &obj0)) {
    PyErr_SetString(PyExc_TypeError,"Expected a chmfile (not a CHMFile!)");
    return NULL;
  }

  file = (struct chmFile *) PyCObject_AsVoidPtr(obj0);

  /* a NULL handle must never reach chmlib */
  if (NULL == file) {
    PyErr_SetString(PyExc_TypeError,"Expected a chmfile (not a CHMFile!)");
    return NULL;
  }

  code = chm_get_lcid (file);

  if (code != -1)
    return Py_BuildValue ("i", code);

  Py_INCREF(Py_None);
  return Py_None;
}

/* Method table of the chm_extra module: maps the Python-visible names to
   the wrapper functions defined above.  All of them take positional
   arguments only (METH_VARARGS); the all-NULL entry ends the table. */
static PyMethodDef
IndexMethods[] = {
  {"get_lcid", get_lcid, METH_VARARGS, 
   "Returns LCID (Locale ID) for archive."},
  {"search", search, METH_VARARGS, 
   "Perform Full-Text search."},
  {"is_searchable", is_searchable, METH_VARARGS, 
   "Return 1 if it is possible to search the archive, 0 otherwise."},
  {NULL, NULL, 0, NULL}
};

/* Module initialisation function: registers the module "chm_extra" with
   the methods listed in IndexMethods.  It is given C linkage when the
   file is compiled as C++ and is exported through MODEXPORT. */
#ifdef __cplusplus
extern "C"
#endif
MODEXPORT(void)
initchm_extra (void) {
  Py_InitModule ("chm_extra", IndexMethods);
}

#else

/*
 * Stand-alone test driver (built when __PYTHON__ is not defined).
 *
 * Usage: <program> <filename>
 *
 * Opens the archive, prints its LCID and then repeatedly reads
 * "<whole_words> <titles_only> <string>" from standard input, runs
 * chm_search and prints the result.  The loop ends at end of input or at
 * the first line that does not supply all three fields.
 *
 * Returns 0 after a normal run or after printing the usage line, and -1
 * when the archive cannot be opened.
 */
int
main (int argc, char **argv) {
  struct chmFile *file;
  char text[255];
  int whole_words, titles_only;
  int partial;
  int lcid;

  if (argc != 2) {
    printf ("\n%s <filename>\n", argc > 0 ? argv[0] : "extra");
    return 0;
  }

  file = chm_open (argv[1]);
  if (!file)
    return -1;

  lcid = chm_get_lcid (file);
  printf ("\nLCID= %d (%08X)\n", lcid, (unsigned int) lcid);

  for (;;) {
    printf ("\n<whole_words> <titles_only> <string>\n");
    printf ("> ");
    fflush (stdout);

    /* %254s keeps the word inside text[]; anything but three converted
       fields (including EOF) ends the session */
    if (scanf ("%d %d %254s", &whole_words, &titles_only, text) != 3)
      break;

    partial = chm_search (file, 
                          text, whole_words, titles_only, NULL);
    printf ("Partial = %d\n", partial);
  }

  chm_close (file);
  return 0;
}

#endif

/* Generated by  Doxygen 1.6.0   Back to index */