spacer
Overview | Installation | Tyrant manager | Redis manager | Usage from Python | Design spec

An inverted index is an index data structure storing a mapping from content, such as words or numbers, to its locations in a database file, or in a document or a set of documents, in this case allowing full text search [more on wkipedia].

Lua extension

The inverted index for Tokyo Tyrant (and LightCloud) is hacked by Mikio Hirabayashi - the developer of Tokyo Tyrant:

--
-- Inverted index by the Lua extension of Tokyo Tyrant
--


-- constants
DELIMS = " \t\r\n"   -- delimiters of tokenizing
LIMNUM = 2000        -- limit number of kept occurrence
DEFMAX = 10          -- default maximum number of search


-- call back function when starting
function _begin()
   _log("Inverted index started")
end

-- call back function when ending
function _end()
   _log("Inverted index finished")
end

-- register a text into the index
function put(id, text)
   id = tonumber(id)
   if not id or id < 1 then
      return nil
   end
   if not text then
      return nil
   end
   local tokens = _tokenize(text)
   if math.random() < 5 / LIMNUM then
      for i = 1, #tokens do
         token = tokens[i]
         if not _lock(token) then
            _log("lock error")
            return nil
         end
         local ids = {}
         local idsel = _get(token)
         if idsel then
            ids = _unpack("w*", idsel)
         end
         local nids = {}
         local top = #ids - LIMNUM + 2
         if top < 1 then
            top = 1
         end
         for j = top, #ids do
            table.insert(nids, ids[j])
         end
         table.insert(nids, id)
         idsel = _pack("w*", nids)
         if not _put(token, idsel) then
            _log("put error")
            _unlock(token)
            return nil
         end
         _unlock(token)
      end
   else
      local idsel = _pack("w", id)
      for i = 1, #tokens do
         token = tokens[i]
         if not _lock(token) then
            _log("lock error")
            return nil
         end
         if not _putcat(token, idsel) then
            _log("putcat error")
            _unlock(token)
            return nil
         end
         _unlock(token)
      end
   end
   return "ok"
end

-- remove a text from the index
function out(id, text)
   id = tonumber(id)
   if not id or id < 1 then
      return nil
   end
   if not text then
      return nil
   end
   local tokens = _tokenize(text)
   for i = 1, #tokens do
      token = tokens[i]
      if not _lock(token) then
         _log("lock error")
         return nil
      end
      local ids = {}
      local idsel = _get(token)
      if idsel then
         ids = _unpack("w*", idsel)
      end
      local nids = {}
      for j = 0, #ids do
         if ids[j] ~= id then
            table.insert(nids, ids[j])
         end
      end
      idsel = _pack("w*", nids)
      if not _put(token, idsel) then
         _log("put error")
         _unlock(token)
         return nil
      end
      _unlock(token)
   end
   return "ok"
end

-- replace the text
function replace(id, befaft)
   id = tonumber(id)
   if not id or id < 1 then
      return nil
   end
   if not befaft then
      return nil
   end
   local pivot = string.find(befaft, "\n", 1, true)
   if not pivot then
      return nil
   end
   local bef = string.sub(befaft, 1, pivot - 1)
   local aft = string.sub(befaft, pivot + 1)
   if not out(id, bef) then
      return nil
   end
   if not put(id, aft) then
      return nil
   end
   return "ok"
end

-- search the index with a phrase of intersection
function search(phrase, max)
   if not phrase then
      return nil
   end
   max = tonumber(max)
   if not max or max < 0 then
      max = DEFMAX
   end
   local tokens = _tokenize(phrase)
   local tnum = #tokens
   if tnum < 1 then
      return "0\n"
   end
   local idsel = _get(tokens[1])
   local result = _unpack("w*", idsel)



   for i = 2, tnum do
      idsel = _get(tokens[i])
      local ids = _unpack("w*", idsel)
      result = _isect(result, ids)
   end




--   if tnum > 1 then
--      local rset = {}
--      table.insert(rset, result)
--      for i = 2, tnum do
--	 idsel = _get(tokens[i])
--	 result = _unpack("w*", idsel)
--	 table.insert(rset, result)
--      end
--      result = _isect(rset)
--   end



   table.sort(result)
   local rtxt = #result .. "\n"
   local bot = #result - max
   if bot < 1 then
      bot = 1
   end
   for i = #result, bot, -1 do
      if max < 1 then
         break
      end
      rtxt = rtxt .. result[i] .. "\n"
      max = max - 1
   end
   return rtxt
end

-- break a text into an array of tokens
function _tokenize(text)
   local tokens = {}
   local uniq = {}
   for token in string.gmatch(text, "[^" .. DELIMS .. "]+") do
      if #token > 0 and not uniq[token] then
         table.insert(tokens, token)
         uniq[token] = true
      end
   end
   return tokens
end



-- END OF FILE
spacer Login
Powered by Skeletonz
gipoco.com is neither affiliated with the authors of this page nor responsible for its contents. This is a safe-cache copy of the original web site.