Huffman.H

00001 
00002 /*
00003   This file is part of Aleph system
00004 
00005   Copyright (c) 2002, 2003, 2004, 2005, 2006
00006   UNIVERSITY LOS ANDES (ULA) Merida - REPÚBLICA BOLIVARIANA DE VENEZUELA  
00007 
00008   - Center of Studies in Microelectronics & Distributed Systems (CEMISID) 
00009   - ULA Computer Science Department
00010   - FUNDACITE Mérida - Ministerio de Ciencia y Tecnología
00011 
00012   PERMISSION TO USE, COPY, MODIFY AND DISTRIBUTE THIS SOFTWARE AND ITS
00013   DOCUMENTATION IS HEREBY GRANTED, PROVIDED THAT BOTH THE COPYRIGHT
00014   NOTICE AND THIS PERMISSION NOTICE APPEAR IN ALL COPIES OF THE
00015   SOFTWARE, DERIVATIVE WORKS OR MODIFIED VERSIONS, AND ANY PORTIONS
00016   THEREOF, AND THAT BOTH NOTICES APPEAR IN SUPPORTING DOCUMENTATION.
00017 
00018   Aleph is distributed in the hope that it will be useful, but WITHOUT
00019   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
00020   or FITNESS FOR A PARTICULAR PURPOSE.
00021 
00022   UNIVERSIDAD DE LOS ANDES requests users of this software to return to 
00023 
00024   Leandro Leon
00025   CEMISID 
00026   Ed La Hechicera 
00027   3er piso, ala sur
00028   Facultad de Ingenieria 
00029   Universidad de Los Andes 
00030   Merida - REPÚBLICA BOLIVARIANA DE VENEZUELA    or
00031 
00032   lrleon@ula.ve
00033 
00034   any improvements or extensions that they make and grant Universidad 
00035   de Los Andes (ULA) the rights to redistribute these changes. 
00036 
00037   Aleph is (or was) granted by: 
00038   - Consejo de Desarrollo Cientifico, Humanistico, Tecnico de la ULA 
00039     (CDCHT)
00040   - Fundacite Mérida
00041 */
00042 
00043 
00044 # ifndef HUFFMAN_H
00045 # define HUFFMAN_H
00046 
00047 # include <memory>
00048 # include <autosprintf.h>
00049 # include <tpl_treap.H>
00050 # include <tpl_binHeap.H>
00051 # include <tpl_dynMapTree.H>
00052 # include <bitArray.H>
00053 
00054 namespace Aleph {
00055 
00056 class Huffman_Node;
00057 
00058 typedef DynMapTree<Treap_Vtl, string, Huffman_Node *> Symbol_Map;
00059 typedef BinNode< Aleph::pair<string, size_t> > Freq_Node;
00060 
00061 struct Huffman_Node : public BinHeap<size_t>::Node
00062 {
00063   BinNode<string> * bin_node; 
00064 
00065   Freq_Node * freq_node;
00066 
00067   Huffman_Node() : BinHeap<size_t>::Node(0), bin_node(NULL), freq_node(NULL) 
00068   {
00069     /* empty */ 
00070   }
00071 
00072   Huffman_Node(BinNode<string> * node) 
00073     : BinHeap<size_t>::Node(0), bin_node(node), freq_node(NULL)
00074   {
00075     /* empty */
00076   }
00077 
00078   ~Huffman_Node() { /* No debe liberarse memoria de bin_node */ }
00079 
00080 };
00081 typedef BinHeap<size_t> Huffman_Heap;
00082 static inline const size_t & get_freq(Huffman_Node * huffman_node)
00083 
00084 {
00085   return huffman_node->get_key();
00086 }
00087 
00088 static inline void increase_freq(Huffman_Node * huffman_node)
00089 
00090 {
00091   huffman_node->get_key()++;
00092 }
00093 
00094 static inline void set_freq(Huffman_Node * huffman_node, const size_t & freq)
00095 
00096 {
00097   huffman_node->get_key() = freq;
00098 }
00099 
00100 typedef DynMapTree<Treap_Vtl, string, BitArray> Code_Map;
00101 static inline bool is_leaf(BinNode<string> * p)
00102 {
00103   return LLINK(p) == NULL and RLINK(p) == NULL;
00104 }
00105 
00111 class Huffman_Encoder_Engine
00112 {
00113   BinNode<string> * root; 
00114   Huffman_Heap heap;
00115   Symbol_Map   symbol_map; 
00116   Code_Map code_map;
00117 
00118   Freq_Node * freq_root;
00119 
00120   string end_symbol;
00121   size_t text_len;
00122   void build_prefix_encoding(BinNode<string> * root, 
00123                              BitArray &        array, 
00124                              const size_t &    len)
00125   {
00126     if (is_leaf(root))
00127       {
00128         string & str = root->get_key();
00129 
00130         code_map.insert(str, BitArray(array, len));
00131 
00132         return;
00133       }
00134 
00135     array[len] = 0; // ir hacia la izquierda
00136     build_prefix_encoding(LLINK(root), array, len + 1);
00137 
00138     array[len] = 1; // ir hacia la derecha
00139     build_prefix_encoding(RLINK(root), array, len + 1);
00140   }
00141   void build_encoding_map()
00142   {
00143 
00144     I(symbol_map.size() > 0);
00145 
00146     if (root == NULL)
00147       throw domain_error("Huffman encoding tree has not been generated");
00148 
00149     const size_t h = computeHeightRec(root);
00150     BitArray array(h * symbol_map.size());
00151     symbol_map.empty();
00152     build_prefix_encoding(root, array, 0);
00153   }
00154   bool test_end(const string & str) const
00155   {
00156     if (end_symbol == "NO-END")
00157       return false;
00158 
00159     return end_symbol == str;
00160   }
00161   void update_freq(const string & str)
00162   {
00163 
00164     if (root != NULL)
00165       throw domain_error("Huffman encoding tree has already been generated");
00166 
00167     if (test_end(str))
00168       throw domain_error("End symbol has already been inserted");
00169 
00170     Huffman_Node * huffman_node = NULL;
00171 
00172     Huffman_Node ** huffman_node_ptr = symbol_map.test(str);
00173 
00174     if (huffman_node_ptr == NULL) // ¿Fue definido previamente el símbolo?
00175       { // No ==> crear una entrada en symbol_map e insertarlo en heap
00176         auto_ptr<BinNode<string> > bin_node_auto ( new BinNode<string> (str) );
00177 
00178         huffman_node = static_cast<Huffman_Node*>
00179           (heap.insert(new Huffman_Node(bin_node_auto.get())));
00180 
00181         symbol_map.insert(str, huffman_node);
00182 
00183         bin_node_auto.release();
00184       }
00185     else
00186       huffman_node = *huffman_node_ptr; // Ya definido, recuperarlo
00187 
00188     increase_freq(huffman_node); 
00189 
00190     heap.update(huffman_node);
00191   }
00192   static void append_code(BitArray & bit_stream, size_t & bit_stream_len,
00193                           const BitArray & symbol_code)
00194   {
00195     const size_t symbol_code_size = symbol_code.size();
00196 
00197     if (bit_stream_len + symbol_code_size > bit_stream.size())
00198 
00199       {
00200 
00201       bit_stream.resize(5 * (bit_stream_len + symbol_code_size) / 4);
00202 
00203         I(bit_stream_len + symbol_code_size < bit_stream.size());
00204       }
00205 
00206         
00207     for (int i = 0; i < symbol_code_size; i++) 
00208       bit_stream[bit_stream_len++] = symbol_code[i];
00209   }
00210   public:
00212   Huffman_Encoder_Engine() 
00213     : root(NULL), freq_root(NULL), end_symbol("NO-END"), text_len(0)
00214   {
00215     // empty
00216   }
00218   BinNode<string> * get_root() 
00219   {
00220     if (root == NULL)
00221       throw std::domain_error("Huffman tree has not been generated");
00222 
00223     return root; 
00224   }
00241   BinNode<string> * generate_huffman_tree(
00242 
00243   const bool & with_freqs = false
00244 
00245                                           ) 
00246   {
00247     while (heap.size() > 1)
00248       {
00249         Huffman_Node * l_huffman_node = // nodo izquierdo
00250           static_cast <Huffman_Node *> (heap.getMin());
00251 
00252         Huffman_Node * r_huffman_node = // nodo derecho
00253           static_cast <Huffman_Node *> (heap.getMin());
00254         BinNode <string> * bin_node = new BinNode <string>;
00255         Huffman_Node * huffman_node = new Huffman_Node (bin_node);
00256 
00257         LLINK(bin_node) = l_huffman_node->bin_node;
00258         RLINK(bin_node) = r_huffman_node->bin_node;
00259 
00260         const size_t new_freq = get_freq(l_huffman_node) + get_freq(r_huffman_node);
00261 
00262         Aleph::set_freq(huffman_node, new_freq);
00263 
00264         if (with_freqs)
00265           {
00266             Freq_Node *& l_freq_node = l_huffman_node->freq_node;
00267 
00268             if (l_freq_node == NULL)
00269               {
00270                 l_freq_node                    = new Freq_Node;
00271                 l_freq_node->get_key().first   = 
00272                   l_huffman_node->bin_node->get_key();
00273                 l_freq_node->get_key().second = l_huffman_node->get_key();
00274               }
00275 
00276             Freq_Node *& r_freq_node = r_huffman_node->freq_node;
00277 
00278             if (r_freq_node == NULL)
00279               {
00280                 r_freq_node                    = new Freq_Node;
00281                 r_freq_node->get_key().first   = 
00282                   r_huffman_node->bin_node->get_key();
00283                 r_freq_node->get_key().second = r_huffman_node->get_key();
00284               }
00285 
00286             const string str = gnu::autosprintf ("%d", new_freq);
00287 
00288             Freq_Node *& freq_node      = huffman_node->freq_node;
00289             freq_node                   = new Freq_Node;
00290             freq_node->get_key().first  = str;
00291             freq_node->get_key().second = huffman_node->get_key();
00292             LLINK(freq_node)            = l_freq_node;
00293             RLINK(freq_node)            = r_freq_node;
00294           }
00295 
00296         delete l_huffman_node;
00297         delete r_huffman_node;
00298 
00299         heap.insert(huffman_node);
00300       } // Al salir del while,  queda en el heap un solo nodo que contiene
00301         // la raíz del árbol de prefijos  
00302     
00303     Huffman_Node * huffman_root = static_cast <Huffman_Node *> (heap.getMin());
00304 
00305     root = huffman_root->bin_node;
00306 
00307     if (with_freqs)
00308       freq_root = huffman_root->freq_node;
00309 
00310     delete huffman_root;
00311 
00312     build_encoding_map(); // construir mapeo de códigos
00313 
00314     return root;
00315   }
00317   Freq_Node * get_freq_root() 
00318   { 
00319     if (freq_root == NULL)
00320       throw std::domain_error("Huffman tree has not been generated");
00321 
00322     return freq_root; 
00323   }
00333   void set_freq(const string & str, const size_t & freq)
00334 
00335   {
00336     if (root != NULL)
00337       throw domain_error("Huffman encoding tree has already been generated");
00338 
00339     if (test_end(str))
00340       throw domain_error("End symbol has already been inserted");
00341        
00342         // Buscar símbolo str
00343     Huffman_Node ** huffman_node_ptr = symbol_map.test(str);
00344 
00345     if (huffman_node_ptr != NULL) // ¿ya fue definido?
00346       throw std::domain_error // Sí ==> esto es un error!
00347         (gnu::autosprintf("Frequency for symbol %s has already set", 
00348                           str.c_str()));
00349 
00350     auto_ptr<BinNode<string> > bin_node_auto ( new BinNode<string> (str) );
00351 
00352     Huffman_Node * huffman_node = new Huffman_Node(bin_node_auto.get());
00353 
00354     Aleph::set_freq(huffman_node, freq); 
00355 
00356     heap.insert(huffman_node);
00357 
00358     symbol_map.insert(str, huffman_node);
00359 
00360     bin_node_auto.release();
00361   }
00362 
00363   private:
00364 
00365     static const size_t Max_Token_Size = 256;
00366 
00367   public:
00368 
00381   void read_input(char * input
00382 
00383   , const bool & with_freqs = false
00384 
00385                   )
00386 
00387   {
00388     if (root != NULL)
00389       throw domain_error("Huffman encoding tree has already been generated");
00390 
00391     char * curr_stream = input;
00392     char curr_token[Max_Token_Size];
00393     curr_token[1] = '\0';
00394 
00395     while (*curr_stream != '\0')
00396       {
00397         curr_token[0] = *curr_stream++; 
00398 
00399         update_freq(curr_token);
00400         text_len++;
00401       }
00402 
00403     set_end_of_stream("");
00404     generate_huffman_tree(with_freqs);
00405   }
00418   void read_input(ifstream & input, const bool & with_freqs = false)
00419   {
00420     if (root != NULL)
00421       throw domain_error("Huffman encoding tree has already been generated");
00422 
00423     char curr_token[2];
00424     curr_token[0] = curr_token[1] = '\0';
00425 
00426     while (not input.eof())
00427       {
00428         input.read(curr_token, 1); 
00429 
00430         update_freq(curr_token);
00431         text_len++;
00432       }
00433       
00434     set_end_of_stream("");
00435     generate_huffman_tree(with_freqs);
00436   }
00437 
00439   void set_end_of_stream(const string & str)
00440   {
00441     if (test_end(str))
00442       throw domain_error("End symbol has already been inserted");
00443 
00444     if (root != NULL)
00445       throw domain_error("Huffman encoding tree has already been generated");
00446 
00447     auto_ptr<BinNode<string> > bin_node_auto ( new BinNode<string> (str) );
00448 
00449     Huffman_Node * huffman_node =  static_cast<Huffman_Node*>
00450       (heap.insert(new Huffman_Node(bin_node_auto.get())));
00451 
00452     symbol_map.insert(str, huffman_node); 
00453 
00454     bin_node_auto.release();
00455 
00456     end_symbol = str;
00457   }
00471   size_t encode(char * input, BitArray & bit_stream)
00472   {
00473 
00474     if (root == NULL)
00475       throw std::domain_error("Huffman tree has not been generated");
00476 
00477     char * curr_stream = input;
00478     char curr_token[Max_Token_Size];
00479     curr_token[1] = '\0';
00480 
00481     size_t bit_stream_len = 0;
00482 
00483     while (*curr_stream != '\0')
00484       {
00485         curr_token[0] = *curr_stream++; 
00486 
00487         append_code(bit_stream, bit_stream_len, code_map[curr_token]);
00488       }
00489 
00490     append_code(bit_stream, bit_stream_len, code_map[""]);
00491 
00492     return bit_stream_len;
00493   }
00494 
00508   size_t encode(ifstream & input, BitArray & bit_stream)
00509   {
00510     if (root == NULL)
00511       throw std::domain_error("Huffman tree has not been generated");
00512 
00513     char curr_token[2];
00514     curr_token[0] = curr_token[1] = '\0';
00515     size_t bit_stream_len = 0;
00516 
00517     while (not input.eof())
00518       {
00519         input.read(curr_token, 1); 
00520 
00521         append_code(bit_stream, bit_stream_len, code_map[curr_token]);
00522       }
00523 
00524     append_code(bit_stream, bit_stream_len, code_map[""]);
00525 
00526     return bit_stream_len;
00527   }
00528 
00529 };
00530 
00536 class Huffman_Decoder_Engine
00537 {
00538   BinNode<string> * root; 
00539   string end_symbol;
00540   public:
00541 
00550   Huffman_Decoder_Engine(BinNode<string> * p, const string & end)
00551     : root(p), end_symbol(end)
00552   {
00553     // empty
00554   }
00556   BinNode<string> * get_root() 
00557   {
00558     if (root == NULL)
00559       throw std::domain_error("Huffman tree has not been generated");
00560 
00561     return root; 
00562   }
00575   void decode(BitArray &     bit_stream, 
00576               const size_t & bit_stream_len,
00577               ostream &      output)
00578   {
00579     BinNode<string> * p = root;
00580 
00581     for (int i = 0; i < bit_stream_len; ++i)
00582       {
00583         if (bit_stream.read_bit(i) == 0)
00584           p = LLINK(p);
00585         else
00586           p = RLINK(p);
00587 
00588 
00589        if (p == NULL)
00590          throw std::domain_error("Invalid bits sequence");
00591 
00592        if (is_leaf(p)) // ¿es hoja?
00593          {     // sí ==> escribir smíbolo y reiniciar a raíz
00594            const string & symbol = p->get_key();
00595             
00596            if (symbol == end_symbol) // ¿se alcanza final?
00597              break;
00598 
00599            output << symbol;
00600 
00601            p = root;
00602          }
00603       }
00604   }
00605 };
00606 
00607 } // end namespace Aleph
00608 
00609 # endif // HUFFMAN_H
00610 

Leandro R. León