How to Huffman encode non-ascii values

问题内容:

My goal is to encode an input file in three different ways and output each respective way with its own output file. The different forms of encoding include: creating a binary pattern for each unique character in an input file, representing the Huffman tree as a stream of characters, and representing the tree as a stream of bits. This code will create the right output for most cases, but one of the test cases is the html file for a website which contains non-ascii values. With the addition of non-ascii values the code improperly enqueues the characters: or orders the priority of recurring characters (ascii or not) incorrectly.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "huffman.h"

TreeNode* _buildTreeNode(const char ascii, const int freq){
  TreeNode* new = malloc(sizeof(TreeNode));
  if(new == NULL){
    return NULL;
  }
  new -> freq = freq;
  new -> ascii = ascii;
  new -> left = NULL;
  new -> right = NULL;
  return new;
}

TreeNode* _altBuildTreeNode(TreeNode* left, TreeNode* right){
  TreeNode* new = malloc(sizeof(TreeNode));
  if(new == NULL){
    return NULL;
  }
  new -> freq = left -> freq + right -> freq;
  new -> left = left;
  new -> right = right;
  return new;
}

MinTreeNode* _buildNode(const char ascii, const int freq){
  MinTreeNode* new = malloc(sizeof(MinTreeNode));
  if(new == NULL){
    return NULL;
  }
  TreeNode* temp = _buildTreeNode(ascii, freq);
  new -> ptr = (void*)temp;
  new -> next = NULL;
  return new;
}

MinTreeNode* _altBuildNode(TreeNode* left, TreeNode* right){
  MinTreeNode* new = malloc(sizeof(MinTreeNode));
  if(new == NULL){
    return NULL;
  }
  TreeNode* temp = _altBuildTreeNode(left, right);
  new -> ptr = (void*)temp;
  new -> next = NULL;
  return new;
}

static int _compare(const TreeNode* first, const TreeNode* second){
  int freq = first -> freq - second -> freq;
  if (freq == 0){
    if((first->left != NULL || (first -> right != NULL))){
      return 1;
    }else{
      return -1;
    }
  } else{
    return freq;
  }
}

static int _altCompare(const TreeNode* first, const TreeNode* second){
  int freq = first -> freq - second -> freq;
  if(freq == 0){
    return (int)(first -> ascii) - (int)(second -> ascii);
  }else{
    return freq;
  }
}

MinTreeNode* _enqueue(MinTreeNode **addr, int freq, char ascii, int (*cmp)(const TreeNode*, const TreeNode*)){
  MinTreeNode* prev = NULL;
  MinTreeNode* curr = *addr;
  int i = 1;
  void* ptr = NULL;
  MinTreeNode* new = _buildNode(ascii, freq);
  if(new == NULL){
    return NULL;
  }
  if(*addr == NULL){
    *addr = new;
    return *addr;
  }
  while(curr != NULL){
    ptr = curr -> ptr;
    if((cmp(new -> ptr, ptr)) < 0){
      if (prev != NULL){
        prev -> next = new;
      }else{
        *addr = new;
      }
      new -> next = curr;
      i = 0;
      break;
    }
    prev = curr;
    curr = curr -> next;
  }
  if (i == 1){
    prev -> next = new;
  }
  return new;
}

MinTreeNode* _altEnqueue(MinTreeNode **addr, TreeNode* left, TreeNode * right, int (*cmp)(const TreeNode*, const TreeNode*)){
  MinTreeNode* prev = NULL;
  MinTreeNode* curr = *addr;
  int i = 1;
  void* ptr = NULL;
  MinTreeNode* new = _altBuildNode(left, right);
  if(new == NULL){
    return NULL;
  }
  if (*addr == NULL){
    *addr = new;
    return *addr;
  }
  while(curr != NULL){
    ptr = curr -> ptr;
    if((cmp(new -> ptr, ptr)) < 0){
      if (prev != NULL){
        prev -> next = new;
      }else{
        *addr = new;
      }
      new -> next = curr;
      i = 0;
      break;
    }
    prev = curr;
    curr = curr -> next;
  }
  if (i == 1){
    prev -> next = new;
  }
  return new;
}

MinTreeNode* _dequeue(MinTreeNode **addr){
  if(*addr == NULL){
    return NULL;
  }
  MinTreeNode* tempNode = *addr;
  if(tempNode -> next == NULL){
    *addr = NULL;
  }else{
    *addr = tempNode -> next;
    tempNode -> next = NULL;
  }
  return tempNode;
}

void _huffmanOutput(FILE* output, TreeNode* new, char* direction, int num){
  if((new -> left == NULL) && (new -> right == NULL)){
    fprintf(output, "%c:%s\n", new -> ascii, direction);
    return;
  }else{
    direction[num] = '0';
    direction[num + 1] = '\0';
    _huffmanOutput(output, new -> left, direction, num + 1);
    direction[num] = '1';
    direction[num + 1] = '\0';
    _huffmanOutput(output, new -> right, direction, num + 1);
  }
}

void _charOutput(FILE* output, TreeNode* new){
  if(new == NULL){
    return;
  }
  _charOutput(output, new -> left);
  _charOutput(output, new -> right);
  if((new -> left == NULL) && (new -> right == NULL)){
      fprintf(output, "%d%c", 1, new -> ascii);
  } else {
      fprintf(output, "%d", 0);
      return;
  }
}

void bitOutput(FILE* output, FILE* output2){
  int pos = 2;
  char element = fgetc(output);
  int size = 0;
  int i = 1;
  int buffZ = 0xFF;
  int buff1 = 0xFF;
  int j = 0;
  int k = 0;
  int tempZ = buffZ >> 8;
  int temp1 = buffZ >> 8;
  int curr = buffZ >> 8;
  while ((element != EOF)){
    if(element == '1'){
      if(i == 1){
        pos = pos - 1;
      }
      buff1 = (buffZ << (8-pos))&(buffZ >> (pos - 1));
      curr = temp1 | buff1 | curr;
      pos = pos + 1;
      i = 0;
      j = 0;
      k = 0;
    } else if (element == '0'){
      if (i == 1){
        pos = pos - 1;
      }
      curr = temp1 | tempZ | curr;
      pos = pos + 1;
      i = 0;
      j = 0;
      k = 0;
    } else{
      if(pos > 1){
        pos = pos - 1;
        size = (int)element >> pos;
        curr = temp1 | size | curr;
        j = 1;
      } else {
        curr = (int)element;
      }
      pos = pos + 8;
    }
    if (pos > 8){
      fwrite(&curr, 1, 1, output2);
      if(j == 1){
        temp1 = (int)element << (16-pos);
        i = 1;
        pos = pos - 6;
      }else{
        temp1 = buffZ >> 8;
        i = 0;
        pos = 1;
      }
      curr = buffZ >> 8;
      buff1 = 0xFF;
      k = 1;
    }
    element = fgetc(output);
  }
  if (k == 0){
    fwrite(&curr, 1, 1, output2);
  }
}

void HuffmanCodes(char* in, char* out1, char* out2, char* out3){
  long int arr[256] = {0};
  int i = 0;
  FILE* input = fopen(in, "r");
  FILE* output1 = fopen(out1, "w");
  FILE* output2 = fopen(out2, "w");
  FILE* output3 = fopen(out3, "w");
  char element;
  while((element = fgetc(input))!= EOF){
    arr[(int)element]++;
  }
  MinTreeNode* head = NULL;
  while(i < 256){
    if(arr[i] != 0){
      _enqueue(&head, arr[i], (char)i, _altCompare);
    }
    i++;
  }
  MinTreeNode* left = NULL;
  MinTreeNode* right = NULL;
  while(head -> next != NULL){
    left = _dequeue(&head);
    right = _dequeue(&head);
    _altEnqueue(&head, left -> ptr, right -> ptr, _compare);
  }
  fclose(input);
  char* direction = malloc(sizeof(char) * 8);
  _huffmanOutput(output1, (TreeNode*)head -> ptr, direction, 0);
  fclose(output1);
  _charOutput(output2, (TreeNode*)head -> ptr);
  fprintf(output2, "%d", 0);
  free(direction);
  fclose(output2);
  FILE* readOutput2 = fopen(out2, "r");
  bitOutput(readOutput2, output3);
  fclose(readOutput2);
  fclose(output3);
}

//////////////header file//////////////////

#include <stdio.h>
#include <stdlib.h>

typedef struct _TreeNode{
  long int freq;
  unsigned char ascii;
  struct _TreeNode* left;
  struct _TreeNode* right;
}TreeNode;

typedef struct _MinTreeNode{
  void* ptr;
  struct _MinTreeNode* next;
}MinTreeNode;

void HuffmanCodes(char* in, char* out1, char* out2, char* out3);

////////////main.c//////////////////

#include <stdlib.h>
#include <stdio.h>
#include "huffman.h"

int main(int argc, char* argv[]){
    if (argc != 5){
        return EXIT_FAILURE;
    } 
    FILE* check = fopen(argv[1], "r");
    if(check == NULL){
      return EXIT_FAILURE;
    }
    fclose(check);
    HuffmanCodes(argv[1], argv[2], argv[3], argv[4]);
    return EXIT_SUCCESS;
}

////////////expected charOutput file///////////////

1a1o01p1c01_1'1j1P1*1[0001]1|01B1J01O1X0001q00001m01d0001-1#1x001g01h01y161401201N1L01&01�1�1@001�01A00171T00001b100001t1<1>00001l1w1:01"001e01F1$1%001\1R1�1�0001{001}1�1K1Z001^1�01�1�0001!001Y1Q1�1�01�1�001+000001,01.01;1v015180001r01u1=1M1I01(01)1E1D1U0019000001i0001 1/111k1301H1S01C1?1G01�001W1z0000001n01f1
01s000000

////////////actual charOutput file/////////////////

1a1o01p1_1C1W01z1j001'001-1#1P1*1[001]1|01B1J01O1X00001q1N1L00000001c1d0001m1g01y1x161400121&1A017000001h1b100001t1<1>00001l1w1T1^1K1Z001@01$01F01%1\01{0001,001:1.0001e01"1;1v01}1Y01M01!1Q01E01I00150001u1=181(1)000001r1i0001 1/111R1+01D01U1?1G0001901k0131H1S000001n01f101s000000

问题评论:

    
Can you explain what “messes up” means? Be specific.
    
Sorry I should have explained in more detail, I’ll go ahead and give this codes output vs what is expected
– JustCProblems
10 hours ago
    
@JustCProblems Did you forget something?
    
If you are experiencing encoding/decoding errors with “non-ASCII values”, chances are that you are handing your “characters” as signed entities. You are also wrong in assuming Huffman is for any kind of ‘characters’ – it is not. It can encode and decode any value. How you interpret those values (as text or whatever) is up to you.
    
@user2564301 Does that mean if I change char to unsigned it will be able to read non ascii values
– JustCProblems
51 mins ago

原文地址:

https://stackoverflow.com/questions/47746888/how-to-huffman-encode-non-ascii-values

Tags:,

添加评论

友情链接:蝴蝶教程