哈夫曼树 (Huffman Tree) 的理解与代码实现

发表于 2020-10-27 更新于 2020-12-08 分类于算法，数据结构

本文字数： 7.3k

哈夫曼编码 —— 二进制前缀编码，以 n 个字符的出现频率作为权重，以求得字长最短的二进制编码

哈夫曼树 —— 采用二叉树的形式储存编码格式

前缀编码：字符集中任一字符的编码都不是其它字符的编码的前缀

数据结构课代码实现之哈夫曼树

简要理解

在已知每个字符结点及其权重的情况下，每次选择两个权重最小的字符结点，创建一个新的结点

将新结点的左右孩子结点设置为选择的两个字符结点，设置新结点权重为两子结点权重之和

设置两个字符结点的双亲结点为新结点

标记两个子结点已被选择过，并将在下次选择时将新结点设为可选择

有 n 个字符的哈夫曼树即有 n 个叶子结点的哈夫曼树共有 2n - 1 个结点

所以在 n - 1 次选择过后哈夫曼树即被建立

具体理解与代码实现

声明

#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <cstdlib>
#include <map>
#include <utility>

#define swap(a, b) {int c = a; a = b; b = c;}

using namespace std;

typedef struct
{
    unsigned int weight;
    unsigned int parent, lchild, rchild;
    char data;
} HTNode, *HuffmanTree;

typedef char **HuffmanCode;

bool *visit;                    // 储存结点是否被选择过 

map<char, unsigned int> x;      // 读取模式2 下字符与权值储存在 map 容器中

struct w
{
    unsigned int w;             // 在 Select 函数中排序临时使用的结构体容器
    int pos;
};

bool cmp(w a, w b)
{
    if (a.w < b.w)
        return true;
    else
        return false;
}

Select 函数

void Select(HuffmanTree HT, int i, int &s1, int &s2)
{
    w we[i];                    // 使用临时的结构体 排序不改变原结构顺序
    for (int j = 1; j <= i; j++)
    {
        we[j].w = HT[j].weight;
        we[j].pos = j;
    }
    sort(we + 1, we + i + 1, cmp);  // 未使用 0 地址
    for (int j = 1; j <= i; j++)
    {
        if (!visit[we[j].pos])
        {
            if (s1 == 0)
                s1 = we[j].pos;
            else if (s2 == 0)
            {
                s2 = we[j].pos;
                break;
            }
        }
    }
    if (s1 > s2)
        swap(s1, s2);                // 确保 s1 <= s2
    visit[s1] = true;
    visit[s2] = true;
}

哈夫曼建树编码的过程

读取模式1 (预先设置好每个字符的权重手动输入对应字符)

方法一 (由叶子结点到根编码)

void HuffmanCoding(HuffmanTree &HT, HuffmanCode &HC, unsigned int *w, int n)
{
    if (n < 1)
        return;
    int m = 2 * n - 1;
    HT = (HuffmanTree)malloc((m + 1) * sizeof(HTNode)); // 2n - 1 个空间，不使用 0 地址所以多开一个位置
    HuffmanTree p = HT + 1;
    int i;
    cout << "请输入各字符：(不需要空格分开，回车结束)" << endl;
    for (i = 1; i <= n; ++i, ++p, ++w)
    {
        char x;
        x = getchar();
        *p = {*w, 0, 0, 0, x};
    }
    for (; i <= m; ++i)
    {
        int s1 = 0;
        int s2 = 0;
        Select(HT, i - 1, s1, s2);
        HT[s1].parent = i;
        HT[s2].parent = i;
        HT[i].lchild = s1;
        HT[i].rchild = s2;
        HT[i].weight = HT[s1].weight + HT[s2].weight;
    }

    HT[m].parent = 0;       //编码时访问至根节点会查询根节点的双亲节点 所以初始化成0
    // 查了好久才发现的问题...
    HC = (HuffmanCode)malloc((n + 1) * sizeof(char *));
    char *cd = (char *)malloc(n * sizeof(char));        // cd 是临时储存编码过程的空间
    cd[n - 1] = '\0';
    for (i = 1; i <= n; ++i)
    {
        int start = n - 1;
        int f, c;
        for (c = i, f = HT[i].parent; f != 0; c = f, f = HT[f].parent)
        {
            if (HT[f].lchild == c)
                cd[--start] = '0';
            else
                cd[--start] = '1';
        }
        HC[i] = (char *)malloc((n - start) * sizeof(char));
        strcpy(HC[i], &cd[start]);          // \0前的所有内容复制至 HC[i] 包括 \0
    }
    free(cd);
}

方法二 (由根向叶子结点编码)

void HuffmanCoding2(HuffmanTree &HT, HuffmanCode &HC, unsigned int* w, int n)
{
    
    /* 省略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略 */
    
    /* 同方法一的数据读入 */
    
    char *cd = (char *)malloc(n * sizeof(char));
    HC = (HuffmanCode)malloc((n + 1) * sizeof(char *));
    int p = 2 * n - 1;
    int cdlen = 0;              // 编码过程中维护的 cd 长度
    HT[p].parent = 0;
    for (int i = 1; i <= p; i++)
        HT[i].weight = 0;
    while (p)                   // 最后会指向 2n - 1 结点的父亲 设置的是 0
    {
        if (HT[p].weight == 0)
        {
            HT[p].weight = 1;   // 标记该结点左孩子结点已被遍历过
            if (HT[p].lchild != 0)
            {
                p = HT[p].lchild;
                cd[cdlen++] = '0';
            }
            else if (HT[p].rchild == 0)
            {
                HC[p] = (char *)malloc((cdlen + 1) * sizeof(char));
                cd[cdlen] = '\0';
                strcpy(HC[p], cd);      // 左右孩子结点都为空为叶子结点 完成编码一次
            }
        }
        else if (HT[p].weight == 1)
        {
            HT[p].weight = 2;           // 标记该结点右孩子结点已被遍历过
            if (HT[p].rchild != 0)
            {
                p = HT[p].rchild;
                cd[cdlen++] = '1';
            }
        }
        else
        {
            HT[p].weight = 0;
            p = HT[p].parent;           // 退回双亲结点
            --cdlen;                    // 长度减一
        }
    }
}

读取模式2 (输入一段话来计算字符与相应权重)

void HuffmanCoding3(HuffmanTree &HT, HuffmanCode &HC,int n)
{
    int i;
    int m = 2 * n - 1;
    HT = (HuffmanTree)malloc((m + 1) * sizeof(HTNode));
    HuffmanTree p = HT + 1;
    map<char, unsigned int>::iterator it;       // 迭代器遍历 map
    cout << "字符" << " " << "权重" << endl;
    for (it = x.begin(); it != x.end(); it++)
    {
        *p = {it->second, 0, 0, 0, it->first};
        p++;
        cout << it->first << "    " << it->second << endl;
    }
    
    /* 省略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略略 */
    
    /* 同读取模式1的建树编码 */
    
}

输入原字段输出哈夫曼编码

void prtHuf(HuffmanTree HT, HuffmanCode HC, int n)
{
    char *read;
    read = (char *)malloc(1000 * sizeof(char));     // 不够你就开大点
    char a;
    a = getchar();
    int cnt = 0;
    while (a != '#')
    {
        for (int i = 1; i <= n; i++)
        {
            if (a == HT[i].data)
            {
                strcpy(read + cnt, HC[i]);      //暴力写法，不要在意
                cnt += strlen(HC[i]);
            }
        }
        a = getchar();
    }
    getchar();
    cout << "Huffman编码：" << endl;
    cout << read << endl;
}

输入哈夫曼编码输出原字段

void prtData(HuffmanTree HT, int n)
{
    char a;
    int pos = 2 * n - 1;            // 由根节点开始译码
    char *ans = (char *)malloc(1000 * sizeof(char));        // 不够你就开大点
    int cnt = 0;
    a = getchar();
    while (a != '#')
    {
        if (a == '0')
            pos = HT[pos].lchild;
        if (a == '1')
            pos = HT[pos].rchild;
        if (HT[pos].lchild == 0 || HT[pos].rchild == 0)
        {
            ans[cnt++] = HT[pos].data;
            pos = 2 * n - 1;                // 到叶子结点后返回根节点
        }
        a = getchar();
    }
    getchar();
    ans[++cnt] = '\0';
    cout << "原字段：" << endl;
    cout << ans;
}

main函数

int main()
{
    cout << "1.手动输入字符与权重" << endl << "2.字段自动识别" << endl << "请输入相应序号：" << endl;
    int mode;
    cin >> mode;
    getchar();
    if (mode == 1)
    {
        int n;
        cout << "请输入字符个数：" << endl;
        cin >> n;
        unsigned int *w = (unsigned int *)malloc(n * sizeof(unsigned int));
        visit = (bool *)malloc((2 * n + 1) * sizeof(bool));
        for (int i = 0; i <= 2 * n + 1; i++)
            visit[i] = false;
        cout << "请输入各字符权重：(空格分开回车结束)" << endl;
        for (int i = 0; i < n; i++)
            cin >> w[i];
        getchar();
        HuffmanCode Code, Code1;
        HuffmanTree Tree;
        HuffmanCoding(Tree, Code, w, n);
        cout << "请输入原字段,以#结束： " << endl;
        prtHuf(Tree, Code, n);
        cout << "请输入Huffman编码,以#结束：" << endl;
        prtData(Tree, n);
    }
    else if (mode == 2)
    {
        cout << "请输入一段话：(以#结束)" << endl;
        char a;
        a = getchar();
        while (a != '#')
        {
            if (!x.count(a))
            {
                x[a] = 1;
            }
            else
            {
                x[a]++;
            }
            a = getchar();
        }
        int n = x.size();
        unsigned int *w = (unsigned int *)malloc(n * sizeof(unsigned int));
        visit = (bool *)malloc((2 * n + 1) * sizeof(bool));
        for (int i = 0; i <= 2 * n + 1; i++)
            visit[i] = false;
        HuffmanCode Code;
        HuffmanTree Tree;
        HuffmanCoding3(Tree, Code, n);
        cout << "请输入原字段,以#结束： " << endl;
        prtHuf(Tree, Code, n);
        cout << "请输入Huffman编码,以#结束：" << endl;
        prtData(Tree, n);
    }
    return 0;
}

给个数据

1.手动输入字符与权重
2.字段自动识别
请输入相应序号：
2
请输入一段话：(以#结束)
Youth is not a time of life; it is a state of mind. It is not a matter of rosy cheeks, red lips and supple knees. It is a matter of the will, a quality of the imagination, vigor of the emotions; it is the freshness of the deep spring of life.#
字符 权重
     51
,    3
.    3
;    2
I    2
Y    1
a    12
c    1
d    4
e    22
f    11
g    3
h    8
i    20
k    2
l    7
m    6
n    10
o    16
p    5
q    1
r    7
s    16
t    22
u    3
v    1
w    1
y    2
请输入原字段,以#结束：
I think i am a pop man.#
Huffman编码：
1001101000101001011001101010011100011000001100111000011000110111101011011100011100110110101111111
请输入Huffman编码,以#结束：
1001101000101001011001101010011100011000001100111000011000110111101011011100011100110110101111111#
原字段：
I think i am a pop man.

1.手动输入字符与权重
2.字段自动识别
请输入相应序号：
1
请输入字符个数：
8
请输入各字符权重：(空格分开回车结束)
15 7 5 4 9 2 1 6
请输入各字符：(不需要空格分开，回车结束)        
ILOVEYU                       // U 后面有个空格
请输入原字段,以#结束：
I LOVE YOU#
Huffman编码：
100111100101110000111111001011111
请输入Huffman编码,以#结束：
100111100101110000111111001011111#
原字段：
I LOVE YOU