问渠网-机器学习笔记-第四节程序实现

本文作者：李德强
第四节程序实现
为了让程序能够读取训练样本的数据，我们将样本文字描述修改为数字：
色泽	敲声	纹理	好瓜
0	0	0	1
1	1	0	1
1	0	0	1
0	1	0	1
2	0	0	1
1	1	1	0
0	2	0	0
2	2	1	0
2	0	1	0
0	0	1	0
然后把数据做成训练样本读入程序。
//决策树
typedef struct s_dtree
{
	//当前节点样本
	int* sample_x;
	//样本数
	int sample_c;
	//特征数
	int sample_f;
	//当前特征号
	int sample_fi;
	//当前特征值
	int sample_v;
	//样本索引
	int sample_ind;
	//分类值
	int sample_y;
	//孩子节点
	struct s_dtree *child;
	//兄弟节点
	struct s_dtree *sibling;
} s_dtree;

//链表
typedef struct s_slist
{
	//关键字值
	int key;
	//关键字出现的次数
	int count;
	//下一节点
	struct s_slist* next;
} s_slist;


//决策树根节点
s_dtree *root = NULL;

//计算关键字出现的次数
int slist_key_count(s_slist* list, int key)
{
	if (list == NULL)
	{
		return -1;
	}

	//头节点
	if (list->key == key)
	{
		list->count++;
		return 0;
	}

	s_slist *p = list;
	//查找关键字
	while (p->next != NULL)
	{
		//关键字相同的节点
		if (p->next->key == key)
		{
			//出现个数加1
			p->next->count++;
			return 0;
		}
		p = p->next;
	}

	//如果没找到关键字，创建一个新节点
	s_slist *pnew = malloc(sizeof(s_slist));
	pnew->key = key;
	pnew->count = 1;
	pnew->next = NULL;
	p->next = pnew;

	return 0;
}

//取得关键字出现的次数
int slist_get_count(s_slist *list, int key)
{
	s_slist *p = list;
	while (p != NULL)
	{
		if (p->key == key)
		{
			return p->count;
		}
		p = p->next;
	}
	return 0;
}

//释放内存资源
int slist_free(s_slist *list)
{
	s_slist *p = list;
	while (p != NULL)
	{
		s_slist *d = p;
		p = p->next;
		free(d);
	}

	return 0;
}

//计算香农熵
double dt_ent_value(s_slist *feature, int csample)
{
	if (feature == NULL)
	{
		return 0;
	}
	//计算特征每一种取值的概率
	double px = (double) feature->count / (double) csample;
	//递归计算香农熵
	return -px * log2(px) + dt_ent_value(feature->next, csample);
}

//计算每一个特征的香农熵，并返回熵最大的特征号
int dt_ent(s_slist *features, int* sample_x, int csample, int cfeature)
{
	//最大香农熵
	double max_ent = 0;
	//最大熵的特征
	int max_feature = 0;
	//计算每一个特征的香农熵
	for (int j = 0; j < cfeature; j++)
	{
		//计算特征每一种取值的个数
		for (int i = 0; i < csample; i++)
		{
			//链表头
			if (i == 0)
			{
				//特征取值
				features[j].key = sample_x[i * cfeature + j];
				//出现个数
				features[j].count = 1;
				features[j].next = NULL;
				continue;
			}
			//计算特征每一种取值的个数
			slist_key_count(&features[j], sample_x[i * cfeature + j]);
		}
		//计算香农熵
		double ent = dt_ent_value(&features[j], csample);
		//找到最大香农熵
		if (ent > max_ent)
		{
			max_ent = ent;
			//等到具有最大香农熵的特征号
			max_feature = j;
		}
	}
	//返回具有最大香农熵的特征号
	return max_feature;
}

//构建决策树的节点
int dt_create_node(s_dtree *node, int *sample_x, int row, s_slist *features, int feature, int count_feature)
{
	if (node == NULL)
	{
		return -1;
	}

	//取得当前样本特征值
	int value = sample_x[row * count_feature + feature];
	//如果与当前节点值相同
	if (node->sample_v == value)
	{
		//复制样本数据到节点中的样本子集
		for (int j = 0, k = 0; j < count_feature; j++)
		{
			if (j != feature)
			{
				node->sample_x[node->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
				k++;
			}
		}
		//样本索引号加1
		node->sample_ind++;
		return 0;
	}

	s_dtree *p = node;
	//查找当前节点的兄弟节点
	while (p->sibling != NULL)
	{
		//如果兄弟节点值相同
		if (p->sibling->sample_v == value)
		{
			//复制样本数据到节点中的样本子集
			for (int j = 0, k = 0; j < count_feature; j++)
			{
				if (j != feature)
				{
					p->sibling->sample_x[p->sibling->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
					k++;
				}
			}
			//样本索引号加1
			p->sibling->sample_ind++;
			return 0;
		}
		p = p->sibling;
	}

	//如果没有找到值相同的节点，则创建一个新节点
	s_dtree *pnew = malloc(sizeof(s_dtree));
	//样本特征值
	pnew->sample_v = sample_x[row * count_feature + feature];
	//具有相同值的样本数
	int s_count = slist_get_count(&features[feature], pnew->sample_v);
	//样本数
	pnew->sample_c = s_count;
	//样本索引
	pnew->sample_ind = 0;
	//样本子集
	pnew->sample_x = malloc(sizeof(int) * s_count * (count_feature - 1));
	for (int j = 0, k = 0; j < count_feature; j++)
	{
		if (j != feature)
		{
			pnew->sample_x[pnew->sample_ind * (count_feature - 1) + k] = sample_x[row * count_feature + j];
			k++;
		}
	}
	//样本索引号加1
	pnew->sample_ind++;
	//特征数
	pnew->sample_f = count_feature - 1;
	//当前节点特征号
	pnew->sample_fi = feature;
	//分类鸮
	pnew->sample_y = -1;
	pnew->child = NULL;
	pnew->sibling = NULL;
	//加入新节点
	p->sibling = pnew;

	return 0;
}

//创建决策树
s_dtree* dt_create(s_dtree *p_node, int* sample_x, int csample, int cfeature)
{
	if (sample_x == NULL)
	{
		return NULL;
	}

	//训练样本的特征等于1,说明只有最后一个特征，即分类号
	if (cfeature <= 1)
	{
		//计算分类号出现的次数
		s_slist *ps = malloc(sizeof(s_slist));
		for (int i = 0; i < csample; i++)
		{
			if (i == 0)
			{
				ps->key = sample_x[i * cfeature];
				ps->count = 1;
				ps->next = NULL;
				continue;
			}
			//计算分类号出现的次数
			slist_key_count(ps, sample_x[i * cfeature]);
		}
		//找出出现次数最多的分类
		s_slist *p = ps;
		int max_count = 0;
		int max_y = 0;
		while (p != NULL)
		{
			if (p->count > max_count)
			{
				max_count = p->count;
				max_y = p->key;
			}
			p = p->next;
		}
		slist_free(ps);
		//得到分类号
		p_node->sample_y = max_y;
		return NULL;
	}

	//当前等待处理的节点
	s_dtree *node = NULL;
	//特征数组，为计算出现样本数、香农熵用
	s_slist *features = malloc(sizeof(s_slist) * cfeature);
	//计算香农熵
	int feature = dt_ent(features, sample_x, csample, cfeature);
	//将样本按不用的取值划分到不用节点上，每一个节点即为一个数据划分子集
	for (int i = 0; i < csample; i++)
	{
		if (i == 0)
		{
			//新节点
			node = malloc(sizeof(s_dtree));
			//样本特征值
			node->sample_v = sample_x[i * cfeature + feature];
			//样本子集数量
			int s_count = slist_get_count(&features[feature], node->sample_v);
			//样本子集数量
			node->sample_c = s_count;
			//样本子集索引号
			node->sample_ind = 0;
			//样本子集
			node->sample_x = malloc(sizeof(int) * s_count * (cfeature - 1));
			//复制样本数据到子集中
			for (int j = 0, k = 0; j < cfeature; j++)
			{
				if (j != feature)
				{
					node->sample_x[node->sample_ind * cfeature + k] = sample_x[i * cfeature + j];
					k++;
				}
			}
			//样本子集索引号加1
			node->sample_ind++;
			//节点特征数
			node->sample_f = cfeature - 1;
			//节点当前特征号
			node->sample_fi = feature;
			//分类号
			node->sample_y = -1;
			node->child = NULL;
			node->sibling = NULL;
			//如果父节点为空
			if (p_node == NULL)
			{
				//即为根节点
				root = node;
				continue;
			}
			//将新节点挂接到父节点上
			p_node->child = node;
			continue;
		}
		//根据训练样本划分子集，创建节点
		dt_create_node(node, sample_x, i, features, feature, cfeature);
	}
	//释放资源
	slist_free(features);

	//递归处理所有节点
	s_dtree *p = node;
	while (p != NULL)
	{
		//递归创建决策树
		dt_create(p, p->sample_x, p->sample_c, cfeature - 1);
		p = p->sibling;
	}

	return node;
}

//释放内存资源
int dt_free(s_dtree *node)
{
	s_dtree *p = node;
	if (p == NULL)
	{
		return 0;
	}
	s_dtree *d = p;
	dt_free(p->sibling);
	dt_free(p->child);
	if (d->sample_x != NULL)
	{
		//释放样本内存
		free(d->sample_x);
	}
	//释放树节点内存
	free(d);

	return 0;
}

//使用决策树分类
int dt_classify(s_dtree *node, int *sample_x)
{
	if (node == NULL)
	{
		return -1;
	}

	//取得样本在当前节点中特征号的值
	int v = sample_x[node->sample_fi];
	//如果是最底层节点，特征号为1,即分类
	if (node->sample_f == 1)
	{
		s_dtree *p = node;
		//如果样本特征值与节点样本特征值相同
		if (node->sample_v == v)
		{
			//返回分类号
			return node->sample_y;
		}
		//递归使用决策树分类用其兄弟节点分类
		return dt_classify(node->sibling, sample_x);
	}

	//如果样本特征值与节点样本特征值相同
	if (node->sample_v == v)
	{
		//新样本，去掉当前特征
		int sample_target[node->sample_f];
		//复制数据到新样本
		for (int i = 0, k = 0; i < node->sample_f; i++)
		{
			//去掉当前特征
			if (i != node->sample_fi)
			{
				sample_target[k] = sample_x[i];
				k++;
			}
		}
		//递归子节点
		return dt_classify(node->child, sample_target);
	}
	//递归兄弟节点
	return dt_classify(node->sibling, sample_x);
}