diff --git a/Others/DataMining_BayesNetwork/BayesNetWorkTool.java b/Others/DataMining_BayesNetwork/BayesNetWorkTool.java new file mode 100644 index 0000000..cbf99ae --- /dev/null +++ b/Others/DataMining_BayesNetwork/BayesNetWorkTool.java @@ -0,0 +1,328 @@ +package DataMining_BayesNetwork; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * 贝叶斯网络算法工具类 + * + * @author lyq + * + */ +public class BayesNetWorkTool { + // 联合概率分布数据文件地址 + private String dataFilePath; + // 事件关联数据文件地址 + private String attachFilePath; + // 属性列列数 + private int columns; + // 概率分布数据 + private String[][] totalData; + // 关联数据对 + private ArrayList attachData; + // 节点存放列表 + private ArrayList nodes; + // 属性名与列数之间的对应关系 + private HashMap attr2Column; + + public BayesNetWorkTool(String dataFilePath, String attachFilePath) { + this.dataFilePath = dataFilePath; + this.attachFilePath = attachFilePath; + + initDatas(); + } + + /** + * 初始化关联数据和概率分布数据 + */ + private void initDatas() { + String[] columnValues; + String[] array; + ArrayList datas; + ArrayList adatas; + + // 从文件中读取数据 + datas = readDataFile(dataFilePath); + adatas = readDataFile(attachFilePath); + + columnValues = datas.get(0).split(" "); + // 属性割名称代表事件B(盗窃),E(地震),A(警铃响).M(接到M的电话),J同M的意思, + // 属性值都是y,n代表yes发生和no不发生 + this.attr2Column = new HashMap<>(); + for (int i = 0; i < columnValues.length; i++) { + // 从数据中取出属性名称行,列数值存入图中 + this.attr2Column.put(columnValues[i], i); + } + + this.columns = columnValues.length; + this.totalData = new String[datas.size()][columns]; + for (int i = 0; i < datas.size(); i++) { + this.totalData[i] = datas.get(i).split(" "); + } + + this.attachData = new ArrayList<>(); + // 解析关联数据对 + for (String str : adatas) { + array = str.split(" "); + this.attachData.add(array); + } + + // 构造贝叶斯网络结构图 + constructDAG(); + } + + /** + * 从文件中读取数据 + */ + private ArrayList readDataFile(String filePath) { + File 
file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + while ((str = in.readLine()) != null) { + dataArray.add(str); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + return dataArray; + } + + /** + * 根据关联数据构造贝叶斯网络无环有向图 + */ + private void constructDAG() { + // 节点存在标识 + boolean srcExist; + boolean desExist; + String name1; + String name2; + Node srcNode; + Node desNode; + + this.nodes = new ArrayList<>(); + for (String[] array : this.attachData) { + srcExist = false; + desExist = false; + + name1 = array[0]; + name2 = array[1]; + + // 新建节点 + srcNode = new Node(name1); + desNode = new Node(name2); + + for (Node temp : this.nodes) { + // 如果找到相同节点,则取出 + if (srcNode.isEqual(temp)) { + srcExist = true; + srcNode = temp; + } else if (desNode.isEqual(temp)) { + desExist = true; + desNode = temp; + } + + // 如果2个节点都已找到,则跳出循环 + if (srcExist && desExist) { + break; + } + } + + // 将2个节点进行连接 + srcNode.connectNode(desNode); + + // 根据标识判断是否需要加入列表容器中 + if (!srcExist) { + this.nodes.add(srcNode); + } + + if (!desExist) { + this.nodes.add(desNode); + } + } + } + + /** + * 查询条件概率 + * + * @param attrValues + * 条件属性值 + * @return + */ + private double queryConditionPro(ArrayList attrValues) { + // 判断是否满足先验属性值条件 + boolean hasPrior; + // 判断是否满足后验属性值条件 + boolean hasBack; + int priorIndex; + int attrIndex; + double backPro; + double totalPro; + double pro; + double currentPro; + // 先验属性 + String[] priorValue; + String[] tempData; + + pro = 0; + totalPro = 0; + backPro = 0; + attrValues.get(0); + priorValue = attrValues.get(0); + // 得到后验概率 + attrValues.remove(0); + + // 取出先验属性的列数 + priorIndex = this.attr2Column.get(priorValue[0]); + // 跳过第一行的属性名称行 + for (int i = 1; i < this.totalData.length; i++) { + tempData = this.totalData[i]; + + hasPrior = false; + hasBack = true; + + // 当前行的概率 + currentPro = Double.parseDouble(tempData[this.columns - 1]); + // 判断是否满足先验条件 + if 
(tempData[priorIndex].equals(priorValue[1])) { + hasPrior = true; + } + + for (String[] array : attrValues) { + attrIndex = this.attr2Column.get(array[0]); + + // 判断值是否满足条件 + if (!tempData[attrIndex].equals(array[1])) { + hasBack = false; + break; + } + } + + // 进行计数统计,分别计算满足后验属性的值和同时满足条件的个数 + if (hasBack) { + backPro += currentPro; + if (hasPrior) { + totalPro += currentPro; + } + } else if (hasPrior && attrValues.size() == 0) { + // 如果只有先验概率则为纯概率的计算 + totalPro += currentPro; + backPro = 1.0; + } + } + + // 计算总的概率=都发生概率/只发生后验条件的时间概率 + pro = totalPro / backPro; + + return pro; + } + + /** + * 根据贝叶斯网络计算概率 + * + * @param queryStr + * 查询条件串 + * @return + */ + public double calProByNetWork(String queryStr) { + double temp; + double pro; + String[] array; + // 先验条件值 + String[] preValue; + // 后验条件值 + String[] backValue; + // 所有先验条件和后验条件值的属性值的汇总 + ArrayList attrValues; + + // 判断是否满足网络结构 + if (!satisfiedNewWork(queryStr)) { + return -1; + } + + pro = 1; + // 首先做查询条件的分解 + array = queryStr.split(","); + + // 概率的初值等于第一个事件发生的随机概率 + attrValues = new ArrayList<>(); + attrValues.add(array[0].split("=")); + pro = queryConditionPro(attrValues); + + for (int i = 0; i < array.length - 1; i++) { + attrValues.clear(); + + // 下标小的在前面的属于后验属性 + backValue = array[i].split("="); + preValue = array[i + 1].split("="); + attrValues.add(preValue); + attrValues.add(backValue); + + // 算出此种情况的概率值 + temp = queryConditionPro(attrValues); + // 进行积的相乘 + pro *= temp; + } + + return pro; + } + + /** + * 验证事件的查询因果关系是否满足贝叶斯网络 + * + * @param queryStr + * 查询字符串 + * @return + */ + private boolean satisfiedNewWork(String queryStr) { + String attrName; + String[] array; + boolean isExist; + boolean isSatisfied; + // 当前节点 + Node currentNode; + // 候选节点列表 + ArrayList nodeList; + + isSatisfied = true; + currentNode = null; + // 做查询字符串的分解 + array = queryStr.split(","); + nodeList = this.nodes; + + for (String s : array) { + // 开始时默认属性对应的节点不存在 + isExist = false; + // 得到属性事件名 + attrName = s.split("=")[0]; + + for 
(Node n : nodeList) { + if (n.name.equals(attrName)) { + isExist = true; + + currentNode = n; + // 下一轮的候选节点为当前节点的孩子节点 + nodeList = currentNode.childNodes; + + break; + } + } + + // 如果存在未找到的节点,则说明不满足依赖结构跳出循环 + if (!isExist) { + isSatisfied = false; + break; + } + } + + return isSatisfied; + } +} diff --git a/Others/DataMining_BayesNetwork/Client.java b/Others/DataMining_BayesNetwork/Client.java new file mode 100644 index 0000000..98706c4 --- /dev/null +++ b/Others/DataMining_BayesNetwork/Client.java @@ -0,0 +1,32 @@ +package DataMining_BayesNetwork; + +import java.text.MessageFormat; + +/** + * 贝叶斯网络场景测试类 + * + * @author lyq + * + */ +public class Client { + public static void main(String[] args) { + String dataFilePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + String attachFilePath = "C:\\Users\\lyq\\Desktop\\icon\\attach.txt"; + // 查询串语句 + String queryStr; + // 结果概率 + double result; + + // 查询语句的描述的事件是地震发生了,导致响铃响了,导致接到Mary的电话 + queryStr = "E=y,A=y,M=y"; + BayesNetWorkTool tool = new BayesNetWorkTool(dataFilePath, + attachFilePath); + result = tool.calProByNetWork(queryStr); + + if (result == -1) { + System.out.println("所描述的事件不满足贝叶斯网络的结构,无法求其概率"); + } else { + System.out.println(String.format("事件%s发生的概率为%s", queryStr, result)); + } + } +} diff --git a/Others/DataMining_BayesNetwork/Node.java b/Others/DataMining_BayesNetwork/Node.java new file mode 100644 index 0000000..bb2a07d --- /dev/null +++ b/Others/DataMining_BayesNetwork/Node.java @@ -0,0 +1,58 @@ +package DataMining_BayesNetwork; + +import java.util.ArrayList; + +/** + * 贝叶斯网络节点类 + * + * @author lyq + * + */ +public class Node { + // 节点的属性名称 + String name; + // 节点的父亲节点,也就是上游节点,可能多个 + ArrayList parentNodes; + // 节点的子节点,也就是下游节点,可能多个 + ArrayList childNodes; + + public Node(String name) { + this.name = name; + + // 初始化变量 + this.parentNodes = new ArrayList<>(); + this.childNodes = new ArrayList<>(); + } + + /** + * 将自身节点连接到目标给定的节点 + * + * @param node + * 下游节点 + */ + public void connectNode(Node node) { + // 
将下游节点加入自身节点的孩子节点中 + this.childNodes.add(node); + // 将自身节点加入到下游节点的父节点中 + node.parentNodes.add(this); + } + + /** + * 判断与目标节点是否相同,主要比较名称是否相同即可 + * + * @param node + * 目标结点 + * @return + */ + public boolean isEqual(Node node) { + boolean isEqual; + + isEqual = false; + // 节点名称相同则视为相等 + if (this.name.equals(node.name)) { + isEqual = true; + } + + return isEqual; + } +} diff --git a/Others/DataMining_BayesNetwork/attach.txt b/Others/DataMining_BayesNetwork/attach.txt new file mode 100644 index 0000000..bd4bdb6 --- /dev/null +++ b/Others/DataMining_BayesNetwork/attach.txt @@ -0,0 +1,4 @@ +B A +E A +A M +A J \ No newline at end of file diff --git a/Others/DataMining_BayesNetwork/input.txt b/Others/DataMining_BayesNetwork/input.txt new file mode 100644 index 0000000..ed01889 --- /dev/null +++ b/Others/DataMining_BayesNetwork/input.txt @@ -0,0 +1,33 @@ +B E A M J P +y y y y y 0.00012 +y y y y n 0.000051 +y y y n y 0.000013 +y y y n n 0.0000057 +y y n y y 0.000000005 +y y n y n 0.00000049 +y y n n y 0.000000095 +y y n n n 0.0000094 +y n y y y 0.0058 +y n y y n 0.0025 +y n y n y 0.00065 +y n y n n 0.00028 +y n n y y 0.00000029 +y n n y n 0.000029 +y n n n y 0.0000056 +y n n n n 0.00055 +n y y y y 0.0036 +n y y y n 0.0016 +n y y n y 0.0004 +n y y n n 0.00017 +n y n y y 0.000007 +n y n y n 0.00069 +n y n n y 0.00013 +n y n n n 0.013 +n n y y y 0.00061 +n n y y n 0.00026 +n n y n y 0.000068 +n n y n n 0.000029 +n n n y y 0.00048 +n n n y n 0.048 +n n n n y 0.0092 +n n n n n 0.91 \ No newline at end of file diff --git a/Others/DataMining_TAN/AttrMutualInfo.java b/Others/DataMining_TAN/AttrMutualInfo.java new file mode 100644 index 0000000..6caf12d --- /dev/null +++ b/Others/DataMining_TAN/AttrMutualInfo.java @@ -0,0 +1,28 @@ +package DataMining_TAN; + +/** + * 属性之间的互信息值,表示属性之间的关联性大小 + * @author lyq + * + */ +public class AttrMutualInfo implements Comparable{ + //互信息值 + Double value; + //关联属性值对 + Node[] nodeArray; + + public AttrMutualInfo(double value, Node node1, Node node2){ + 
this.value = value; + + this.nodeArray = new Node[2]; + this.nodeArray[0] = node1; + this.nodeArray[1] = node2; + } + + @Override + public int compareTo(AttrMutualInfo o) { + // TODO Auto-generated method stub + return o.value.compareTo(this.value); + } + +} diff --git a/Others/DataMining_TAN/Client.java b/Others/DataMining_TAN/Client.java new file mode 100644 index 0000000..bd104bc --- /dev/null +++ b/Others/DataMining_TAN/Client.java @@ -0,0 +1,36 @@ +package DataMining_TAN; + +/** + * TAN树型朴素贝叶斯算法 + * + * @author lyq + * + */ +public class Client { + public static void main(String[] args) { + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + // 条件查询语句 + String queryStr; + // 分类结果概率1 + double classResult1; + // 分类结果概率2 + double classResult2; + + TANTool tool = new TANTool(filePath); + queryStr = "OutLook=Sunny,Temperature=Hot,Humidity=High,Wind=Weak,PlayTennis=No"; + classResult1 = tool.calHappenedPro(queryStr); + + queryStr = "OutLook=Sunny,Temperature=Hot,Humidity=High,Wind=Weak,PlayTennis=Yes"; + classResult2 = tool.calHappenedPro(queryStr); + + System.out.println(String.format("类别为%s所求得的概率为%s", "PlayTennis=No", + classResult1)); + System.out.println(String.format("类别为%s所求得的概率为%s", "PlayTennis=Yes", + classResult2)); + if (classResult1 > classResult2) { + System.out.println("分类类别为PlayTennis=No"); + } else { + System.out.println("分类类别为PlayTennis=Yes"); + } + } +} diff --git a/Others/DataMining_TAN/Node.java b/Others/DataMining_TAN/Node.java new file mode 100644 index 0000000..f3a3b51 --- /dev/null +++ b/Others/DataMining_TAN/Node.java @@ -0,0 +1,63 @@ +package DataMining_TAN; + +import java.util.ArrayList; + +/** + * 贝叶斯网络节点类 + * + * @author lyq + * + */ +public class Node { + //节点唯一id,方便后面节点连接方向的确定 + int id; + // 节点的属性名称 + String name; + // 该节点所连续的节点 + ArrayList connectedNodes; + + public Node(int id, String name) { + this.id = id; + this.name = name; + + // 初始化变量 + this.connectedNodes = new ArrayList<>(); + } + + /** + * 将自身节点连接到目标给定的节点 + * + * 
@param node + * 下游节点 + */ + public void connectNode(Node node) { + //避免连接自身 + if(this.id == node.id){ + return; + } + + // 将节点加入自身节点的节点列表中 + this.connectedNodes.add(node); + // 将自身节点加入到目标节点的列表中 + node.connectedNodes.add(this); + } + + /** + * 判断与目标节点是否相同,主要比较名称是否相同即可 + * + * @param node + * 目标结点 + * @return + */ + public boolean isEqual(Node node) { + boolean isEqual; + + isEqual = false; + // 节点名称相同则视为相等 + if (this.id == node.id) { + isEqual = true; + } + + return isEqual; + } +} diff --git a/Others/DataMining_TAN/TANTool.java b/Others/DataMining_TAN/TANTool.java new file mode 100644 index 0000000..56e90a6 --- /dev/null +++ b/Others/DataMining_TAN/TANTool.java @@ -0,0 +1,571 @@ +package DataMining_TAN; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + +/** + * TAN树型朴素贝叶斯算法工具类 + * + * @author lyq + * + */ +public class TANTool { + // 测试数据集地址 + private String filePath; + // 数据集属性总数,其中一个个分类属性 + private int attrNum; + // 分类属性名 + private String classAttrName; + // 属性列名称行 + private String[] attrNames; + // 贝叶斯网络边的方向,数组内的数值为节点id,从i->j + private int[][] edges; + // 属性名到列下标的映射 + private HashMap attr2Column; + // 属性,属性对取值集合映射对 + private HashMap> attr2Values; + // 贝叶斯网络总节点列表 + private ArrayList totalNodes; + // 总的测试数据 + private ArrayList totalDatas; + + public TANTool(String filePath) { + this.filePath = filePath; + + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + private void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] array; + + while ((str = in.readLine()) != null) { + array = str.split(" "); + dataArray.add(array); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + this.totalDatas = dataArray; + this.attrNames = this.totalDatas.get(0); + this.attrNum = 
this.attrNames.length; + this.classAttrName = this.attrNames[attrNum - 1]; + + Node node; + this.edges = new int[attrNum][attrNum]; + this.totalNodes = new ArrayList<>(); + this.attr2Column = new HashMap<>(); + this.attr2Values = new HashMap<>(); + + // 分类属性节点id最小设为0 + node = new Node(0, attrNames[attrNum - 1]); + this.totalNodes.add(node); + for (int i = 0; i < attrNames.length; i++) { + if (i < attrNum - 1) { + // 创建贝叶斯网络节点,每个属性一个节点 + node = new Node(i + 1, attrNames[i]); + this.totalNodes.add(node); + } + + // 添加属性到列下标的映射 + this.attr2Column.put(attrNames[i], i); + } + + String[] temp; + ArrayList values; + // 进行属性名,属性值对的映射匹配 + for (int i = 1; i < this.totalDatas.size(); i++) { + temp = this.totalDatas.get(i); + + for (int j = 0; j < temp.length; j++) { + // 判断map中是否包含此属性名 + if (this.attr2Values.containsKey(attrNames[j])) { + values = this.attr2Values.get(attrNames[j]); + } else { + values = new ArrayList<>(); + } + + if (!values.contains(temp[j])) { + // 加入新的属性值 + values.add(temp[j]); + } + + this.attr2Values.put(attrNames[j], values); + } + } + } + + /** + * 根据条件互信息度对构建最大权重跨度树,返回第一个节点为根节点 + * + * @param iArray + */ + private Node constructWeightTree(ArrayList iArray) { + Node node1; + Node node2; + Node root; + ArrayList existNodes; + + existNodes = new ArrayList<>(); + + for (Node[] i : iArray) { + node1 = i[0]; + node2 = i[1]; + + // 将2个节点进行连接 + node1.connectNode(node2); + // 避免出现环路现象 + addIfNotExist(node1, existNodes); + addIfNotExist(node2, existNodes); + + if (existNodes.size() == attrNum - 1) { + break; + } + } + + // 返回第一个作为根节点 + root = existNodes.get(0); + return root; + } + + /** + * 为树型结构确定边的方向,方向为属性根节点方向指向其他属性节点方向 + * + * @param root + * 当前遍历到的节点 + */ + private void confirmGraphDirection(Node currentNode) { + int i; + int j; + ArrayList connectedNodes; + + connectedNodes = currentNode.connectedNodes; + + i = currentNode.id; + for (Node n : connectedNodes) { + j = n.id; + + // 判断连接此2节点的方向是否被确定 + if (edges[i][j] == 0 && edges[j][i] == 0) { + // 
如果没有确定,则制定方向为i->j + edges[i][j] = 1; + + // 递归继续搜索 + confirmGraphDirection(n); + } + } + } + + /** + * 为属性节点添加分类属性节点为父节点 + * + * @param parentNode + * 父节点 + * @param nodeList + * 子节点列表 + */ + private void addParentNode() { + // 分类属性节点 + Node parentNode; + + parentNode = null; + for (Node n : this.totalNodes) { + if (n.id == 0) { + parentNode = n; + break; + } + } + + for (Node child : this.totalNodes) { + parentNode.connectNode(child); + + if (child.id != 0) { + // 确定连接方向 + this.edges[0][child.id] = 1; + } + } + } + + /** + * 在节点集合中添加节点 + * + * @param node + * 待添加节点 + * @param existNodes + * 已存在的节点列表 + * @return + */ + public boolean addIfNotExist(Node node, ArrayList existNodes) { + boolean canAdd; + + canAdd = true; + for (Node n : existNodes) { + // 如果节点列表中已经含有节点,则算添加失败 + if (n.isEqual(node)) { + canAdd = false; + break; + } + } + + if (canAdd) { + existNodes.add(node); + } + + return canAdd; + } + + /** + * 计算节点条件概率 + * + * @param node + * 关于node的后验概率 + * @param queryParam + * 查询的属性参数 + * @return + */ + private double calConditionPro(Node node, HashMap queryParam) { + int id; + double pro; + String value; + String[] attrValue; + + ArrayList priorAttrInfos; + ArrayList backAttrInfos; + ArrayList parentNodes; + + pro = 1; + id = node.id; + parentNodes = new ArrayList<>(); + priorAttrInfos = new ArrayList<>(); + backAttrInfos = new ArrayList<>(); + + for (int i = 0; i < this.edges.length; i++) { + // 寻找父节点id + if (this.edges[i][id] == 1) { + for (Node temp : this.totalNodes) { + // 寻找目标节点id + if (temp.id == i) { + parentNodes.add(temp); + break; + } + } + } + } + + // 获取先验属性的属性值,首先添加先验属性 + value = queryParam.get(node.name); + attrValue = new String[2]; + attrValue[0] = node.name; + attrValue[1] = value; + priorAttrInfos.add(attrValue); + + // 逐一添加后验属性 + for (Node p : parentNodes) { + value = queryParam.get(p.name); + attrValue = new String[2]; + attrValue[0] = p.name; + attrValue[1] = value; + + backAttrInfos.add(attrValue); + } + + pro = 
queryConditionPro(priorAttrInfos, backAttrInfos); + + return pro; + } + + /** + * 查询条件概率 + * + * @param attrValues + * 条件属性值 + * @return + */ + private double queryConditionPro(ArrayList priorValues, + ArrayList backValues) { + // 判断是否满足先验属性值条件 + boolean hasPrior; + // 判断是否满足后验属性值条件 + boolean hasBack; + int attrIndex; + double backPro; + double totalPro; + double pro; + String[] tempData; + + pro = 0; + totalPro = 0; + backPro = 0; + + // 跳过第一行的属性名称行 + for (int i = 1; i < this.totalDatas.size(); i++) { + tempData = this.totalDatas.get(i); + + hasPrior = true; + hasBack = true; + + // 判断是否满足先验条件 + for (String[] array : priorValues) { + attrIndex = this.attr2Column.get(array[0]); + + // 判断值是否满足条件 + if (!tempData[attrIndex].equals(array[1])) { + hasPrior = false; + break; + } + } + + // 判断是否满足后验条件 + for (String[] array : backValues) { + attrIndex = this.attr2Column.get(array[0]); + + // 判断值是否满足条件 + if (!tempData[attrIndex].equals(array[1])) { + hasBack = false; + break; + } + } + + // 进行计数统计,分别计算满足后验属性的值和同时满足条件的个数 + if (hasBack) { + backPro++; + if (hasPrior) { + totalPro++; + } + } else if (hasPrior && backValues.size() == 0) { + // 如果只有先验概率则为纯概率的计算 + totalPro++; + backPro = 1.0; + } + } + + if (backPro == 0) { + pro = 0; + } else { + // 计算总的概率=都发生概率/只发生后验条件的时间概率 + pro = totalPro / backPro; + } + + return pro; + } + + /** + * 输入查询条件参数,计算发生概率 + * + * @param queryParam + * 条件参数 + * @return + */ + public double calHappenedPro(String queryParam) { + double result; + double temp; + // 分类属性值 + String classAttrValue; + String[] array; + String[] array2; + HashMap params; + + result = 1; + params = new HashMap<>(); + + // 进行查询字符的参数分解 + array = queryParam.split(","); + for (String s : array) { + array2 = s.split("="); + params.put(array2[0], array2[1]); + } + + classAttrValue = params.get(classAttrName); + // 构建贝叶斯网络结构 + constructBayesNetWork(classAttrValue); + + for (Node n : this.totalNodes) { + temp = calConditionPro(n, params); + + // 为了避免出现条件概率为0的现象,进行轻微矫正 + if (temp == 
0) { + temp = 0.001; + } + + // 按照联合概率公式,进行乘积运算 + result *= temp; + } + + return result; + } + + /** + * 构建树型贝叶斯网络结构 + * + * @param value + * 类别量值 + */ + private void constructBayesNetWork(String value) { + Node rootNode; + ArrayList mInfoArray; + // 互信息度对 + ArrayList iArray; + + iArray = null; + rootNode = null; + + // 在每次重新构建贝叶斯网络结构的时候,清空原有的连接结构 + for (Node n : this.totalNodes) { + n.connectedNodes.clear(); + } + this.edges = new int[attrNum][attrNum]; + + // 从互信息对象中取出属性值对 + iArray = new ArrayList<>(); + mInfoArray = calAttrMutualInfoArray(value); + for (AttrMutualInfo v : mInfoArray) { + iArray.add(v.nodeArray); + } + + // 构建最大权重跨度树 + rootNode = constructWeightTree(iArray); + // 为无向图确定边的方向 + confirmGraphDirection(rootNode); + // 为每个属性节点添加分类属性父节点 + addParentNode(); + } + + /** + * 给定分类变量值,计算属性之间的互信息值 + * + * @param value + * 分类变量值 + * @return + */ + private ArrayList calAttrMutualInfoArray(String value) { + double iValue; + Node node1; + Node node2; + AttrMutualInfo mInfo; + ArrayList mInfoArray; + + mInfoArray = new ArrayList<>(); + + for (int i = 0; i < this.totalNodes.size() - 1; i++) { + node1 = this.totalNodes.get(i); + // 跳过分类属性节点 + if (node1.id == 0) { + continue; + } + + for (int j = i + 1; j < this.totalNodes.size(); j++) { + node2 = this.totalNodes.get(j); + // 跳过分类属性节点 + if (node2.id == 0) { + continue; + } + + // 计算2个属性节点之间的互信息值 + iValue = calMutualInfoValue(node1, node2, value); + mInfo = new AttrMutualInfo(iValue, node1, node2); + mInfoArray.add(mInfo); + } + } + + // 将结果进行降序排列,让互信息值高的优先用于构建树 + Collections.sort(mInfoArray); + + return mInfoArray; + } + + /** + * 计算2个属性节点的互信息值 + * + * @param node1 + * 节点1 + * @param node2 + * 节点2 + * @param vlaue + * 分类变量值 + */ + private double calMutualInfoValue(Node node1, Node node2, String value) { + double iValue; + double temp; + // 三种不同条件的后验概率 + double pXiXj; + double pXi; + double pXj; + String[] array1; + String[] array2; + ArrayList attrValues1; + ArrayList attrValues2; + ArrayList priorValues; + // 
后验概率,在这里就是类变量值 + ArrayList backValues; + + array1 = new String[2]; + array2 = new String[2]; + priorValues = new ArrayList<>(); + backValues = new ArrayList<>(); + + iValue = 0; + array1[0] = classAttrName; + array1[1] = value; + // 后验属性都是类属性 + backValues.add(array1); + + // 获取节点属性的属性值集合 + attrValues1 = this.attr2Values.get(node1.name); + attrValues2 = this.attr2Values.get(node2.name); + + for (String v1 : attrValues1) { + for (String v2 : attrValues2) { + priorValues.clear(); + + array1 = new String[2]; + array1[0] = node1.name; + array1[1] = v1; + priorValues.add(array1); + + array2 = new String[2]; + array2[0] = node2.name; + array2[1] = v2; + priorValues.add(array2); + + // 计算3种条件下的概率 + pXiXj = queryConditionPro(priorValues, backValues); + + priorValues.clear(); + priorValues.add(array1); + pXi = queryConditionPro(priorValues, backValues); + + priorValues.clear(); + priorValues.add(array2); + pXj = queryConditionPro(priorValues, backValues); + + // 如果出现其中一个计数概率为0,则直接赋值为0处理 + if (pXiXj == 0 || pXi == 0 || pXj == 0) { + temp = 0; + } else { + // 利用公式计算针对此属性值对组合的概率 + temp = pXiXj * Math.log(pXiXj / (pXi * pXj)) / Math.log(2); + } + + // 进行和属性值对组合的累加即为整个属性的互信息值 + iValue += temp; + } + } + + return iValue; + } +} diff --git a/Others/DataMining_TAN/input.txt b/Others/DataMining_TAN/input.txt new file mode 100644 index 0000000..aea7074 --- /dev/null +++ b/Others/DataMining_TAN/input.txt @@ -0,0 +1,15 @@ +OutLook Temperature Humidity Wind PlayTennis +Sunny Hot High Weak No +Sunny Hot High Strong No +Overcast Hot High Weak Yes +Rainy Mild High Weak Yes +Rainy Cool Normal Weak Yes +Rainy Cool Normal Strong No +Overcast Cool Normal Strong Yes +Sunny Mild High Weak No +Sunny Cool Normal Weak Yes +Rainy Mild Normal Weak Yes +Sunny Mild Normal Strong Yes +Overcast Mild High Strong Yes +Overcast Hot Normal Weak Yes +Rainy Mild High Strong No \ No newline at end of file diff --git a/Others/DataMining_Viterbi/BaseNames.java b/Others/DataMining_Viterbi/BaseNames.java new file 
mode 100644
index 0000000..cca0aaa
--- /dev/null
+++ b/Others/DataMining_Viterbi/BaseNames.java
@@ -0,0 +1,24 @@
+package DataMining_Viterbi;
+
+/**
+ * Index constants for the Viterbi example: day, weather-state and humidity-observation ordinals.
+ * Values within each group are distinct, zero-based ordinals.
+ * @author lyq
+ */
+public class BaseNames {
+    // Day indices (DAY1 is used by ViterbiTool as the first row of its probability table)
+    public static final int DAY1 = 0;
+    public static final int DAY2 = 1;
+    public static final int DAY3 = 2;
+
+    // Weather state categories
+    public static final int WEATHER_SUNNY = 0;
+    public static final int WEATHER_CLOUDY = 1;
+    public static final int WEATHER_RAINY = 2;
+
+    // Humidity observation categories; every category needs its own value
+    public static final int HUMIDITY_DRY = 0;
+    public static final int HUMIDITY_DRYISH = 1;
+    public static final int HUMIDITY_DAMP = 2;
+    public static final int HUMIDITY_SOGGY = 3;
+}
diff --git a/Others/DataMining_Viterbi/Client.java b/Others/DataMining_Viterbi/Client.java
new file mode 100644
index 0000000..577eabd
--- /dev/null
+++ b/Others/DataMining_Viterbi/Client.java
@@ -0,0 +1,31 @@
+package DataMining_Viterbi;
+
+/**
+ * Viterbi algorithm demo: builds a ViterbiTool from two matrix files and runs it.
+ *
+ * @author lyq
+ *
+ */
+public class Client {
+    public static void main(String[] args) {
+        // Path of the state-transition probability matrix file
+        String stmFilePath;
+        // Path of the confusion matrix file
+        String cfFilePath;
+        // Observed state sequence
+        String[] observeStates;
+        // Initial state probabilities
+        double[] initStatePro;
+        ViterbiTool tool;
+
+        stmFilePath = "C:\\Users\\lyq\\Desktop\\icon\\stmatrix.txt";
+        cfFilePath = "C:\\Users\\lyq\\Desktop\\icon\\humidity-matrix.txt";
+
+        initStatePro = new double[] { 0.63, 0.17, 0.20 };
+        observeStates = new String[] { "Dry", "Damp", "Soggy" };
+
+        tool = new ViterbiTool(stmFilePath, cfFilePath, initStatePro,
+                observeStates);
+        tool.calHMMObserve();
+    }
+}
diff --git a/Others/DataMining_Viterbi/ViterbiTool.java b/Others/DataMining_Viterbi/ViterbiTool.java
new file mode 100644
index 0000000..6f1ade6
--- /dev/null
+++ b/Others/DataMining_Viterbi/ViterbiTool.java
@@ -0,0 +1,240 @@
+package DataMining_Viterbi;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import 
java.util.Map; + +/** + * 维特比算法工具类 + * + * @author lyq + * + */ +public class ViterbiTool { + // 状态转移概率矩阵文件地址 + private String stmFilePath; + // 混淆矩阵文件地址 + private String confusionFilePath; + // 初始状态概率 + private double[] initStatePro; + // 观察到的状态序列 + public String[] observeStates; + // 状态转移矩阵值 + private double[][] stMatrix; + // 混淆矩阵值 + private double[][] confusionMatrix; + // 各个条件下的潜在特征概率值 + private double[][] potentialValues; + // 潜在特征 + private ArrayList potentialAttrs; + // 属性值列坐标映射图 + private HashMap name2Index; + // 列坐标属性值映射图 + private HashMap index2name; + + public ViterbiTool(String stmFilePath, String confusionFilePath, + double[] initStatePro, String[] observeStates) { + this.stmFilePath = stmFilePath; + this.confusionFilePath = confusionFilePath; + this.initStatePro = initStatePro; + this.observeStates = observeStates; + + initOperation(); + } + + /** + * 初始化数据操作 + */ + private void initOperation() { + double[] temp; + int index; + ArrayList smtDatas; + ArrayList cfDatas; + + smtDatas = readDataFile(stmFilePath); + cfDatas = readDataFile(confusionFilePath); + + index = 0; + this.stMatrix = new double[smtDatas.size()][]; + for (String[] array : smtDatas) { + temp = new double[array.length]; + for (int i = 0; i < array.length; i++) { + try { + temp[i] = Double.parseDouble(array[i]); + } catch (NumberFormatException e) { + temp[i] = -1; + } + } + + // 将转换后的值赋给数组中 + this.stMatrix[index] = temp; + index++; + } + + index = 0; + this.confusionMatrix = new double[cfDatas.size()][]; + for (String[] array : cfDatas) { + temp = new double[array.length]; + for (int i = 0; i < array.length; i++) { + try { + temp[i] = Double.parseDouble(array[i]); + } catch (NumberFormatException e) { + temp[i] = -1; + } + } + + // 将转换后的值赋给数组中 + this.confusionMatrix[index] = temp; + index++; + } + + this.potentialAttrs = new ArrayList<>(); + // 添加潜在特征属性 + for (String s : smtDatas.get(0)) { + this.potentialAttrs.add(s); + } + // 去除首列无效列 + potentialAttrs.remove(0); + + this.name2Index = 
new HashMap<>(); + this.index2name = new HashMap<>(); + + // 添加名称下标映射关系 + for (int i = 1; i < smtDatas.get(0).length; i++) { + this.name2Index.put(smtDatas.get(0)[i], i); + // 添加下标到名称的映射 + this.index2name.put(i, smtDatas.get(0)[i]); + } + + for (int i = 1; i < cfDatas.get(0).length; i++) { + this.name2Index.put(cfDatas.get(0)[i], i); + } + } + + /** + * 从文件中读取数据 + */ + private ArrayList readDataFile(String filePath) { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + return dataArray; + } + + /** + * 根据观察特征计算隐藏的特征概率矩阵 + */ + private void calPotencialProMatrix() { + String curObserveState; + // 观察特征和潜在特征的下标 + int osIndex; + int psIndex; + double temp; + double maxPro; + // 混淆矩阵概率值,就是相关影响的因素概率 + double confusionPro; + + this.potentialValues = new double[observeStates.length][potentialAttrs + .size() + 1]; + for (int i = 0; i < this.observeStates.length; i++) { + curObserveState = this.observeStates[i]; + osIndex = this.name2Index.get(curObserveState); + maxPro = -1; + + // 因为是第一个观察特征,没有前面的影响,根据初始状态计算 + if (i == 0) { + for (String attr : this.potentialAttrs) { + psIndex = this.name2Index.get(attr); + confusionPro = this.confusionMatrix[psIndex][osIndex]; + + temp = this.initStatePro[psIndex - 1] * confusionPro; + this.potentialValues[BaseNames.DAY1][psIndex] = temp; + } + } else { + // 后面的潜在特征受前一个特征的影响,以及当前的混淆因素影响 + for (String toDayAttr : this.potentialAttrs) { + psIndex = this.name2Index.get(toDayAttr); + confusionPro = this.confusionMatrix[psIndex][osIndex]; + + int index; + maxPro = -1; + // 通过昨天的概率计算今天此特征的最大概率 + for (String yAttr : this.potentialAttrs) { + index = this.name2Index.get(yAttr); + temp = this.potentialValues[i - 1][index] + * 
this.stMatrix[index][psIndex]; + + // 计算得到今天此潜在特征的最大概率 + if (temp > maxPro) { + maxPro = temp; + } + } + + this.potentialValues[i][psIndex] = maxPro * confusionPro; + } + } + } + } + + /** + * 根据同时期最大概率值输出潜在特征值 + */ + private void outputResultAttr() { + double maxPro; + int maxIndex; + ArrayList psValues; + + psValues = new ArrayList<>(); + for (int i = 0; i < this.potentialValues.length; i++) { + maxPro = -1; + maxIndex = 0; + + for (int j = 0; j < potentialValues[i].length; j++) { + if (this.potentialValues[i][j] > maxPro) { + maxPro = potentialValues[i][j]; + maxIndex = j; + } + } + + // 取出最大概率下标对应的潜在特征 + psValues.add(this.index2name.get(maxIndex)); + } + + System.out.println("观察特征为:"); + for (String s : this.observeStates) { + System.out.print(s + ", "); + } + System.out.println(); + + System.out.println("潜在特征为:"); + for (String s : psValues) { + System.out.print(s + ", "); + } + System.out.println(); + } + + /** + * 根据观察属性,得到潜在属性信息 + */ + public void calHMMObserve() { + calPotencialProMatrix(); + outputResultAttr(); + } +} diff --git a/Others/DataMining_Viterbi/humidity-matrix.txt b/Others/DataMining_Viterbi/humidity-matrix.txt new file mode 100644 index 0000000..ff41df6 --- /dev/null +++ b/Others/DataMining_Viterbi/humidity-matrix.txt @@ -0,0 +1,4 @@ +# Dry Dryish Damp Soggy +Sunny 0.6 0.2 0.15 0.05 +Cloudy 0.25 0.25 0.25 0.25 +Rainy 0.05 0.10 0.35 0.50 \ No newline at end of file diff --git a/Others/DataMining_Viterbi/stmatrix.txt b/Others/DataMining_Viterbi/stmatrix.txt new file mode 100644 index 0000000..af66956 --- /dev/null +++ b/Others/DataMining_Viterbi/stmatrix.txt @@ -0,0 +1,4 @@ +# Sunny Cloudy Rainy +Sunny 0.5 0.375 0.125 +Cloudy 0.25 0.125 0.625 +Rainy 0.25 0.375 0.375 \ No newline at end of file diff --git a/README.md b/README.md index 02100cd..7dd82f0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,45 @@ # 鏁版嵁鎸栨帢绠楁硶 -##18澶х粡鍏窪M绠楁硶 +## 绠楁硶鐩綍 +#### 18澶M绠楁硶 +鍖呭悕 | 鐩綍鍚 | 绠楁硶鍚 | +-----| ------ |--------| +AssociationAnalysis | DataMining_Apriori 
| Apriori-关联规则挖掘算法 +AssociationAnalysis | DataMining_FPTree | FPTree-频繁模式树算法 +BaggingAndBoosting | DataMining_AdaBoost | AdaBoost-装袋提升算法 +Classification | DataMining_CART | CART-分类回归树算法 +Classification | DataMining_ID3 | ID3-决策树分类算法 +Classification | DataMining_KNN | KNN-k最近邻算法工具类 +Classification | DataMining_NaiveBayes | NaiveBayes-朴素贝叶斯算法 +Clustering | DataMining_BIRCH | BIRCH-层次聚类算法 +Clustering | DataMining_KMeans | KMeans-K均值算法 +GraphMining | DataMining_GSpan | GSpan-频繁子图挖掘算法 +IntegratedMining | DataMining_CBA | CBA-基于关联规则的分类算法 +LinkMining | DataMining_HITS | HITS-链接分析算法 +LinkMining | DataMining_PageRank | PageRank-网页重要性/排名算法 +RoughSets | DataMining_RoughSets | RoughSets-粗糙集属性约简算法 +SequentialPatterns | DataMining_GSP | GSP-序列模式分析算法 +SequentialPatterns | DataMining_PrefixSpan | PrefixSpan-序列模式分析算法 +StatisticalLearning | DataMining_EM | EM-期望最大化算法 +StatisticalLearning | DataMining_SVM | SVM-支持向量机算法 + +#### 其他经典DM算法 +包名 | 目录名 | 算法名 | +-----| ------ |--------| +Others | DataMining_ACO | ACO-蚁群算法 +Others | DataMining_BayesNetwork | BayesNetwork-贝叶斯网络算法 +Others | DataMining_CABDDCC | CABDDCC-基于连通图的分裂聚类算法 +Others | DataMining_Chameleon | Chameleon-两阶段合并聚类算法 +Others | DataMining_DBSCAN | DBSCAN-基于密度的聚类算法 +Others | DataMining_GA | GA-遗传算法 +Others | DataMining_GA_Maze | GA_Maze-遗传算法在走迷宫游戏中的应用算法 +Others | DataMining_KDTree | KDTree-k维空间关键数据检索算法工具类 +Others | DataMining_MSApriori | MSApriori-基于多支持度的Apriori算法 +Others | DataMining_RandomForest | RandomForest-随机森林算法 +Others | DataMining_TAN | TAN-树型朴素贝叶斯算法 +Others | DataMining_Viterbi | Viterbi-维特比算法 + +## 18大经典DM算法 18大数据挖掘的经典算法以及代码实现，涉及到了决策分类，聚类，链接挖掘，关联挖掘，模式挖掘等等方面,后面都是相应算法的博文链接，希望能够帮助大家学。 目前追加了其他的一些经典的DM算法，在others的包中涉及聚类，分类，图算法，搜索算等等，没有具体分类。 @@ -86,3 +125,18 @@ K-Dimension Tree。多维空间划分树，数据在多维空间进行划分与 * ### ACO 
蚁群算法。蚁群算法又称为蚂蚁算法。同GA遗传算法类似，也是运用了大自然规律的算法，用于在图中寻找最优路径的概率型算法。灵感来源于蚂蚁在寻找食物时会散播信息素的发现路径行为。[详细介绍链接](http://blog.csdn.net/androidlushangderen/article/details/45395491) + +* ### BayesNetwork +贝叶斯网络算法。弥补了朴素贝叶斯算法中必须要事件独立性的缺点，利用了贝叶斯网络的DAG有向无环图，允许各个事件保留一定的依赖关系，网络结构中的每个节点代表一种属性，边代表相应的条件概率值，通过计算从而能得到精准的分类效果。[详细介绍链接](http://blog.csdn.net/androidlushangderen/article/details/46683729) + +* ### TAN +树型朴素贝叶斯算法。此算法又被称为加强版朴素贝叶斯算法。在满足原有朴素贝叶斯条件的基础上，他允许部条件属性直接的关联性。形成树型的结构。[详细介绍链接](http://blog.csdn.net/androidlushangderen/article/details/46763427) + +* ### Viterbi +维特比算法。给定一个隐马尔科夫模型以及一个观察序列，求出潜在的状态序列信息，每个潜在状态信息又会受到前一个状态信息的影响。 + +## 算法使用方法 +在每个算法中给出了3大类型，主算法程序，调用程序，输入数据，调用方法如下： +* 将需要数据的测试数据转化成与给定的输入格式相同 +* 然后以Client类的测试程序调用方式进行使用。 +* 也可以自行修改算法程序，来适用于自己的使用场景