diff --git a/Others/DataMining_ACO/ACOTool.java b/Others/DataMining_ACO/ACOTool.java new file mode 100644 index 0000000..21b9760 --- /dev/null +++ b/Others/DataMining_ACO/ACOTool.java @@ -0,0 +1,351 @@ +package DataMining_ACO; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +/** + * 蚁群算法工具类 + * + * @author lyq + * + */ +public class ACOTool { + // 输入数据类型 + public static final int INPUT_CITY_NAME = 1; + public static final int INPUT_CITY_DIS = 2; + + // 城市间距离邻接矩阵 + public static double[][] disMatrix; + // 当前时间 + public static int currentTime; + + // 测试数据地址 + private String filePath; + // 蚂蚁数量 + private int antNum; + // 控制参数 + private double alpha; + private double beita; + private double p; + private double Q; + // 随机数产生器 + private Random random; + // 城市名称集合,这里为了方便,将城市用数字表示 + private ArrayList totalCitys; + // 所有的蚂蚁集合 + private ArrayList totalAnts; + // 城市间的信息素浓度矩阵,随着时间的增多而减少 + private double[][] pheromoneMatrix; + // 目标的最短路径,顺序为从集合的前部往后挪动 + private ArrayList bestPath; + // 信息素矩阵存储图,key采用的格式(i,j,t)->value + private Map pheromoneTimeMap; + + public ACOTool(String filePath, int antNum, double alpha, double beita, + double p, double Q) { + this.filePath = filePath; + this.antNum = antNum; + this.alpha = alpha; + this.beita = beita; + this.p = p; + this.Q = Q; + this.currentTime = 0; + + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + private void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + int flag = -1; + int src = 0; + int des = 0; + int size = 0; + // 进行城市名称种数的统计 + this.totalCitys = new ArrayList<>(); + for (String[] array : dataArray) { + if (array[0].equals("#") && totalCitys.size() == 0) { + flag = INPUT_CITY_NAME; + + continue; + } else if (array[0].equals("#") && totalCitys.size() > 0) { + size = totalCitys.size(); + // 初始化距离矩阵 + this.disMatrix = new double[size + 1][size + 1]; + this.pheromoneMatrix = new double[size + 1][size + 1]; + + // 初始值-1代表此对应位置无值 + for (int i = 0; i < size; i++) { + for (int j = 0; j < size; j++) { + this.disMatrix[i][j] = -1; + this.pheromoneMatrix[i][j] = -1; + } + } + + flag = INPUT_CITY_DIS; + continue; + } + + if (flag == INPUT_CITY_NAME) { + this.totalCitys.add(array[0]); + } else { + src = Integer.parseInt(array[0]); + des = Integer.parseInt(array[1]); + + this.disMatrix[src][des] = Double.parseDouble(array[2]); + this.disMatrix[des][src] = Double.parseDouble(array[2]); + } + } + } + + /** + * 计算从蚂蚁城市i到j的概率 + * + * @param cityI + * 城市I + * @param cityJ + * 城市J + * @param currentTime + * 当前时间 + * @return + */ + private double calIToJProbably(String cityI, String cityJ, int currentTime) { + double pro = 0; + double n = 0; + double pheromone; + int i; + int j; + + i = Integer.parseInt(cityI); + j = Integer.parseInt(cityJ); + + pheromone = getPheromone(currentTime, cityI, cityJ); + n = 1.0 / disMatrix[i][j]; + + if (pheromone == 0) { + pheromone = 1; + } + + pro = Math.pow(n, alpha) * Math.pow(pheromone, beita); + + return pro; + } + + /** + * 计算综合概率蚂蚁从I城市走到J城市的概率 + * + * @return + */ + public 
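+ // Note: in the standard Ant System transition rule, p(i,j) ∝ tau(i,j)^alpha * eta(i,j)^beta,
+ // where eta = 1 / distance. calIToJProbably above raises eta to alpha and the pheromone
+ // to beita, so the two exponents are swapped relative to the usual notation; alpha and
+ // beita trade roles accordingly when tuning this implementation.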
String selectAntNextCity(Ant ant, int currentTime) { + double randomNum; + double tempPro; + // 总概率指数 + double proTotal; + String nextCity = null; + ArrayList allowedCitys; + // 各城市概率集 + double[] proArray; + + // 如果是刚刚开始的时候,没有路过任何城市,则随机返回一个城市 + if (ant.currentPath.size() == 0) { + nextCity = String.valueOf(random.nextInt(totalCitys.size()) + 1); + + return nextCity; + } else if (ant.nonVisitedCitys.isEmpty()) { + // 如果全部遍历完毕,则再次回到起点 + nextCity = ant.currentPath.get(0); + + return nextCity; + } + + proTotal = 0; + allowedCitys = ant.nonVisitedCitys; + proArray = new double[allowedCitys.size()]; + + for (int i = 0; i < allowedCitys.size(); i++) { + nextCity = allowedCitys.get(i); + proArray[i] = calIToJProbably(ant.currentPos, nextCity, currentTime); + proTotal += proArray[i]; + } + + for (int i = 0; i < allowedCitys.size(); i++) { + // 归一化处理 + proArray[i] /= proTotal; + } + + // 用随机数选择下一个城市 + randomNum = random.nextInt(100) + 1; + randomNum = randomNum / 100; + // 因为1.0是无法判断到的,,总和会无限接近1.0取为0.99做判断 + if (randomNum == 1) { + randomNum = randomNum - 0.01; + } + + tempPro = 0; + // 确定区间 + for (int j = 0; j < allowedCitys.size(); j++) { + if (randomNum > tempPro && randomNum <= tempPro + proArray[j]) { + // 采用拷贝的方式避免引用重复 + nextCity = allowedCitys.get(j); + break; + } else { + tempPro += proArray[j]; + } + } + + return nextCity; + } + + /** + * 获取给定时间点上从城市i到城市j的信息素浓度 + * + * @param t + * @param cityI + * @param cityJ + * @return + */ + private double getPheromone(int t, String cityI, String cityJ) { + double pheromone = 0; + String key; + + // 上一周期需将时间倒回一周期 + key = MessageFormat.format("{0},{1},{2}", cityI, cityJ, t); + + if (pheromoneTimeMap.containsKey(key)) { + pheromone = pheromoneTimeMap.get(key); + } + + return pheromone; + } + + /** + * 每轮结束,刷新信息素浓度矩阵 + * + * @param t + */ + private void refreshPheromone(int t) { + double pheromone = 0; + // 上一轮周期结束后的信息素浓度,丛信息素浓度图中查找 + double lastTimeP = 0; + // 本轮信息素浓度增加量 + double addPheromone; + String key; + + for (String i : totalCitys) { + for (String j : totalCitys) { + if (!i.equals(j)) { + // 上一周期需将时间倒回一周期 + key = MessageFormat.format("{0},{1},{2}", i, j, t - 1); + + if (pheromoneTimeMap.containsKey(key)) { + lastTimeP = pheromoneTimeMap.get(key); + } else { + lastTimeP = 0; + } + + addPheromone = 0; + for (Ant ant : totalAnts) { + if(ant.pathContained(i, j)){ + // 每只蚂蚁传播的信息素为控制因子除以距离总成本 + addPheromone += Q / ant.calSumDistance(); + } + } + + // 将上次的结果值加上递增的量,并存入图中 + pheromone = p * lastTimeP + addPheromone; + key = MessageFormat.format("{0},{1},{2}", i, j, t); + pheromoneTimeMap.put(key, pheromone); + } + } + } + + } + + /** + * 蚁群算法迭代次数 + * @param loopCount + * 具体遍历次数 + */ + public void antStartSearching(int loopCount) { + // 蚁群寻找的总次数 + int count = 0; + // 选中的下一个城市 + String selectedCity = ""; + + pheromoneTimeMap = new HashMap(); + totalAnts = new ArrayList<>(); + random = new Random(); + + while (count < loopCount) { + initAnts(); + + while (true) { + for (Ant ant : totalAnts) { + selectedCity = selectAntNextCity(ant, currentTime); + ant.goToNextCity(selectedCity); + } + + // 如果已经遍历完所有城市,则跳出此轮循环 + if (totalAnts.get(0).isBack()) { + break; + } + } + + // 周期时间叠加 + currentTime++; + refreshPheromone(currentTime); + count++; + } + + // 根据距离成本,选出所花距离最短的一个路径 + Collections.sort(totalAnts); + bestPath = totalAnts.get(0).currentPath; + System.out.println(MessageFormat.format("经过{0}次循环遍历,最终得出的最佳路径:", count)); + System.out.print("entrance"); + for (String cityName : bestPath) { + System.out.print(MessageFormat.format("-->{0}", cityName)); + } + } + + /** + 
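+ * Each ant receives its own shallow copy of the full city list as its
+ * not-yet-visited set (see the clone below), so the ants can consume
+ * cities independently within a single iteration.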
* 初始化蚁群操作 + */ + private void initAnts() { + Ant tempAnt; + ArrayList nonVisitedCitys; + totalAnts.clear(); + + // 初始化蚁群 + for (int i = 0; i < antNum; i++) { + nonVisitedCitys = (ArrayList) totalCitys.clone(); + tempAnt = new Ant(pheromoneMatrix, nonVisitedCitys); + + totalAnts.add(tempAnt); + } + } +} diff --git a/Others/DataMining_ACO/Ant.java b/Others/DataMining_ACO/Ant.java new file mode 100644 index 0000000..fd89c71 --- /dev/null +++ b/Others/DataMining_ACO/Ant.java @@ -0,0 +1,125 @@ +package DataMining_ACO; + +import java.util.ArrayList; + +/** + * 蚂蚁类,进行路径搜索的载体 + * + * @author lyq + * + */ +public class Ant implements Comparable { + // 蚂蚁当前所在城市 + String currentPos; + // 蚂蚁遍历完回到原点所用的总距离 + Double sumDistance; + // 城市间的信息素浓度矩阵,随着时间的增多而减少 + double[][] pheromoneMatrix; + // 蚂蚁已经走过的城市集合 + ArrayList visitedCitys; + // 还未走过的城市集合 + ArrayList nonVisitedCitys; + // 蚂蚁当前走过的路径 + ArrayList currentPath; + + public Ant(double[][] pheromoneMatrix, ArrayList nonVisitedCitys) { + this.pheromoneMatrix = pheromoneMatrix; + this.nonVisitedCitys = nonVisitedCitys; + + this.visitedCitys = new ArrayList<>(); + this.currentPath = new ArrayList<>(); + } + + /** + * 计算路径的总成本(距离) + * + * @return + */ + public double calSumDistance() { + sumDistance = 0.0; + String lastCity; + String currentCity; + + for (int i = 0; i < currentPath.size() - 1; i++) { + lastCity = currentPath.get(i); + currentCity = currentPath.get(i + 1); + + // 通过距离矩阵进行计算 + sumDistance += ACOTool.disMatrix[Integer.parseInt(lastCity)][Integer + .parseInt(currentCity)]; + } + + return sumDistance; + } + + /** + * 蚂蚁选择前往下一个城市 + * + * @param city + * 所选的城市 + */ + public void goToNextCity(String city) { + this.currentPath.add(city); + this.currentPos = city; + this.nonVisitedCitys.remove(city); + this.visitedCitys.add(city); + } + + /** + * 判断蚂蚁是否已经又重新回到起点 + * + * @return + */ + public boolean isBack() { + boolean isBack = false; + String startPos; + String endPos; + + if (currentPath.size() == 0) { + return isBack; + } + + startPos = currentPath.get(0); + endPos = currentPath.get(currentPath.size() - 1); + if (currentPath.size() > 1 && startPos.equals(endPos)) { + isBack = true; + } + + return isBack; + } + + /** + * 判断蚂蚁在本次的走过的路径中是否包含从城市i到城市j + * + * @param cityI + * 城市I + * @param cityJ + * 城市J + * @return + */ + public boolean pathContained(String cityI, String cityJ) { + String lastCity; + String currentCity; + boolean isContained = false; + + for (int i = 0; i < currentPath.size() - 1; i++) { + lastCity = currentPath.get(i); + currentCity = currentPath.get(i + 1); + + // 如果某一段路径的始末位置一致,则认为有经过此城市 + if ((lastCity.equals(cityI) && currentCity.equals(cityJ)) + || (lastCity.equals(cityJ) && currentCity.equals(cityI))) { + isContained = true; + break; + } + } + + return isContained; + } + + @Override + public int compareTo(Ant o) { + // TODO Auto-generated method stub + return this.sumDistance.compareTo(o.sumDistance); + } +} diff --git a/Others/DataMining_ACO/Client.java b/Others/DataMining_ACO/Client.java new file mode 100644 index 0000000..0e9ede9 --- /dev/null +++ b/Others/DataMining_ACO/Client.java @@ -0,0 +1,32 @@ +package DataMining_ACO; + +/** + * 蚁群算法测试类 + * @author lyq + * + */ +public class Client { + public static void main(String[] args){ + //测试数据 + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + //蚂蚁数量 + int antNum; + //蚁群算法迭代次数 + int loopCount; + //控制参数 + double alpha; + double beita; + double p; + double Q; + + antNum = 3; + alpha = 0.5; + beita = 1; + p = 0.5; + Q = 5; + loopCount = 5; + + ACOTool tool = new 
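+ // Parameter sketch: refreshPheromone computes p * previousPheromone + deposits,
+ // so p is the retention factor (evaporation is 1 - p), and Q scales each
+ // ant's deposit Q / tourLength on the edges it travelled.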
ACOTool(filePath, antNum, alpha, beita, p, Q); + tool.antStartSearching(loopCount); + } +} diff --git a/Others/DataMining_ACO/input.txt b/Others/DataMining_ACO/input.txt new file mode 100644 index 0000000..87bed70 --- /dev/null +++ b/Others/DataMining_ACO/input.txt @@ -0,0 +1,12 @@ +# CityName +1 +2 +3 +4 +# Distance +1 2 1 +1 3 1.4 +1 4 1 +2 3 1 +2 4 1 +3 4 1 \ No newline at end of file diff --git a/Others/DataMining_BayesNetwork/BayesNetWorkTool.java b/Others/DataMining_BayesNetwork/BayesNetWorkTool.java new file mode 100644 index 0000000..cbf99ae --- /dev/null +++ b/Others/DataMining_BayesNetwork/BayesNetWorkTool.java @@ -0,0 +1,328 @@ +package DataMining_BayesNetwork; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * 贝叶斯网络算法工具类 + * + * @author lyq + * + */ +public class BayesNetWorkTool { + // 联合概率分布数据文件地址 + private String dataFilePath; + // 事件关联数据文件地址 + private String attachFilePath; + // 属性列列数 + private int columns; + // 概率分布数据 + private String[][] totalData; + // 关联数据对 + private ArrayList attachData; + // 节点存放列表 + private ArrayList nodes; + // 属性名与列数之间的对应关系 + private HashMap attr2Column; + + public BayesNetWorkTool(String dataFilePath, String attachFilePath) { + this.dataFilePath = dataFilePath; + this.attachFilePath = attachFilePath; + + initDatas(); + } + + /** + * 初始化关联数据和概率分布数据 + */ + private void initDatas() { + String[] columnValues; + String[] array; + ArrayList datas; + ArrayList adatas; + + // 从文件中读取数据 + datas = readDataFile(dataFilePath); + adatas = readDataFile(attachFilePath); + + columnValues = datas.get(0).split(" "); + // 属性割名称代表事件B(盗窃),E(地震),A(警铃响).M(接到M的电话),J同M的意思, + // 属性值都是y,n代表yes发生和no不发生 + this.attr2Column = new HashMap<>(); + for (int i = 0; i < columnValues.length; i++) { + // 从数据中取出属性名称行,列数值存入图中 + this.attr2Column.put(columnValues[i], i); + } + + this.columns = columnValues.length; + this.totalData = new String[datas.size()][columns]; + for (int i = 0; i < datas.size(); i++) { + this.totalData[i] = datas.get(i).split(" "); + } + + this.attachData = new ArrayList<>(); + // 解析关联数据对 + for (String str : adatas) { + array = str.split(" "); + this.attachData.add(array); + } + + // 构造贝叶斯网络结构图 + constructDAG(); + } + + /** + * 从文件中读取数据 + */ + private ArrayList readDataFile(String filePath) { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + while ((str = in.readLine()) != null) { + dataArray.add(str); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + return dataArray; + } + + /** + * 根据关联数据构造贝叶斯网络无环有向图 + */ + private void constructDAG() { + // 节点存在标识 + boolean srcExist; + boolean desExist; + String name1; + String name2; + Node srcNode; + Node desNode; + + this.nodes = new ArrayList<>(); + for (String[] array : this.attachData) { + srcExist = false; + desExist = false; + + name1 = array[0]; + name2 = array[1]; + + // 新建节点 + srcNode = new Node(name1); + desNode = new Node(name2); + + for (Node temp : this.nodes) { + // 如果找到相同节点,则取出 + if (srcNode.isEqual(temp)) { + srcExist = true; + srcNode = temp; + } else if (desNode.isEqual(temp)) { + desExist = true; + desNode = temp; + } + + // 如果2个节点都已找到,则跳出循环 + if (srcExist && desExist) { + break; + } + } + + // 将2个节点进行连接 + srcNode.connectNode(desNode); + + // 根据标识判断是否需要加入列表容器中 + if (!srcExist) { + this.nodes.add(srcNode); + } + + if (!desExist) { + 
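+ // desNode has not been seen before; register it so later edge pairs
+ // reuse this Node instance instead of creating duplicates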
this.nodes.add(desNode); + } + } + } + + /** + * 查询条件概率 + * + * @param attrValues + * 条件属性值 + * @return + */ + private double queryConditionPro(ArrayList attrValues) { + // 判断是否满足先验属性值条件 + boolean hasPrior; + // 判断是否满足后验属性值条件 + boolean hasBack; + int priorIndex; + int attrIndex; + double backPro; + double totalPro; + double pro; + double currentPro; + // 先验属性 + String[] priorValue; + String[] tempData; + + pro = 0; + totalPro = 0; + backPro = 0; + attrValues.get(0); + priorValue = attrValues.get(0); + // 得到后验概率 + attrValues.remove(0); + + // 取出先验属性的列数 + priorIndex = this.attr2Column.get(priorValue[0]); + // 跳过第一行的属性名称行 + for (int i = 1; i < this.totalData.length; i++) { + tempData = this.totalData[i]; + + hasPrior = false; + hasBack = true; + + // 当前行的概率 + currentPro = Double.parseDouble(tempData[this.columns - 1]); + // 判断是否满足先验条件 + if (tempData[priorIndex].equals(priorValue[1])) { + hasPrior = true; + } + + for (String[] array : attrValues) { + attrIndex = this.attr2Column.get(array[0]); + + // 判断值是否满足条件 + if (!tempData[attrIndex].equals(array[1])) { + hasBack = false; + break; + } + } + + // 进行计数统计,分别计算满足后验属性的值和同时满足条件的个数 + if (hasBack) { + backPro += currentPro; + if (hasPrior) { + totalPro += currentPro; + } + } else if (hasPrior && attrValues.size() == 0) { + // 如果只有先验概率则为纯概率的计算 + totalPro += currentPro; + backPro = 1.0; + } + } + + // 计算总的概率=都发生概率/只发生后验条件的时间概率 + pro = totalPro / backPro; + + return pro; + } + + /** + * 根据贝叶斯网络计算概率 + * + * @param queryStr + * 查询条件串 + * @return + */ + public double calProByNetWork(String queryStr) { + double temp; + double pro; + String[] array; + // 先验条件值 + String[] preValue; + // 后验条件值 + String[] backValue; + // 所有先验条件和后验条件值的属性值的汇总 + ArrayList attrValues; + + // 判断是否满足网络结构 + if (!satisfiedNewWork(queryStr)) { + return -1; + } + + pro = 1; + // 首先做查询条件的分解 + array = queryStr.split(","); + + // 概率的初值等于第一个事件发生的随机概率 + attrValues = new ArrayList<>(); + attrValues.add(array[0].split("=")); + pro = queryConditionPro(attrValues); + + for (int i = 0; i < array.length - 1; i++) { + attrValues.clear(); + + // 下标小的在前面的属于后验属性 + backValue = array[i].split("="); + preValue = array[i + 1].split("="); + attrValues.add(preValue); + attrValues.add(backValue); + + // 算出此种情况的概率值 + temp = queryConditionPro(attrValues); + // 进行积的相乘 + pro *= temp; + } + + return pro; + } + + /** + * 验证事件的查询因果关系是否满足贝叶斯网络 + * + * @param queryStr + * 查询字符串 + * @return + */ + private boolean satisfiedNewWork(String queryStr) { + String attrName; + String[] array; + boolean isExist; + boolean isSatisfied; + // 当前节点 + Node currentNode; + // 候选节点列表 + ArrayList nodeList; + + isSatisfied = true; + currentNode = null; + // 做查询字符串的分解 + array = queryStr.split(","); + nodeList = this.nodes; + + for (String s : array) { + // 开始时默认属性对应的节点不存在 + isExist = false; + // 得到属性事件名 + attrName = s.split("=")[0]; + + for (Node n : nodeList) { + if (n.name.equals(attrName)) { + isExist = true; + + currentNode = n; + // 下一轮的候选节点为当前节点的孩子节点 + nodeList = currentNode.childNodes; + + break; + } + } + + // 如果存在未找到的节点,则说明不满足依赖结构跳出循环 + if (!isExist) { + isSatisfied = false; + break; + } + } + + return isSatisfied; + } +} diff --git a/Others/DataMining_BayesNetwork/Client.java b/Others/DataMining_BayesNetwork/Client.java new file mode 100644 index 0000000..98706c4 --- /dev/null +++ b/Others/DataMining_BayesNetwork/Client.java @@ -0,0 +1,32 @@ +package DataMining_BayesNetwork; + +import java.text.MessageFormat; + +/** + * 贝叶斯网络场景测试类 + * + * @author lyq + * + */ +public class Client { + public static void main(String[] args) { + 
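+ // A query such as "E=y,A=y,M=y" is factored by calProByNetWork as
+ // P(E=y) * P(A=y | E=y) * P(M=y | A=y), one conditional per network
+ // edge, so the listed events must form a parent-to-child chain.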
String dataFilePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + String attachFilePath = "C:\\Users\\lyq\\Desktop\\icon\\attach.txt"; + // 查询串语句 + String queryStr; + // 结果概率 + double result; + + // 查询语句的描述的事件是地震发生了,导致响铃响了,导致接到Mary的电话 + queryStr = "E=y,A=y,M=y"; + BayesNetWorkTool tool = new BayesNetWorkTool(dataFilePath, + attachFilePath); + result = tool.calProByNetWork(queryStr); + + if (result == -1) { + System.out.println("所描述的事件不满足贝叶斯网络的结构,无法求其概率"); + } else { + System.out.println(String.format("事件%s发生的概率为%s", queryStr, result)); + } + } +} diff --git a/Others/DataMining_BayesNetwork/Node.java b/Others/DataMining_BayesNetwork/Node.java new file mode 100644 index 0000000..bb2a07d --- /dev/null +++ b/Others/DataMining_BayesNetwork/Node.java @@ -0,0 +1,58 @@ +package DataMining_BayesNetwork; + +import java.util.ArrayList; + +/** + * 贝叶斯网络节点类 + * + * @author lyq + * + */ +public class Node { + // 节点的属性名称 + String name; + // 节点的父亲节点,也就是上游节点,可能多个 + ArrayList parentNodes; + // 节点的子节点,也就是下游节点,可能多个 + ArrayList childNodes; + + public Node(String name) { + this.name = name; + + // 初始化变量 + this.parentNodes = new ArrayList<>(); + this.childNodes = new ArrayList<>(); + } + + /** + * 将自身节点连接到目标给定的节点 + * + * @param node + * 下游节点 + */ + public void connectNode(Node node) { + // 将下游节点加入自身节点的孩子节点中 + this.childNodes.add(node); + // 将自身节点加入到下游节点的父节点中 + node.parentNodes.add(this); + } + + /** + * 判断与目标节点是否相同,主要比较名称是否相同即可 + * + * @param node + * 目标结点 + * @return + */ + public boolean isEqual(Node node) { + boolean isEqual; + + isEqual = false; + // 节点名称相同则视为相等 + if (this.name.equals(node.name)) { + isEqual = true; + } + + return isEqual; + } +} diff --git a/Others/DataMining_BayesNetwork/attach.txt b/Others/DataMining_BayesNetwork/attach.txt new file mode 100644 index 0000000..bd4bdb6 --- /dev/null +++ b/Others/DataMining_BayesNetwork/attach.txt @@ -0,0 +1,4 @@ +B A +E A +A M +A J \ No newline at end of file diff --git a/Others/DataMining_BayesNetwork/input.txt b/Others/DataMining_BayesNetwork/input.txt new file mode 100644 index 0000000..ed01889 --- /dev/null +++ b/Others/DataMining_BayesNetwork/input.txt @@ -0,0 +1,33 @@ +B E A M J P +y y y y y 0.00012 +y y y y n 0.000051 +y y y n y 0.000013 +y y y n n 0.0000057 +y y n y y 0.000000005 +y y n y n 0.00000049 +y y n n y 0.000000095 +y y n n n 0.0000094 +y n y y y 0.0058 +y n y y n 0.0025 +y n y n y 0.00065 +y n y n n 0.00028 +y n n y y 0.00000029 +y n n y n 0.000029 +y n n n y 0.0000056 +y n n n n 0.00055 +n y y y y 0.0036 +n y y y n 0.0016 +n y y n y 0.0004 +n y y n n 0.00017 +n y n y y 0.000007 +n y n y n 0.00069 +n y n n y 0.00013 +n y n n n 0.013 +n n y y y 0.00061 +n n y y n 0.00026 +n n y n y 0.000068 +n n y n n 0.000029 +n n n y y 0.00048 +n n n y n 0.048 +n n n n y 0.0092 +n n n n n 0.91 \ No newline at end of file diff --git a/Others/DataMining_CABDDCC/CABDDCCTool.java b/Others/DataMining_CABDDCC/CABDDCCTool.java new file mode 100644 index 0000000..34081b4 --- /dev/null +++ b/Others/DataMining_CABDDCC/CABDDCCTool.java @@ -0,0 +1,102 @@ +package DataMining_CABDDCC; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.MessageFormat; +import java.util.ArrayList; + +/** + * 基于连通图的分裂聚类算法 + * + * @author lyq + * + */ +public class CABDDCCTool { + // 测试数据点数据 + private String filePath; + // 连通图距离阈值l + private int length; + // 原始坐标点 + public static ArrayList totalPoints; + // 聚类结果坐标点集合 + private ArrayList> resultClusters; + // 连通图 + private Graph graph; + + public 
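+ // Overall flow: splitGraphByLength first connects every pair of points
+ // within `length` of each other into connected subgraphs; each subgraph
+ // is then recursively divided (Graph.divideGraph) until no candidate
+ // split passes the bearing-coefficient test in needDivided.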
CABDDCCTool(String filePath, int length) { + this.filePath = filePath; + this.length = length; + + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + public void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + Point p; + totalPoints = new ArrayList<>(); + for (String[] array : dataArray) { + p = new Point(array[0], array[1], array[2]); + totalPoints.add(p); + } + + // 用边和点构造图 + graph = new Graph(null, totalPoints); + } + + /** + * 分裂连通图得到聚类 + */ + public void splitCluster() { + // 获取形成连通子图 + ArrayList subGraphs; + ArrayList> pointList; + resultClusters = new ArrayList<>(); + + subGraphs = graph.splitGraphByLength(length); + + for (Graph g : subGraphs) { + // 获取每个连通子图分裂后的聚类结果 + pointList = g.getClusterByDivding(); + resultClusters.addAll(pointList); + } + + printResultCluster(); + } + + /** + * 输出结果聚簇 + */ + private void printResultCluster() { + int i = 1; + for (ArrayList cluster : resultClusters) { + System.out.print("聚簇" + i + ":"); + for (Point p : cluster){ + System.out.print(MessageFormat.format("({0}, {1}) ", p.x, p.y)); + } + System.out.println(); + i++; + } + + } + +} diff --git a/Others/DataMining_CABDDCC/Client.java b/Others/DataMining_CABDDCC/Client.java new file mode 100644 index 0000000..c57e3f5 --- /dev/null +++ b/Others/DataMining_CABDDCC/Client.java @@ -0,0 +1,17 @@ +package DataMining_CABDDCC; + +/** + * 基于连通图的分裂聚类算法 + * @author lyq + * + */ +public class Client { + public static void main(String[] agrs){ + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\graphData.txt"; + //连通距离阈值 + int length = 3; + + CABDDCCTool tool = new CABDDCCTool(filePath, length); + tool.splitCluster(); + } +} diff --git a/Others/DataMining_CABDDCC/Graph.java b/Others/DataMining_CABDDCC/Graph.java new file mode 100644 index 0000000..b59d06e --- /dev/null +++ b/Others/DataMining_CABDDCC/Graph.java @@ -0,0 +1,287 @@ +package DataMining_CABDDCC; + +import java.util.ArrayList; +import java.util.Collections; + +/** + * 连通图类 + * + * @author lyq + * + */ +public class Graph { + // 坐标点之间的连接属性,括号内为坐标id号 + int[][] edges; + // 连通图内的坐标点数 + ArrayList points; + // 此图下分割后的聚类子图 + ArrayList> clusters; + + public Graph(int[][] edges) { + this.edges = edges; + this.points = getPointByEdges(edges); + } + + public Graph(int[][] edges, ArrayList points) { + this.edges = edges; + this.points = points; + } + + public int[][] getEdges() { + return edges; + } + + public void setEdges(int[][] edges) { + this.edges = edges; + } + + public ArrayList getPoints() { + return points; + } + + public void setPoints(ArrayList points) { + this.points = points; + } + + /** + * 根据距离阈值做连通图的划分,构成连通图集 + * + * @param length + * 距离阈值 + * @return + */ + public ArrayList splitGraphByLength(int length) { + int[][] edges; + Graph tempGraph; + ArrayList graphs = new ArrayList<>(); + + for (Point p : points) { + if (!p.isVisited) { + // 括号中的下标为id号 + edges = new int[points.size()][points.size()]; + dfsExpand(p, length, edges); + + tempGraph = new Graph(edges); + graphs.add(tempGraph); + } else { + continue; + } + } + + return graphs; + } + + /** + * 深度优先方式扩展连通图 + * + * @param points + * 需要继续深搜的坐标点 + * @param length + * 距离阈值 + * @param edges + * 边数组 + */ + private void dfsExpand(Point point, int length, int edges[][]) { + int 
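+ // Links `point` to every other point within `length` and recurses on
+ // those neighbours, so a single call from splitGraphByLength fills
+ // `edges` for one whole connected component.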
id1 = 0; + int id2 = 0; + double distance = 0; + ArrayList tempPoints; + + // 如果处理过了,则跳过 + if (point.isVisited) { + return; + } + + id1 = point.id; + point.isVisited = true; + tempPoints = new ArrayList<>(); + for (Point p2 : points) { + id2 = p2.id; + + if (id1 == id2) { + continue; + } else { + distance = point.ouDistance(p2); + if (distance <= length) { + edges[id1][id2] = 1; + edges[id2][id1] = 1; + + tempPoints.add(p2); + } + } + } + + // 继续递归 + for (Point p : tempPoints) { + dfsExpand(p, length, edges); + } + } + + /** + * 判断连通图是否还需要再被划分 + * + * @param pointList1 + * 坐标点集合1 + * @param pointList2 + * 坐标点集合2 + * @return + */ + private boolean needDivided(ArrayList pointList1, + ArrayList pointList2) { + boolean needDivided = false; + // 承受系数t=轻的集合的坐标点数/2部分连接的边数 + double t = 0; + // 分裂阈值,即平均每边所要承受的重量 + double landa = 0; + int pointNum1 = pointList1.size(); + int pointNum2 = pointList2.size(); + // 总边数 + int totalEdgeNum = 0; + // 连接2部分的边数量 + int connectedEdgeNum = 0; + ArrayList totalPoints = new ArrayList<>(); + + totalPoints.addAll(pointList1); + totalPoints.addAll(pointList2); + int id1 = 0; + int id2 = 0; + for (Point p1 : totalPoints) { + id1 = p1.id; + for (Point p2 : totalPoints) { + id2 = p2.id; + + if (edges[id1][id2] == 1 && id1 < id2) { + if ((pointList1.contains(p1) && pointList2.contains(p2)) + || (pointList1.contains(p2) && pointList2 + .contains(p1))) { + connectedEdgeNum++; + } + totalEdgeNum++; + } + } + } + + if (pointNum1 < pointNum2) { + // 承受系数t=轻的集合的坐标点数/连接2部分的边数 + t = 1.0 * pointNum1 / connectedEdgeNum; + } else { + t = 1.0 * pointNum2 / connectedEdgeNum; + } + + // 计算分裂阈值,括号内为总边数/总点数,就是平均每边所承受的点数量 + landa = 0.5 * Math.exp((1.0 * totalEdgeNum / (pointNum1 + pointNum2))); + + // 如果承受系数不小于分裂阈值,则代表需要分裂 + if (t >= landa) { + needDivided = true; + } + + return needDivided; + } + + /** + * 递归的划分连通图 + * + * @param pointList + * 待划分的连通图的所有坐标点 + */ + public void divideGraph(ArrayList pointList) { + // 判断此坐标点集合是否能够被分割 + boolean canDivide = false; + ArrayList> pointGroup; + ArrayList pointList1 = new ArrayList<>(); + ArrayList pointList2 = new ArrayList<>(); + + for (int m = 2; m <= pointList.size() / 2; m++) { + // 进行坐标点的分割 + pointGroup = removePoint(pointList, m); + pointList1 = pointGroup.get(0); + pointList2 = pointGroup.get(1); + + // 判断是否满足分裂条件 + if (needDivided(pointList1, pointList2)) { + canDivide = true; + divideGraph(pointList1); + divideGraph(pointList2); + } + } + + // 如果所有的分割组合都无法分割,则说明此已经是一个聚类 + if (!canDivide) { + clusters.add(pointList); + } + } + + /** + * 获取分裂得到的聚类结果 + * + * @return + */ + public ArrayList> getClusterByDivding() { + clusters = new ArrayList<>(); + + divideGraph(points); + + return clusters; + } + + /** + * 将当前坐标点集合移除removeNum个点,构成2个子坐标点集合 + * + * @param pointList + * 原集合点 + * @param removeNum + * 移除的数量 + */ + private ArrayList> removePoint(ArrayList pointList, + int removeNum) { + //浅拷贝一份原坐标点数据 + ArrayList copyPointList = (ArrayList) pointList.clone(); + ArrayList> pointGroup = new ArrayList<>(); + ArrayList pointList2 = new ArrayList<>(); + // 进行按照坐标轴大小排序 + Collections.sort(copyPointList); + + for (int i = 0; i < removeNum; i++) { + pointList2.add(copyPointList.get(i)); + } + copyPointList.removeAll(pointList2); + + pointGroup.add(copyPointList); + pointGroup.add(pointList2); + + return pointGroup; + } + + /** + * 根据边的情况获取其中的点 + * + * @param edges + * 当前的已知的边的情况 + * @return + */ + private ArrayList getPointByEdges(int[][] edges) { + Point p1; + Point p2; + ArrayList pointList = new ArrayList<>(); + + for (int i = 0; i < edges.length; i++) 
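+ // collect both endpoints of every edge marked 1, skipping points that
+ // are already listed, to recover this subgraph's point set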
{ + for (int j = 0; j < edges[0].length; j++) { + if (edges[i][j] == 1) { + p1 = CABDDCCTool.totalPoints.get(i); + p2 = CABDDCCTool.totalPoints.get(j); + + if (!pointList.contains(p1)) { + pointList.add(p1); + } + + if (!pointList.contains(p2)) { + pointList.add(p2); + } + } + } + } + + return pointList; + } +} diff --git a/Others/DataMining_CABDDCC/Point.java b/Others/DataMining_CABDDCC/Point.java new file mode 100644 index 0000000..2763be4 --- /dev/null +++ b/Others/DataMining_CABDDCC/Point.java @@ -0,0 +1,69 @@ +package DataMining_CABDDCC; + + + +/** + * 坐标点类 + * @author lyq + * + */ +public class Point implements Comparable{ + //坐标点id号,id号唯一 + int id; + //坐标横坐标 + Integer x; + //坐标纵坐标 + Integer y; + //坐标点是否已经被访问(处理)过,在生成连通子图的时候用到 + boolean isVisited; + + public Point(String id, String x, String y){ + this.id = Integer.parseInt(id); + this.x = Integer.parseInt(x); + this.y = Integer.parseInt(y); + } + + /** + * 计算当前点与制定点之间的欧式距离 + * + * @param p + * 待计算聚类的p点 + * @return + */ + public double ouDistance(Point p) { + double distance = 0; + + distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) + * (this.y - p.y); + distance = Math.sqrt(distance); + + return distance; + } + + /** + * 判断2个坐标点是否为用个坐标点 + * + * @param p + * 待比较坐标点 + * @return + */ + public boolean isTheSame(Point p) { + boolean isSamed = false; + + if (this.x == p.x && this.y == p.y) { + isSamed = true; + } + + return isSamed; + } + + @Override + public int compareTo(Point p) { + if(this.x.compareTo(p.x) != 0){ + return this.x.compareTo(p.x); + }else{ + //如果在x坐标相等的情况下比较y坐标 + return this.y.compareTo(p.y); + } + } +} diff --git a/Others/DataMining_CABDDCC/graphData.txt b/Others/DataMining_CABDDCC/graphData.txt new file mode 100644 index 0000000..9a04431 --- /dev/null +++ b/Others/DataMining_CABDDCC/graphData.txt @@ -0,0 +1,15 @@ +0 1 12 +1 3 9 +2 3 12 +3 4 10 +4 4 4 +5 4 1 +6 6 1 +7 6 3 +8 6 9 +9 8 3 +10 8 10 +11 9 2 +12 9 11 +13 10 9 +14 11 12 \ No newline at end of file diff --git a/Others/DataMining_Chameleon/ChameleonTool.java b/Others/DataMining_Chameleon/ChameleonTool.java new file mode 100644 index 0000000..811ea3d --- /dev/null +++ b/Others/DataMining_Chameleon/ChameleonTool.java @@ -0,0 +1,423 @@ +package DataMining_Chameleon; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.MessageFormat; +import java.util.ArrayList; + +/** + * Chameleon 两阶段聚类算法工具类 + * + * @author lyq + * + */ +public class ChameleonTool { + // 测试数据点文件地址 + private String filePath; + // 第一阶段的k近邻的k大小 + private int k; + // 簇度量函数阈值 + private double minMetric; + // 总的坐标点的个数 + private int pointNum; + // 总的连接矩阵的情况,括号表示的是坐标点的id号 + public static int[][] edges; + // 点与点之间的边的权重 + public static double[][] weights; + // 原始坐标点数据 + private ArrayList totalPoints; + // 第一阶段产生的所有的连通子图作为最初始的聚类 + private ArrayList initClusters; + // 结果簇结合 + private ArrayList resultClusters; + + public ChameleonTool(String filePath, int k, double minMetric) { + this.filePath = filePath; + this.k = k; + this.minMetric = minMetric; + + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + private void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + Point p; + totalPoints = new 
ArrayList<>(); + for (String[] array : dataArray) { + p = new Point(array[0], array[1], array[2]); + totalPoints.add(p); + } + pointNum = totalPoints.size(); + } + + /** + * 递归的合并小聚簇 + */ + private void combineSubClusters() { + Cluster cluster = null; + + resultClusters = new ArrayList<>(); + + // 当最后的聚簇只剩下一个的时候,则退出循环 + while (initClusters.size() > 1) { + cluster = initClusters.get(0); + combineAndRemove(cluster, initClusters); + } + } + + /** + * 递归的合并聚簇和移除聚簇 + * + * @param clusterList + */ + private ArrayList combineAndRemove(Cluster cluster, + ArrayList clusterList) { + ArrayList remainClusters; + double metric = 0; + double maxMetric = -Integer.MAX_VALUE; + Cluster cluster1 = null; + Cluster cluster2 = null; + + for (Cluster c2 : clusterList) { + if(cluster.id == c2.id){ + continue; + } + + metric = calMetricfunction(cluster, c2, 1); + + if (metric > maxMetric) { + maxMetric = metric; + cluster1 = cluster; + cluster2 = c2; + } + } + + // 如果度量函数值超过阈值,则进行合并,继续搜寻可以合并的簇 + if (maxMetric > minMetric) { + clusterList.remove(cluster2); + //将边进行连接 + connectClusterToCluster(cluster1, cluster2); + // 将簇1和簇2合并 + cluster1.points.addAll(cluster2.points); + remainClusters = combineAndRemove(cluster1, clusterList); + } else { + clusterList.remove(cluster); + remainClusters = clusterList; + resultClusters.add(cluster); + } + + return remainClusters; + } + + /** + * 将2个簇进行边的连接 + * @param c1 + * 聚簇1 + * @param c2 + * 聚簇2 + */ + private void connectClusterToCluster(Cluster c1, Cluster c2){ + ArrayList connectedEdges; + + connectedEdges = c1.calNearestEdge(c2, 2); + + for(int[] array: connectedEdges){ + edges[array[0]][array[1]] = 1; + edges[array[1]][array[0]] = 1; + } + } + + /** + * 算法第一阶段形成局部的连通图 + */ + private void connectedGraph() { + double distance = 0; + Point p1; + Point p2; + + // 初始化权重矩阵和连接矩阵 + weights = new double[pointNum][pointNum]; + edges = new int[pointNum][pointNum]; + for (int i = 0; i < pointNum; i++) { + for (int j = 0; j < pointNum; j++) { + p1 = totalPoints.get(i); + p2 = totalPoints.get(j); + + distance = p1.ouDistance(p2); + if (distance == 0) { + // 如果点为自身的话,则权重设置为0 + weights[i][j] = 0; + } else { + // 边的权重采用的值为距离的倒数,距离越近,权重越大 + weights[i][j] = 1.0 / distance; + } + } + } + + double[] tempWeight; + int[] ids; + int id1 = 0; + int id2 = 0; + // 对每个id坐标点,取其权重前k个最大的点进行相连 + for (int i = 0; i < pointNum; i++) { + tempWeight = weights[i]; + // 进行排序 + ids = sortWeightArray(tempWeight); + + // 取出前k个权重最大的边进行连接 + for (int j = 0; j < ids.length; j++) { + if (j < k) { + id1 = i; + id2 = ids[j]; + + edges[id1][id2] = 1; + edges[id2][id1] = 1; + } + } + } + } + + /** + * 权重的冒泡算法排序 + * + * @param array + * 待排序数组 + */ + private int[] sortWeightArray(double[] array) { + double[] copyArray = array.clone(); + int[] ids = null; + int k = 0; + double maxWeight = -1; + + ids = new int[pointNum]; + for(int i=0; i maxWeight){ + maxWeight = copyArray[j]; + k = j; + } + } + + ids[i] = k; + //将当前找到的最大的值重置为-1代表已经找到过了 + copyArray[k] = -1; + } + + return ids; + } + + /** + * 根据边的连通性去深度优先搜索所有的小聚簇 + */ + private void searchSmallCluster() { + int currentId = 0; + Point p; + Cluster cluster; + initClusters = new ArrayList<>(); + ArrayList pointList = null; + + // 以id的方式逐个去dfs搜索 + for (int i = 0; i < pointNum; i++) { + p = totalPoints.get(i); + + if (p.isVisited) { + continue; + } + + pointList = new ArrayList<>(); + pointList.add(p); + recusiveDfsSearch(p, -1, pointList); + + cluster = new Cluster(currentId, pointList); + initClusters.add(cluster); + + currentId++; + } + } + + /** + * 深度优先的方式找到边所连接着的所有坐标点 + * + * 
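+ * Every point reachable from p along the connected edges is appended
+ * to pList, which then forms one initial mini-cluster.
+ *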
@param p + * 当前搜索的起点 + * @param lastId + * 此点的父坐标点 + * @param pList + * 坐标点列表 + */ + private void recusiveDfsSearch(Point p, int parentId, ArrayList pList) { + int id1 = 0; + int id2 = 0; + Point newPoint; + + if (p.isVisited) { + return; + } + + p.isVisited = true; + for (int j = 0; j < pointNum; j++) { + id1 = p.id; + id2 = j; + + if (edges[id1][id2] == 1 && id2 != parentId) { + newPoint = totalPoints.get(j); + pList.add(newPoint); + // 以此点为起点,继续递归搜索 + recusiveDfsSearch(newPoint, id1, pList); + } + } + } + + /** + * 计算连接2个簇的边的权重 + * + * @param c1 + * 聚簇1 + * @param c2 + * 聚簇2 + * @return + */ + private double calEC(Cluster c1, Cluster c2) { + double resultEC = 0; + ArrayList connectedEdges = null; + + connectedEdges = c1.calNearestEdge(c2, 2); + + // 计算连接2部分的边的权重和 + for (int[] array : connectedEdges) { + resultEC += weights[array[0]][array[1]]; + } + + return resultEC; + } + + /** + * 计算2个簇的相对互连性 + * + * @param c1 + * @param c2 + * @return + */ + private double calRI(Cluster c1, Cluster c2) { + double RI = 0; + double EC1 = 0; + double EC2 = 0; + double EC1To2 = 0; + + EC1 = c1.calEC(); + EC2 = c2.calEC(); + EC1To2 = calEC(c1, c2); + + RI = 2 * EC1To2 / (EC1 + EC2); + + return RI; + } + + /** + * 计算簇的相对近似度 + * + * @param c1 + * 簇1 + * @param c2 + * 簇2 + * @return + */ + private double calRC(Cluster c1, Cluster c2) { + double RC = 0; + double EC1 = 0; + double EC2 = 0; + double EC1To2 = 0; + int pNum1 = c1.points.size(); + int pNum2 = c2.points.size(); + + EC1 = c1.calEC(); + EC2 = c2.calEC(); + EC1To2 = calEC(c1, c2); + + RC = EC1To2 * (pNum1 + pNum2) / (pNum2 * EC1 + pNum1 * EC2); + + return RC; + } + + /** + * 计算度量函数的值 + * + * @param c1 + * 簇1 + * @param c2 + * 簇2 + * @param alpha + * 幂的参数值 + * @return + */ + private double calMetricfunction(Cluster c1, Cluster c2, int alpha) { + // 度量函数值 + double metricValue = 0; + double RI = 0; + double RC = 0; + + RI = calRI(c1, c2); + RC = calRC(c1, c2); + // 如果alpha大于1,则更重视相对近似性,如果alpha逍遥于1,注重相对互连性 + metricValue = RI * Math.pow(RC, alpha); + + return metricValue; + } + + /** + * 输出聚簇列 + * @param clusterList + * 输出聚簇列 + */ + private void printClusters(ArrayList clusterList) { + int i = 1; + + for (Cluster cluster : clusterList) { + System.out.print("聚簇" + i + ":"); + for (Point p : cluster.points) { + System.out.print(MessageFormat.format("({0}, {1}) ", p.x, p.y)); + } + System.out.println(); + i++; + } + + } + + /** + * 创建聚簇 + */ + public void buildCluster() { + // 第一阶段形成小聚簇 + connectedGraph(); + searchSmallCluster(); + System.out.println("第一阶段形成的小簇集合:"); + printClusters(initClusters); + + // 第二阶段根据RI和RC的值合并小聚簇形成最终结果聚簇 + combineSubClusters(); + System.out.println("最终的聚簇集合:"); + printClusters(resultClusters); + } +} diff --git a/Others/DataMining_Chameleon/Client.java b/Others/DataMining_Chameleon/Client.java new file mode 100644 index 0000000..254f760 --- /dev/null +++ b/Others/DataMining_Chameleon/Client.java @@ -0,0 +1,19 @@ +package DataMining_Chameleon; + +/** + * Chameleon(变色龙)两阶段聚类算法 + * @author lyq + * + */ +public class Client { + public static void main(String[] args){ + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\graphData.txt"; + //k-近邻的k设置 + int k = 1; + //度量函数阈值 + double minMetric = 0.1; + + ChameleonTool tool = new ChameleonTool(filePath, k, minMetric); + tool.buildCluster(); + } +} diff --git a/Others/DataMining_Chameleon/Cluster.java b/Others/DataMining_Chameleon/Cluster.java new file mode 100644 index 0000000..42e1f94 --- /dev/null +++ b/Others/DataMining_Chameleon/Cluster.java @@ -0,0 +1,119 @@ +package 
DataMining_Chameleon; + +import java.util.ArrayList; + +/** + * 聚簇类 + * + * @author lyq + * + */ +public class Cluster implements Cloneable{ + //簇唯一id标识号 + int id; + // 聚簇内的坐标点集合 + ArrayList points; + // 聚簇内的所有边的权重和 + double weightSum = 0; + + public Cluster(int id, ArrayList points) { + this.id = id; + this.points = points; + } + + /** + * 计算聚簇的内部的边权重和 + * + * @return + */ + public double calEC() { + int id1 = 0; + int id2 = 0; + weightSum = 0; + + for (Point p1 : points) { + for (Point p2 : points) { + id1 = p1.id; + id2 = p2.id; + + // 为了避免重复计算,取id1小的对应大的 + if (id1 < id2 && ChameleonTool.edges[id1][id2] == 1) { + weightSum += ChameleonTool.weights[id1][id2]; + } + } + } + + return weightSum; + } + + /** + * 计算2个簇之间最近的n条边 + * + * @param otherCluster + * 待比较的簇 + * @param n + * 最近的边的数目 + * @return + */ + public ArrayList calNearestEdge(Cluster otherCluster, int n){ + int count = 0; + double distance = 0; + double minDistance = Integer.MAX_VALUE; + Point point1 = null; + Point point2 = null; + ArrayList edgeList = new ArrayList<>(); + ArrayList pointList1 = (ArrayList) points.clone(); + ArrayList pointList2 = null; + Cluster c2 = null; + + try { + c2 = (Cluster) otherCluster.clone(); + pointList2 = c2.points; + } catch (CloneNotSupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + int[] tempEdge; + // 循环计算出每次的最近距离 + while (count < n) { + tempEdge = new int[2]; + minDistance = Integer.MAX_VALUE; + + for (Point p1 : pointList1) { + for (Point p2 : pointList2) { + distance = p1.ouDistance(p2); + if (distance < minDistance) { + point1 = p1; + point2 = p2; + tempEdge[0] = p1.id; + tempEdge[1] = p2.id; + + minDistance = distance; + } + } + } + + pointList1.remove(point1); + pointList2.remove(point2); + edgeList.add(tempEdge); + count++; + } + + return edgeList; + } + + @Override + protected Object clone() throws CloneNotSupportedException { + // TODO Auto-generated method stub + + //引用需要再次复制,实现深拷贝 + ArrayList pointList = (ArrayList) this.points.clone(); + Cluster cluster = new Cluster(id, pointList); + + return cluster; + } + + + +} diff --git a/Others/DataMining_Chameleon/Point.java b/Others/DataMining_Chameleon/Point.java new file mode 100644 index 0000000..2a3b8cc --- /dev/null +++ b/Others/DataMining_Chameleon/Point.java @@ -0,0 +1,59 @@ +package DataMining_Chameleon; + + + +/** + * 坐标点类 + * @author lyq + * + */ +public class Point{ + //坐标点id号,id号唯一 + int id; + //坐标横坐标 + Integer x; + //坐标纵坐标 + Integer y; + //是否已经被访问过 + boolean isVisited; + + public Point(String id, String x, String y){ + this.id = Integer.parseInt(id); + this.x = Integer.parseInt(x); + this.y = Integer.parseInt(y); + } + + /** + * 计算当前点与制定点之间的欧式距离 + * + * @param p + * 待计算聚类的p点 + * @return + */ + public double ouDistance(Point p) { + double distance = 0; + + distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) + * (this.y - p.y); + distance = Math.sqrt(distance); + + return distance; + } + + /** + * 判断2个坐标点是否为用个坐标点 + * + * @param p + * 待比较坐标点 + * @return + */ + public boolean isTheSame(Point p) { + boolean isSamed = false; + + if (this.x == p.x && this.y == p.y) { + isSamed = true; + } + + return isSamed; + } +} diff --git a/Others/DataMining_Chameleon/graphData.txt b/Others/DataMining_Chameleon/graphData.txt new file mode 100644 index 0000000..d618d9a --- /dev/null +++ b/Others/DataMining_Chameleon/graphData.txt @@ -0,0 +1,19 @@ +0 2 2 +1 3 1 +2 3 4 +3 3 14 +4 5 3 +5 8 3 +6 8 6 +7 9 8 +8 10 4 +9 10 7 +10 10 10 +11 10 14 +12 11 13 +13 12 8 +14 12 15 +15 14 7 +16 14 9 +17 14 15 +18 
15 8 \ No newline at end of file diff --git a/Others/DataMining_DBSCAN/Client.java b/Others/DataMining_DBSCAN/Client.java new file mode 100644 index 0000000..f3d810c --- /dev/null +++ b/Others/DataMining_DBSCAN/Client.java @@ -0,0 +1,19 @@ +package DataMining_DBSCAN; + +/** + * Dbscan基于密度的聚类算法测试类 + * @author lyq + * + */ +public class Client { + public static void main(String[] args){ + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + //簇扫描半径 + double eps = 3; + //最小包含点数阈值 + int minPts = 3; + + DBSCANTool tool = new DBSCANTool(filePath, eps, minPts); + tool.dbScanCluster(); + } +} diff --git a/Others/DataMining_DBSCAN/DBSCANTool.java b/Others/DataMining_DBSCAN/DBSCANTool.java new file mode 100644 index 0000000..27f2f8e --- /dev/null +++ b/Others/DataMining_DBSCAN/DBSCANTool.java @@ -0,0 +1,209 @@ +package DataMining_DBSCAN; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.MessageFormat; +import java.util.ArrayList; + +/** + * DBSCAN基于密度聚类算法工具类 + * + * @author lyq + * + */ +public class DBSCANTool { + // 测试数据文件地址 + private String filePath; + // 簇扫描半径 + private double eps; + // 最小包含点数阈值 + private int minPts; + // 所有的数据坐标点 + private ArrayList totalPoints; + // 聚簇结果 + private ArrayList> resultClusters; + //噪声数据 + private ArrayList noisePoint; + + public DBSCANTool(String filePath, double eps, int minPts) { + this.filePath = filePath; + this.eps = eps; + this.minPts = minPts; + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + public void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + Point p; + totalPoints = new ArrayList<>(); + for (String[] array : dataArray) { + p = new Point(array[0], array[1]); + totalPoints.add(p); + } + } + + /** + * 递归的寻找聚簇 + * + * @param pointList + * 当前的点列表 + * @param parentCluster + * 父聚簇 + */ + private void recursiveCluster(Point point, ArrayList parentCluster) { + double distance = 0; + ArrayList cluster; + + // 如果已经访问过了,则跳过 + if (point.isVisited) { + return; + } + + point.isVisited = true; + cluster = new ArrayList<>(); + for (Point p2 : totalPoints) { + // 过滤掉自身的坐标点 + if (point.isTheSame(p2)) { + continue; + } + + distance = point.ouDistance(p2); + if (distance <= eps) { + // 如果聚类小于给定的半径,则加入簇中 + cluster.add(p2); + } + } + + if (cluster.size() >= minPts) { + // 将自己也加入到聚簇中 + cluster.add(point); + // 如果附近的节点个数超过最下值,则加入到父聚簇中,同时去除重复的点 + addCluster(parentCluster, cluster); + + for (Point p : cluster) { + recursiveCluster(p, parentCluster); + } + } + } + + /** + * 往父聚簇中添加局部簇坐标点 + * + * @param parentCluster + * 原始父聚簇坐标点 + * @param cluster + * 待合并的聚簇 + */ + private void addCluster(ArrayList parentCluster, + ArrayList cluster) { + boolean isCotained = false; + ArrayList addPoints = new ArrayList<>(); + + for (Point p : cluster) { + isCotained = false; + for (Point p2 : parentCluster) { + if (p.isTheSame(p2)) { + isCotained = true; + break; + } + } + + if (!isCotained) { + addPoints.add(p); + } + } + + parentCluster.addAll(addPoints); + } + + /** + * dbScan算法基于密度的聚类 + */ + public void dbScanCluster() { + ArrayList cluster = null; + resultClusters = new ArrayList<>(); + noisePoint = new ArrayList<>(); + + for (Point p : totalPoints) { + if(p.isVisited){ + 
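+ // p was already absorbed while expanding an earlier seed point,
+ // so it cannot start a new cluster of its own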
continue; + } + + cluster = new ArrayList<>(); + recursiveCluster(p, cluster); + + if (cluster.size() > 0) { + resultClusters.add(cluster); + }else{ + noisePoint.add(p); + } + } + removeFalseNoise(); + + printClusters(); + } + + /** + * 移除被错误分类的噪声点数据 + */ + private void removeFalseNoise(){ + ArrayList totalCluster = new ArrayList<>(); + ArrayList deletePoints = new ArrayList<>(); + + //将聚簇合并 + for(ArrayList list: resultClusters){ + totalCluster.addAll(list); + } + + for(Point p: noisePoint){ + for(Point p2: totalCluster){ + if(p2.isTheSame(p)){ + deletePoints.add(p); + } + } + } + + noisePoint.removeAll(deletePoints); + } + + /** + * 输出聚类结果 + */ + private void printClusters() { + int i = 1; + for (ArrayList pList : resultClusters) { + System.out.print("聚簇" + (i++) + ":"); + for (Point p : pList) { + System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y)); + } + System.out.println(); + } + + System.out.println(); + System.out.print("噪声数据:"); + for (Point p : noisePoint) { + System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y)); + } + System.out.println(); + } +} diff --git a/Others/DataMining_DBSCAN/Point.java b/Others/DataMining_DBSCAN/Point.java new file mode 100644 index 0000000..f773bad --- /dev/null +++ b/Others/DataMining_DBSCAN/Point.java @@ -0,0 +1,56 @@ +package DataMining_DBSCAN; + +/** + * 坐标点类 + * + * @author lyq + * + */ +public class Point { + // 坐标点横坐标 + int x; + // 坐标点纵坐标 + int y; + // 此节点是否已经被访问过 + boolean isVisited; + + public Point(String x, String y) { + this.x = (Integer.parseInt(x)); + this.y = (Integer.parseInt(y)); + this.isVisited = false; + } + + /** + * 计算当前点与制定点之间的欧式距离 + * + * @param p + * 待计算聚类的p点 + * @return + */ + public double ouDistance(Point p) { + double distance = 0; + + distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) + * (this.y - p.y); + distance = Math.sqrt(distance); + + return distance; + } + + /** + * 判断2个坐标点是否为用个坐标点 + * + * @param p + * 待比较坐标点 + * @return + */ + public boolean isTheSame(Point p) { + boolean isSamed = false; + + if (this.x == p.x && this.y == p.y) { + isSamed = true; + } + + return isSamed; + } +} diff --git a/Others/DataMining_DBSCAN/input.txt b/Others/DataMining_DBSCAN/input.txt new file mode 100644 index 0000000..5bd1c13 --- /dev/null +++ b/Others/DataMining_DBSCAN/input.txt @@ -0,0 +1,19 @@ +2 2 +3 1 +3 4 +3 14 +5 3 +8 3 +8 6 +9 8 +10 4 +10 7 +10 10 +10 14 +11 13 +12 8 +12 15 +14 7 +14 9 +14 15 +15 8 \ No newline at end of file diff --git a/Others/DataMining_GA/Client.java b/Others/DataMining_GA/Client.java new file mode 100644 index 0000000..eff2dbc --- /dev/null +++ b/Others/DataMining_GA/Client.java @@ -0,0 +1,19 @@ +package GA; + +/** + * Genetic遗传算法测试类 + * @author lyq + * + */ +public class Client { + public static void main(String[] args){ + //变量最小值和最大值 + int minNum = 1; + int maxNum = 7; + //初始群体规模 + int initSetsNum = 4; + + GATool tool = new GATool(minNum, maxNum, initSetsNum); + tool.geneticCal(); + } +} diff --git a/Others/DataMining_GA/GATool.java b/Others/DataMining_GA/GATool.java new file mode 100644 index 0000000..567c393 --- /dev/null +++ b/Others/DataMining_GA/GATool.java @@ -0,0 +1,361 @@ +package GA; + +import java.util.ArrayList; +import java.util.Random; + +/** + * 遗传算法工具类 + * + * @author lyq + * + */ +public class GATool { + // 变量最小值 + private int minNum; + // 变量最大值 + private int maxNum; + // 单个变量的编码位数 + private int codeNum; + // 初始种群的数量 + private int initSetsNum; + // 随机数生成器 + private Random random; + // 初始群体 + private ArrayList initSets; + + public GATool(int minNum, int maxNum, 
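+ // population size; should be even, because crossOperate mates
+ // individuals two by two and an unpaired leftover is dropped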
int initSetsNum) { + this.minNum = minNum; + this.maxNum = maxNum; + this.initSetsNum = initSetsNum; + + this.random = new Random(); + produceInitSets(); + } + + /** + * 产生初始化群体 + */ + private void produceInitSets() { + this.codeNum = 0; + int num = maxNum; + int[] array; + + initSets = new ArrayList<>(); + + // 确定编码位数 + while (num != 0) { + codeNum++; + num /= 2; + } + + for (int i = 0; i < initSetsNum; i++) { + array = produceInitCode(); + initSets.add(array); + } + } + + /** + * 产生初始个体的编码 + * + * @return + */ + private int[] produceInitCode() { + int num = 0; + int num2 = 0; + int[] tempArray; + int[] array1; + int[] array2; + + tempArray = new int[2 * codeNum]; + array1 = new int[codeNum]; + array2 = new int[codeNum]; + + num = 0; + while (num < minNum || num > maxNum) { + num = random.nextInt(maxNum) + 1; + } + numToBinaryArray(array1, num); + + while (num2 < minNum || num2 > maxNum) { + num2 = random.nextInt(maxNum) + 1; + } + numToBinaryArray(array2, num2); + + // 组成总的编码 + for (int i = 0, k = 0; i < tempArray.length; i++, k++) { + if (k < codeNum) { + tempArray[i] = array1[k]; + } else { + tempArray[i] = array2[k - codeNum]; + } + } + + return tempArray; + } + + /** + * 选择操作,把适值较高的个体优先遗传到下一代 + * + * @param initCodes + * 初始个体编码 + * @return + */ + private ArrayList selectOperate(ArrayList initCodes) { + double randomNum = 0; + double sumAdaptiveValue = 0; + ArrayList resultCodes = new ArrayList<>(); + double[] adaptiveValue = new double[initSetsNum]; + + for (int i = 0; i < initSetsNum; i++) { + adaptiveValue[i] = calCodeAdaptiveValue(initCodes.get(i)); + sumAdaptiveValue += adaptiveValue[i]; + } + + // 转成概率的形式,做归一化操作 + for (int i = 0; i < initSetsNum; i++) { + adaptiveValue[i] = adaptiveValue[i] / sumAdaptiveValue; + } + + for (int i = 0; i < initSetsNum; i++) { + randomNum = random.nextInt(100) + 1; + randomNum = randomNum / 100; + //因为1.0是无法判断到的,,总和会无限接近1.0取为0.99做判断 + if(randomNum == 1){ + randomNum = randomNum - 0.01; + } + + sumAdaptiveValue = 0; + // 确定区间 + for (int j = 0; j < initSetsNum; j++) { + if (randomNum > sumAdaptiveValue + && randomNum <= sumAdaptiveValue + adaptiveValue[j]) { + //采用拷贝的方式避免引用重复 + resultCodes.add(initCodes.get(j).clone()); + break; + } else { + sumAdaptiveValue += adaptiveValue[j]; + } + } + } + + return resultCodes; + } + + /** + * 交叉运算 + * + * @param selectedCodes + * 上步骤的选择后的编码 + * @return + */ + private ArrayList crossOperate(ArrayList selectedCodes) { + int randomNum = 0; + // 交叉点 + int crossPoint = 0; + ArrayList resultCodes = new ArrayList<>(); + // 随机编码队列,进行随机交叉配对 + ArrayList randomCodeSeqs = new ArrayList<>(); + + // 进行随机排序 + while (selectedCodes.size() > 0) { + randomNum = random.nextInt(selectedCodes.size()); + + randomCodeSeqs.add(selectedCodes.get(randomNum)); + selectedCodes.remove(randomNum); + } + + int temp = 0; + int[] array1; + int[] array2; + // 进行两两交叉运算 + for (int i = 1; i < randomCodeSeqs.size(); i++) { + if (i % 2 == 1) { + array1 = randomCodeSeqs.get(i - 1); + array2 = randomCodeSeqs.get(i); + crossPoint = random.nextInt(2 * codeNum - 1) + 1; + + // 进行交叉点位置后的编码调换 + for (int j = 0; j < 2 * codeNum; j++) { + if (j >= crossPoint) { + temp = array1[j]; + array1[j] = array2[j]; + array2[j] = temp; + } + } + + // 加入到交叉运算结果中 + resultCodes.add(array1); + resultCodes.add(array2); + } + } + + return resultCodes; + } + + /** + * 变异操作 + * + * @param crossCodes + * 交叉运算后的结果 + * @return + */ + private ArrayList variationOperate(ArrayList crossCodes) { + // 变异点 + int variationPoint = 0; + ArrayList resultCodes = new ArrayList<>(); + + for 
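+ // mutation: flip exactly one randomly chosen bit per individual
+ // (a fixed one-bit mutation, not a per-bit mutation probability)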
(int[] array : crossCodes) { + variationPoint = random.nextInt(codeNum * 2); + + for (int i = 0; i < array.length; i++) { + // 变异点进行变异 + if (i == variationPoint) { + array[i] = (array[i] == 0 ? 1 : 0); + break; + } + } + + resultCodes.add(array); + } + + return resultCodes; + } + + /** + * 数字转为二进制形式 + * + * @param binaryArray + * 转化后的二进制数组形式 + * @param num + * 待转化数字 + */ + private void numToBinaryArray(int[] binaryArray, int num) { + int index = 0; + int temp = 0; + while (num != 0) { + binaryArray[index] = num % 2; + index++; + num /= 2; + } + + //进行数组前和尾部的调换 + for(int i=0; i=0 ; i--, k++) { + if (binaryArray[i] == 1) { + result += Math.pow(2, k); + } + } + + return result; + } + + /** + * 计算个体编码的适值 + * + * @param codeArray + */ + private int calCodeAdaptiveValue(int[] codeArray) { + int result = 0; + int x1 = 0; + int x2 = 0; + int[] array1 = new int[codeNum]; + int[] array2 = new int[codeNum]; + + for (int i = 0, k = 0; i < codeArray.length; i++, k++) { + if (k < codeNum) { + array1[k] = codeArray[i]; + } else { + array2[k - codeNum] = codeArray[i]; + } + } + + // 进行适值的叠加 + x1 = binaryArrayToNum(array1); + x2 = binaryArrayToNum(array2); + result = x1 * x1 + x2 * x2; + + return result; + } + + /** + * 进行遗传算法计算 + */ + public void geneticCal() { + // 最大适值 + int maxFitness; + //迭代遗传次数 + int loopCount = 0; + boolean canExit = false; + ArrayList initCodes; + ArrayList selectedCodes; + ArrayList crossedCodes; + ArrayList variationCodes; + + int[] maxCode = new int[2*codeNum]; + //计算最大适值 + for(int i=0; i<2*codeNum; i++){ + maxCode[i] = 1; + } + maxFitness = calCodeAdaptiveValue(maxCode); + + initCodes = initSets; + while (true) { + for (int[] array : initCodes) { + // 遗传迭代的终止条件为存在编码达到最大适值 + if (maxFitness == calCodeAdaptiveValue(array)) { + canExit = true; + break; + } + } + + if (canExit) { + break; + } + + selectedCodes = selectOperate(initCodes); + crossedCodes = crossOperate(selectedCodes); + variationCodes = variationOperate(crossedCodes); + initCodes = variationCodes; + + loopCount++; + } + + System.out.println("总共遗传进化了" + loopCount +"次" ); + printFinalCodes(initCodes); + } + + /** + * 输出最后的编码集 + * + * @param finalCodes + * 最后的结果编码 + */ + private void printFinalCodes(ArrayList finalCodes) { + int j = 0; + + for (int[] array : finalCodes) { + System.out.print("个体" + (j + 1) + ":"); + for (int i = 0; i < array.length; i++) { + System.out.print(array[i]); + } + System.out.println(); + j++; + } + } + +} diff --git a/Others/DataMining_GA_Maze/Client.java b/Others/DataMining_GA_Maze/Client.java new file mode 100644 index 0000000..0cec9c9 --- /dev/null +++ b/Others/DataMining_GA_Maze/Client.java @@ -0,0 +1,19 @@ +package GA_Maze; + +/** + * 遗传算法在走迷宫游戏的应用 + * @author lyq + * + */ +public class Client { + public static void main(String[] args) { + //迷宫地图文件数据地址 + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\mapData.txt"; + //初始个体数量 + int initSetsNum = 4; + + GATool tool = new GATool(filePath, initSetsNum); + tool.goOutMaze(); + } + +} diff --git a/Others/DataMining_GA_Maze/GATool.java b/Others/DataMining_GA_Maze/GATool.java new file mode 100644 index 0000000..39c8270 --- /dev/null +++ b/Others/DataMining_GA_Maze/GATool.java @@ -0,0 +1,452 @@ +package GA_Maze; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.Random; + +/** + * 遗传算法在走迷宫游戏的应用-遗传算法工具类 + * + * @author lyq + * + */ +public class GATool { + // 迷宫出入口标记 + public static final int 
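+ // cell markers used in mapData.txt: 1 = entrance, 2 = exit,
+ // -1 = obstacle; other values are treated as walkable (see calFitness)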
MAZE_ENTRANCE_POS = 1; + public static final int MAZE_EXIT_POS = 2; + // 方向对应的编码数组 + public static final int[][] MAZE_DIRECTION_CODE = new int[][] { { 0, 0 }, + { 0, 1 }, { 1, 0 }, { 1, 1 }, }; + // 坐标点方向改变 + public static final int[][] MAZE_DIRECTION_CHANGE = new int[][] { + { -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 }, }; + // 方向的文字描述 + public static final String[] MAZE_DIRECTION_LABEL = new String[] { "上", + "下", "左", "右" }; + + // 地图数据文件地址 + private String filePath; + // 走迷宫的最短步数 + private int stepNum; + // 初始个体的数量 + private int initSetsNum; + // 迷宫入口位置 + private int[] startPos; + // 迷宫出口位置 + private int[] endPos; + // 迷宫地图数据 + private int[][] mazeData; + // 初始个体集 + private ArrayList initSets; + // 随机数产生器 + private Random random; + + public GATool(String filePath, int initSetsNum) { + this.filePath = filePath; + this.initSetsNum = initSetsNum; + + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + public void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + int rowNum = dataArray.size(); + mazeData = new int[rowNum][rowNum]; + for (int i = 0; i < rowNum; i++) { + String[] data = dataArray.get(i); + for (int j = 0; j < data.length; j++) { + mazeData[i][j] = Integer.parseInt(data[j]); + + // 赋值入口和出口位置 + if (mazeData[i][j] == MAZE_ENTRANCE_POS) { + startPos = new int[2]; + startPos[0] = i; + startPos[1] = j; + } else if (mazeData[i][j] == MAZE_EXIT_POS) { + endPos = new int[2]; + endPos[0] = i; + endPos[1] = j; + } + } + } + + // 计算走出迷宫的最短步数 + stepNum = Math.abs(startPos[0] - endPos[0]) + + Math.abs(startPos[1] - endPos[1]); + } + + /** + * 产生初始数据集 + */ + private void produceInitSet() { + // 方向编码 + int directionCode = 0; + random = new Random(); + initSets = new ArrayList<>(); + // 每个步骤的操作需要用2位数字表示 + int[] codeNum; + + for (int i = 0; i < initSetsNum; i++) { + codeNum = new int[stepNum * 2]; + for (int j = 0; j < stepNum; j++) { + directionCode = random.nextInt(4); + codeNum[2 * j] = MAZE_DIRECTION_CODE[directionCode][0]; + codeNum[2 * j + 1] = MAZE_DIRECTION_CODE[directionCode][1]; + } + + initSets.add(codeNum); + } + } + + /** + * 选择操作,把适值较高的个体优先遗传到下一代 + * + * @param initCodes + * 初始个体编码 + * @return + */ + private ArrayList selectOperate(ArrayList initCodes) { + double randomNum = 0; + double sumFitness = 0; + ArrayList resultCodes = new ArrayList<>(); + double[] adaptiveValue = new double[initSetsNum]; + + for (int i = 0; i < initSetsNum; i++) { + adaptiveValue[i] = calFitness(initCodes.get(i)); + sumFitness += adaptiveValue[i]; + } + + // 转成概率的形式,做归一化操作 + for (int i = 0; i < initSetsNum; i++) { + adaptiveValue[i] = adaptiveValue[i] / sumFitness; + } + + for (int i = 0; i < initSetsNum; i++) { + randomNum = random.nextInt(100) + 1; + randomNum = randomNum / 100; + //因为1.0是无法判断到的,,总和会无限接近1.0取为0.99做判断 + if(randomNum == 1){ + randomNum = randomNum - 0.01; + } + + sumFitness = 0; + // 确定区间 + for (int j = 0; j < initSetsNum; j++) { + if (randomNum > sumFitness + && randomNum <= sumFitness + adaptiveValue[j]) { + // 采用拷贝的方式避免引用重复 + resultCodes.add(initCodes.get(j).clone()); + break; + } else { + sumFitness += adaptiveValue[j]; + } + } + } + + return resultCodes; + } + + /** + * 交叉运算 + * + * @param selectedCodes + * 上步骤的选择后的编码 + * @return + */ + private ArrayList 
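+ // single-point crossover on step boundaries: crossPoint is a gene (step)
+ // index in [1, stepNum - 1], and the 2-bit-per-step tail from position
+ // 2 * crossPoint onward is swapped within each mated pair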
crossOperate(ArrayList selectedCodes) { + int randomNum = 0; + // 交叉点 + int crossPoint = 0; + ArrayList resultCodes = new ArrayList<>(); + // 随机编码队列,进行随机交叉配对 + ArrayList randomCodeSeqs = new ArrayList<>(); + + // 进行随机排序 + while (selectedCodes.size() > 0) { + randomNum = random.nextInt(selectedCodes.size()); + + randomCodeSeqs.add(selectedCodes.get(randomNum)); + selectedCodes.remove(randomNum); + } + + int temp = 0; + int[] array1; + int[] array2; + // 进行两两交叉运算 + for (int i = 1; i < randomCodeSeqs.size(); i++) { + if (i % 2 == 1) { + array1 = randomCodeSeqs.get(i - 1); + array2 = randomCodeSeqs.get(i); + crossPoint = random.nextInt(stepNum - 1) + 1; + + // 进行交叉点位置后的编码调换 + for (int j = 0; j < 2 * stepNum; j++) { + if (j >= 2 * crossPoint) { + temp = array1[j]; + array1[j] = array2[j]; + array2[j] = temp; + } + } + + // 加入到交叉运算结果中 + resultCodes.add(array1); + resultCodes.add(array2); + } + } + + return resultCodes; + } + + /** + * 变异操作 + * + * @param crossCodes + * 交叉运算后的结果 + * @return + */ + private ArrayList variationOperate(ArrayList crossCodes) { + // 变异点 + int variationPoint = 0; + ArrayList resultCodes = new ArrayList<>(); + + for (int[] array : crossCodes) { + variationPoint = random.nextInt(stepNum); + + for (int i = 0; i < array.length; i += 2) { + // 变异点进行变异 + if (i % 2 == 0 && i / 2 == variationPoint) { + array[i] = (array[i] == 0 ? 1 : 0); + array[i + 1] = (array[i + 1] == 0 ? 1 : 0); + break; + } + } + + resultCodes.add(array); + } + + return resultCodes; + } + + /** + * 根据编码计算适值 + * + * @param code + * 当前的编码 + * @return + */ + public double calFitness(int[] code) { + double fintness = 0; + // 由编码计算所得的终点横坐标 + int endX = 0; + // 由编码计算所得的终点纵坐标 + int endY = 0; + // 基于片段所代表的行走方向 + int direction = 0; + // 临时坐标点横坐标 + int tempX = 0; + // 临时坐标点纵坐标 + int tempY = 0; + + endX = startPos[0]; + endY = startPos[1]; + for (int i = 0; i < stepNum; i++) { + direction = binaryArrayToNum(new int[] { code[2 * i], + code[2 * i + 1] }); + + // 根据方向改变数组做坐标点的改变 + tempX = endX + MAZE_DIRECTION_CHANGE[direction][0]; + tempY = endY + MAZE_DIRECTION_CHANGE[direction][1]; + + // 判断坐标点是否越界 + if (tempX >= 0 && tempX < mazeData.length && tempY >= 0 + && tempY < mazeData[0].length) { + // 判断坐标点是否走到阻碍块 + if (mazeData[tempX][tempY] != -1) { + endX = tempX; + endY = tempY; + } + } + } + + // 根据适值函数进行适值的计算 + fintness = 1.0 / (Math.abs(endX - endPos[0]) + + Math.abs(endY - endPos[1]) + 1); + + return fintness; + } + + /** + * 根据当前编码判断是否已经找到出口位置 + * + * @param code + * 经过若干次遗传的编码 + * @return + */ + private boolean ifArriveEndPos(int[] code) { + boolean isArrived = false; + // 由编码计算所得的终点横坐标 + int endX = 0; + // 由编码计算所得的终点纵坐标 + int endY = 0; + // 基于片段所代表的行走方向 + int direction = 0; + // 临时坐标点横坐标 + int tempX = 0; + // 临时坐标点纵坐标 + int tempY = 0; + + endX = startPos[0]; + endY = startPos[1]; + for (int i = 0; i < stepNum; i++) { + direction = binaryArrayToNum(new int[] { code[2 * i], + code[2 * i + 1] }); + + // 根据方向改变数组做坐标点的改变 + tempX = endX + MAZE_DIRECTION_CHANGE[direction][0]; + tempY = endY + MAZE_DIRECTION_CHANGE[direction][1]; + + // 判断坐标点是否越界 + if (tempX >= 0 && tempX < mazeData.length && tempY >= 0 + && tempY < mazeData[0].length) { + // 判断坐标点是否走到阻碍块 + if (mazeData[tempX][tempY] != -1) { + endX = tempX; + endY = tempY; + } + } + } + + if (endX == endPos[0] && endY == endPos[1]) { + isArrived = true; + } + + return isArrived; + } + + /** + * 二进制数组转化为数字 + * + * @param binaryArray + * 待转化二进制数组 + */ + private int binaryArrayToNum(int[] binaryArray) { + int result = 0; + + for (int i = binaryArray.length - 1, k = 0; i >= 
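+ /*
+ * Worked example of calFitness above, using the 5x5 map shipped in
+ * mapData.txt: the entrance (cell value 1) is at (4,4) and the exit
+ * (cell value 2) at (1,0), so stepNum = |4-1| + |4-0| = 7. A
+ * chromosome whose decoded walk ends at (endX, endY) scores
+ * 1 / (|endX-1| + |endY-0| + 1): ending one cell away from the exit
+ * gives 1/2, and reaching it exactly gives the maximum fitness 1.0.
+ */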
0; i--, k++) { + if (binaryArray[i] == 1) { + result += Math.pow(2, k); + } + } + + return result; + } + + /** + * 进行遗传算法走出迷宫 + */ + public void goOutMaze() { + // 迭代遗传次数 + int loopCount = 0; + boolean canExit = false; + // 结果路径 + int[] resultCode = null; + ArrayList initCodes; + ArrayList selectedCodes; + ArrayList crossedCodes; + ArrayList variationCodes; + + // 产生初始数据集 + produceInitSet(); + initCodes = initSets; + + while (true) { + for (int[] array : initCodes) { + // 遗传迭代的终止条件为是否找到出口位置 + if (ifArriveEndPos(array)) { + resultCode = array; + canExit = true; + break; + } + } + + if (canExit) { + break; + } + + selectedCodes = selectOperate(initCodes); + crossedCodes = crossOperate(selectedCodes); + variationCodes = variationOperate(crossedCodes); + initCodes = variationCodes; + + loopCount++; + + //如果遗传次数超过100次,则退出 + if(loopCount >= 100){ + break; + } + } + + System.out.println("总共遗传进化了" + loopCount + "次"); + printFindedRoute(resultCode); + } + + /** + * 输出找到的路径 + * + * @param code + */ + private void printFindedRoute(int[] code) { + if(code == null){ + System.out.println("在有限的遗传进化次数内,没有找到最优路径"); + return; + } + + int tempX = startPos[0]; + int tempY = startPos[1]; + int direction = 0; + + System.out.println(MessageFormat.format( + "起始点位置({0},{1}), 出口点位置({2}, {3})", tempX, tempY, endPos[0], + endPos[1])); + + System.out.print("搜索到的结果编码:"); + for(int value: code){ + System.out.print("" + value); + } + System.out.println(); + + for (int i = 0, k = 1; i < code.length; i += 2, k++) { + direction = binaryArrayToNum(new int[] { code[i], code[i + 1] }); + + tempX += MAZE_DIRECTION_CHANGE[direction][0]; + tempY += MAZE_DIRECTION_CHANGE[direction][1]; + + System.out.println(MessageFormat.format( + "第{0}步,编码为{1}{2},向{3}移动,移动后到达({4},{5})", k, code[i], code[i+1], + MAZE_DIRECTION_LABEL[direction], tempX, tempY)); + } + } + +} diff --git a/Others/DataMining_GA_Maze/mapData.txt b/Others/DataMining_GA_Maze/mapData.txt new file mode 100644 index 0000000..e3566d7 --- /dev/null +++ b/Others/DataMining_GA_Maze/mapData.txt @@ -0,0 +1,5 @@ +0 0 0 0 0 +2 0 0 -1 0 +0 0 0 0 0 +0 -1 0 0 -1 +0 0 0 0 1 \ No newline at end of file diff --git a/Others/DataMining_KDTree/Client.java b/Others/DataMining_KDTree/Client.java new file mode 100644 index 0000000..bba7377 --- /dev/null +++ b/Others/DataMining_KDTree/Client.java @@ -0,0 +1,36 @@ +package DataMining_KDTree; + +import java.text.MessageFormat; + +/** + * KD树算法测试类 + * + * @author lyq + * + */ +public class Client { + public static void main(String[] args) { + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + Point queryNode; + Point searchedNode; + KDTreeTool tool = new KDTreeTool(filePath); + + // 进行KD树的构建 + tool.createKDTree(); + + // 通过KD树进行数据点的最近点查询 + queryNode = new Point(2.1, 3.1); + searchedNode = tool.searchNearestData(queryNode); + System.out.println(MessageFormat.format( + "距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y, + searchedNode.x, searchedNode.y)); + + //重新构造KD树,去除之前的访问记录 + tool.createKDTree(); + queryNode = new Point(2, 4.5); + searchedNode = tool.searchNearestData(queryNode); + System.out.println(MessageFormat.format( + "距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y, + searchedNode.x, searchedNode.y)); + } +} diff --git a/Others/DataMining_KDTree/KDTreeTool.java b/Others/DataMining_KDTree/KDTreeTool.java new file mode 100644 index 0000000..0b5a53c --- /dev/null +++ b/Others/DataMining_KDTree/KDTreeTool.java @@ -0,0 +1,386 @@ +package DataMining_KDTree; + +import java.io.BufferedReader; +import 
java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Stack; + +/** + * KD树-k维空间关键数据检索算法工具类 + * + * @author lyq + * + */ +public class KDTreeTool { + // 空间平面的方向 + public static final int DIRECTION_X = 0; + public static final int DIRECTION_Y = 1; + + // 输入的测试数据坐标点文件 + private String filePath; + // 原始所有数据点数据 + private ArrayList totalDatas; + // KD树根节点 + private TreeNode rootNode; + + public KDTreeTool(String filePath) { + this.filePath = filePath; + + readDataFile(); + } + + /** + * 从文件中读取数据 + */ + private void readDataFile() { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + Point p; + totalDatas = new ArrayList<>(); + for (String[] array : dataArray) { + p = new Point(array[0], array[1]); + totalDatas.add(p); + } + } + + /** + * 创建KD树 + * + * @return + */ + public TreeNode createKDTree() { + ArrayList copyDatas; + + rootNode = new TreeNode(); + // 根据节点开始时所表示的空间时无限大的 + rootNode.range = new Range(); + copyDatas = (ArrayList) totalDatas.clone(); + recusiveConstructNode(rootNode, copyDatas); + + return rootNode; + } + + /** + * 递归进行KD树的构造 + * + * @param node + * 当前正在构造的节点 + * @param datas + * 该节点对应的正在处理的数据 + * @return + */ + private void recusiveConstructNode(TreeNode node, ArrayList datas) { + int direction = 0; + ArrayList leftSideDatas; + ArrayList rightSideDatas; + Point p; + TreeNode leftNode; + TreeNode rightNode; + Range range; + Range range2; + + // 如果划分的数据点集合只有1个数据,则不再划分 + if (datas.size() == 1) { + node.nodeData = datas.get(0); + return; + } + + // 首先在当前的数据点集合中进行分割方向的选择 + direction = selectSplitDrc(datas); + // 根据方向取出中位数点作为数据矢量 + p = getMiddlePoint(datas, direction); + + node.spilt = direction; + node.nodeData = p; + + leftSideDatas = getLeftSideDatas(datas, p, direction); + datas.removeAll(leftSideDatas); + // 还要去掉自身 + datas.remove(p); + rightSideDatas = datas; + + if (leftSideDatas.size() > 0) { + leftNode = new TreeNode(); + leftNode.parentNode = node; + range2 = Range.initLeftRange(p, direction); + // 获取父节点的空间矢量,进行交集运算做范围拆分 + range = node.range.crossOperation(range2); + leftNode.range = range; + + node.leftNode = leftNode; + recusiveConstructNode(leftNode, leftSideDatas); + } + + if (rightSideDatas.size() > 0) { + rightNode = new TreeNode(); + rightNode.parentNode = node; + range2 = Range.initRightRange(p, direction); + // 获取父节点的空间矢量,进行交集运算做范围拆分 + range = node.range.crossOperation(range2); + rightNode.range = range; + + node.rightNode = rightNode; + recusiveConstructNode(rightNode, rightSideDatas); + } + } + + /** + * 搜索出给定数据点的最近点 + * + * @param p + * 待比较坐标点 + */ + public Point searchNearestData(Point p) { + // 节点距离给定数据点的距离 + TreeNode nearestNode = null; + // 用栈记录遍历过的节点 + Stack stackNodes; + + stackNodes = new Stack<>(); + findedNearestLeafNode(p, rootNode, stackNodes); + + // 取出叶子节点,作为当前找到的最近节点 + nearestNode = stackNodes.pop(); + nearestNode = dfsSearchNodes(stackNodes, p, nearestNode); + + return nearestNode.nodeData; + } + + /** + * 深度优先的方式进行最近点的查找 + * + * @param stack + * KD树节点栈 + * @param desPoint + * 给定的数据点 + * @param nearestNode + * 当前找到的最近节点 + * @return + */ + private TreeNode dfsSearchNodes(Stack stack, Point desPoint, + TreeNode nearestNode) 
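+ /*
+ * Note on the backtracking below: after descending to a leaf, the
+ * search walks back up the recorded stack. A parent's other subtree
+ * can only contain a closer point if the circle centered at the query
+ * point, with radius equal to the current best distance, crosses the
+ * parent's splitting line; that is what the
+ * |split coordinate - query coordinate| < minDis test checks before
+ * descending into the other side.
+ */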
{ + // 是否碰到父节点边界 + boolean isCollision; + double minDis; + double dis; + TreeNode parentNode; + + // 如果栈内节点已经全部弹出,则遍历结束 + if (stack.isEmpty()) { + return nearestNode; + } + + // 获取父节点 + parentNode = stack.pop(); + + minDis = desPoint.ouDistance(nearestNode.nodeData); + dis = desPoint.ouDistance(parentNode.nodeData); + + // 如果与当前回溯到的父节点距离更短,则搜索到的节点进行更新 + if (dis < minDis) { + minDis = dis; + nearestNode = parentNode; + } + + // 默认没有碰撞到 + isCollision = false; + // 判断是否触碰到了父节点的空间分割线 + if (parentNode.spilt == DIRECTION_X) { + if (parentNode.nodeData.x > desPoint.x - minDis + && parentNode.nodeData.x < desPoint.x + minDis) { + isCollision = true; + } + } else { + if (parentNode.nodeData.y > desPoint.y - minDis + && parentNode.nodeData.y < desPoint.y + minDis) { + isCollision = true; + } + } + + // 如果触碰到父边界了,并且此节点的孩子节点还未完全遍历完,则可以继续遍历 + if (isCollision + && (!parentNode.leftNode.isVisited || !parentNode.rightNode.isVisited)) { + TreeNode newNode; + // 新建当前的小局部节点栈 + Stack otherStack = new Stack<>(); + // 从parentNode的树以下继续寻找 + findedNearestLeafNode(desPoint, parentNode, otherStack); + newNode = dfsSearchNodes(otherStack, desPoint, otherStack.pop()); + + dis = newNode.nodeData.ouDistance(desPoint); + if (dis < minDis) { + nearestNode = newNode; + } + } + + // 继续往上回溯 + nearestNode = dfsSearchNodes(stack, desPoint, nearestNode); + + return nearestNode; + } + + /** + * 找到与所给定节点的最近的叶子节点 + * + * @param p + * 待比较节点 + * @param node + * 当前搜索到的节点 + * @param stack + * 遍历过的节点栈 + */ + private void findedNearestLeafNode(Point p, TreeNode node, + Stack stack) { + // 分割方向 + int splitDic; + + // 将遍历过的节点加入栈中 + stack.push(node); + // 标记为访问过 + node.isVisited = true; + // 如果此节点没有左右孩子节点说明已经是叶子节点了 + if (node.leftNode == null && node.rightNode == null) { + return; + } + + splitDic = node.spilt; + // 选择一个符合分割范围的节点继续递归搜寻 + if ((splitDic == DIRECTION_X && p.x < node.nodeData.x) + || (splitDic == DIRECTION_Y && p.y < node.nodeData.y)) { + if (!node.leftNode.isVisited) { + findedNearestLeafNode(p, node.leftNode, stack); + } else { + // 如果左孩子节点已经访问过,则访问另一边 + findedNearestLeafNode(p, node.rightNode, stack); + } + } else if ((splitDic == DIRECTION_X && p.x > node.nodeData.x) + || (splitDic == DIRECTION_Y && p.y > node.nodeData.y)) { + if (!node.rightNode.isVisited) { + findedNearestLeafNode(p, node.rightNode, stack); + } else { + // 如果右孩子节点已经访问过,则访问另一边 + findedNearestLeafNode(p, node.leftNode, stack); + } + } + } + + /** + * 根据给定的数据点通过计算反差选择的分割点 + * + * @param datas + * 部分的集合点集合 + * @return + */ + private int selectSplitDrc(ArrayList datas) { + int direction = 0; + double avgX = 0; + double avgY = 0; + double varianceX = 0; + double varianceY = 0; + + for (Point p : datas) { + avgX += p.x; + avgY += p.y; + } + + avgX /= datas.size(); + avgY /= datas.size(); + + for (Point p : datas) { + varianceX += (p.x - avgX) * (p.x - avgX); + varianceY += (p.y - avgY) * (p.y - avgY); + } + + // 求最后的方差 + varianceX /= datas.size(); + varianceY /= datas.size(); + + // 通过比较方差的大小决定分割方向,选择波动较大的进行划分 + direction = varianceX > varianceY ? 
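+ /*
+ * Worked example with the six points in input.txt: (4,7), (5,4),
+ * (9,6), (7,2), (2,3), (8,1). The x values have mean 5.83 and
+ * variance ~5.81, while the y values have mean 3.83 and variance
+ * ~4.47, so the split is made on x. Sorted by x the coordinates are
+ * 2,4,5,7,8,9, and the element at index 6/2 = 3 is (7,2), which
+ * becomes the split node.
+ */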
DIRECTION_X : DIRECTION_Y;
+
+ return direction;
+ }
+
+ /**
+ * Sorts the points along the given coordinate direction and picks the
+ * median point
+ *
+ * @param datas
+ * the point set
+ * @param dir
+ * the coordinate direction to sort by
+ */
+ private Point getMiddlePoint(ArrayList<Point> datas, int dir) {
+ int index = 0;
+ Point middlePoint;
+
+ index = datas.size() / 2;
+ if (dir == DIRECTION_X) {
+ Collections.sort(datas, new Comparator<Point>() {
+
+ @Override
+ public int compare(Point o1, Point o2) {
+ return o1.x.compareTo(o2.x);
+ }
+ });
+ } else {
+ Collections.sort(datas, new Comparator<Point>() {
+
+ @Override
+ public int compare(Point o1, Point o2) {
+ return o1.y.compareTo(o2.y);
+ }
+ });
+ }
+
+ // take the median element
+ middlePoint = datas.get(index);
+
+ return middlePoint;
+ }
+
+ /**
+ * Collects the points that lie on the left side of the split point for
+ * the given direction
+ *
+ * @param datas
+ * the original point set
+ * @param nodeData
+ * the split point
+ * @param dir
+ * the split direction
+ * @return
+ */
+ private ArrayList<Point> getLeftSideDatas(ArrayList<Point> datas,
+ Point nodeData, int dir) {
+ ArrayList<Point> leftSideDatas = new ArrayList<>();
+
+ for (Point p : datas) {
+ if (dir == DIRECTION_X && p.x < nodeData.x) {
+ leftSideDatas.add(p);
+ } else if (dir == DIRECTION_Y && p.y < nodeData.y) {
+ leftSideDatas.add(p);
+ }
+ }
+
+ return leftSideDatas;
+ }
+}
diff --git a/Others/DataMining_KDTree/Point.java b/Others/DataMining_KDTree/Point.java
new file mode 100644
index 0000000..c98a770
--- /dev/null
+++ b/Others/DataMining_KDTree/Point.java
@@ -0,0 +1,58 @@
+package DataMining_KDTree;
+
+/**
+ * A coordinate point
+ *
+ * @author lyq
+ *
+ */
+public class Point{
+ // x coordinate of the point
+ Double x;
+ // y coordinate of the point
+ Double y;
+
+ public Point(double x, double y){
+ this.x = x;
+ this.y = y;
+ }
+
+ public Point(String x, String y) {
+ this.x = (Double.parseDouble(x));
+ this.y = (Double.parseDouble(y));
+ }
+
+ /**
+ * Computes the Euclidean distance between this point and the given point
+ *
+ * @param p
+ * the point to measure against
+ * @return
+ */
+ public double ouDistance(Point p) {
+ double distance = 0;
+
+ distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y)
+ * (this.y - p.y);
+ distance = Math.sqrt(distance);
+
+ return distance;
+ }
+
+ /**
+ * Judges whether two points are the same point
+ *
+ * @param p
+ * the point to compare with
+ * @return
+ */
+ public boolean isTheSame(Point p) {
+ boolean isSamed = false;
+
+ // compare the underlying values; == on the boxed Doubles would only
+ // test object identity
+ if (this.x.doubleValue() == p.x.doubleValue()
+ && this.y.doubleValue() == p.y.doubleValue()) {
+ isSamed = true;
+ }
+
+ return isSamed;
+ }
+}
diff --git a/Others/DataMining_KDTree/Range.java b/Others/DataMining_KDTree/Range.java
new file mode 100644
index 0000000..b36d3d3
--- /dev/null
+++ b/Others/DataMining_KDTree/Range.java
@@ -0,0 +1,114 @@
+package DataMining_KDTree;
+
+/**
+ * A space vector describing the spatial range a node covers
+ *
+ * @author lyq
+ *
+ */
+public class Range {
+ // left boundary
+ double left;
+ // right boundary
+ double right;
+ // top boundary
+ double top;
+ // bottom boundary
+ double bottom;
+
+ public Range() {
+ this.left = -Integer.MAX_VALUE;
+ this.right = Integer.MAX_VALUE;
+ this.top = Integer.MAX_VALUE;
+ this.bottom = -Integer.MAX_VALUE;
+ }
+
+ public Range(int left, int right, int top, int bottom) {
+ this.left = left;
+ this.right = right;
+ this.top = top;
+ this.bottom = bottom;
+ }
+
+ /**
+ * Intersects this range with another range
+ *
+ * @param r
+ * @return
+ */
+ public Range crossOperation(Range r) {
+ Range range = new Range();
+
+ // take the rightmost of the two left boundaries
+ if (r.left > this.left) {
+ range.left = r.left;
+ } else {
+ range.left = this.left;
+ }
+
+ // take the leftmost of the two right boundaries
+ if (r.right < this.right) {
+ range.right = r.right;
+ } else {
+ range.right = this.right;
+ }
+
+ // take the lower of the two top boundaries
+ if (r.top < this.top) {
+ range.top = r.top;
+ } else {
+ range.top = this.top;
+ }
+
+ // take the higher of the two bottom boundaries
+ if (r.bottom > this.bottom) {
+ range.bottom = r.bottom;
+ } else {
range.bottom = this.bottom; + } + + return range; + } + + /** + * 根据坐标点分割方向确定左侧空间矢量 + * + * @param p + * 数据矢量 + * @param dir + * 分割方向 + * @return + */ + public static Range initLeftRange(Point p, int dir) { + Range range = new Range(); + + if (dir == KDTreeTool.DIRECTION_X) { + range.right = p.x; + } else { + range.bottom = p.y; + } + + return range; + } + + /** + * 根据坐标点分割方向确定右侧空间矢量 + * + * @param p + * 数据矢量 + * @param dir + * 分割方向 + * @return + */ + public static Range initRightRange(Point p, int dir) { + Range range = new Range(); + + if (dir == KDTreeTool.DIRECTION_X) { + range.left = p.x; + } else { + range.top = p.y; + } + + return range; + } +} diff --git a/Others/DataMining_KDTree/TreeNode.java b/Others/DataMining_KDTree/TreeNode.java new file mode 100644 index 0000000..127833c --- /dev/null +++ b/Others/DataMining_KDTree/TreeNode.java @@ -0,0 +1,27 @@ +package DataMining_KDTree; + +/** + * KD树节点 + * @author lyq + * + */ +public class TreeNode { + //数据矢量 + Point nodeData; + //分割平面的分割线 + int spilt; + //空间矢量,该节点所表示的空间范围 + Range range; + //父节点 + TreeNode parentNode; + //位于分割超平面左侧的孩子节点 + TreeNode leftNode; + //位于分割超平面右侧的孩子节点 + TreeNode rightNode; + //节点是否被访问过,用于回溯时使用 + boolean isVisited; + + public TreeNode(){ + this.isVisited = false; + } +} diff --git a/Others/DataMining_KDTree/input.txt b/Others/DataMining_KDTree/input.txt new file mode 100644 index 0000000..f7d49f3 --- /dev/null +++ b/Others/DataMining_KDTree/input.txt @@ -0,0 +1,6 @@ +4 7 +5 4 +9 6 +7 2 +2 3 +8 1 diff --git a/Others/DataMining_MSApriori/Client.java b/Others/DataMining_MSApriori/Client.java new file mode 100644 index 0000000..f49e83d --- /dev/null +++ b/Others/DataMining_MSApriori/Client.java @@ -0,0 +1,45 @@ +package DataMining_MSApriori; + +/** + * 基于多支持度的Apriori算法测试类 + * @author lyq + * + */ +public class Client { + public static void main(String[] args){ + //是否是事务型数据 + boolean isTransaction; + //测试数据文件地址 + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + //关系表型数据文件地址 + String tableFilePath = "C:\\Users\\lyq\\Desktop\\icon\\input2.txt"; + //最小支持度阈值 + double minSup; + // 最小置信度率 + double minConf; + //最大支持度差别阈值 + double delta; + //多项目的最小支持度数,括号中的下标代表的是商品的ID + double[] mis; + //msApriori算法工具类 + MSAprioriTool tool; + + //为了测试的方便,取一个偏低的置信度值0.3 + minConf = 0.3; + minSup = 0.1; + delta = 0.5; + //每项的支持度率都默认为0.1,第一项不使用 + mis = new double[]{-1, 0.1, 0.1, 0.1, 0.1, 0.1}; + isTransaction = true; + + isTransaction = true; + tool = new MSAprioriTool(filePath, minConf, delta, mis, isTransaction); + tool.calFItems(); + System.out.println(); + + isTransaction = false; + //重新初始化数据 + tool = new MSAprioriTool(tableFilePath, minConf, minSup, isTransaction); + tool.calFItems(); + } +} diff --git a/Others/DataMining_MSApriori/FrequentItem.java b/Others/DataMining_MSApriori/FrequentItem.java new file mode 100644 index 0000000..2ba88c4 --- /dev/null +++ b/Others/DataMining_MSApriori/FrequentItem.java @@ -0,0 +1,56 @@ +package DataMining_MSApriori; + +/** + * 频繁项集 + * + * @author lyq + * + */ +public class FrequentItem implements Comparable{ + // 频繁项集的集合ID + private String[] idArray; + // 频繁项集的支持度计数 + private int count; + //频繁项集的长度,1项集或是2项集,亦或是3项集 + private int length; + + public FrequentItem(String[] idArray, int count){ + this.idArray = idArray; + this.count = count; + length = idArray.length; + } + + public String[] getIdArray() { + return idArray; + } + + public void setIdArray(String[] idArray) { + this.idArray = idArray; + } + + public int getCount() { + return count; + } + + public void setCount(int count) { + 
this.count = count; + } + + public int getLength() { + return length; + } + + public void setLength(int length) { + this.length = length; + } + + @Override + public int compareTo(FrequentItem o) { + // TODO Auto-generated method stub + Integer int1 = Integer.parseInt(this.getIdArray()[0]); + Integer int2 = Integer.parseInt(o.getIdArray()[0]); + + return int1.compareTo(int2); + } + +} diff --git a/Others/DataMining_MSApriori/MSAprioriTool.java b/Others/DataMining_MSApriori/MSAprioriTool.java new file mode 100644 index 0000000..ba5d444 --- /dev/null +++ b/Others/DataMining_MSApriori/MSAprioriTool.java @@ -0,0 +1,780 @@ +package DataMining_MSApriori; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import DataMining_Apriori.FrequentItem; + +/** + * 基于多支持度的Apriori算法工具类 + * + * @author lyq + * + */ +public class MSAprioriTool { + // 前件判断的结果值,用于关联规则的推导 + public static final int PREFIX_NOT_SUB = -1; + public static final int PREFIX_EQUAL = 1; + public static final int PREFIX_IS_SUB = 2; + + // 是否读取的是事务型数据 + private boolean isTransaction; + // 最大频繁k项集的k值 + private int initFItemNum; + // 事务数据文件地址 + private String filePath; + // 最小支持度阈值 + private double minSup; + // 最小置信度率 + private double minConf; + // 最大支持度差别阈值 + private double delta; + // 多项目的最小支持度数,括号中的下标代表的是商品的ID + private double[] mis; + // 每个事务中的商品ID + private ArrayList totalGoodsIDs; + // 关系表数据所转化的事务数据 + private ArrayList transactionDatas; + // 过程中计算出来的所有频繁项集列表 + private ArrayList resultItem; + // 过程中计算出来频繁项集的ID集合 + private ArrayList resultItemID; + // 属性到数字的映射图 + private HashMap attr2Num; + // 数字id对应属性的映射图 + private HashMap num2Attr; + // 频繁项集所覆盖的id数值 + private Map fItem2Id; + + /** + * 事务型数据关联挖掘算法 + * + * @param filePath + * @param minConf + * @param delta + * @param mis + * @param isTransaction + */ + public MSAprioriTool(String filePath, double minConf, double delta, + double[] mis, boolean isTransaction) { + this.filePath = filePath; + this.minConf = minConf; + this.delta = delta; + this.mis = mis; + this.isTransaction = isTransaction; + this.fItem2Id = new HashMap<>(); + + readDataFile(); + } + + /** + * 非事务型关联挖掘 + * + * @param filePath + * @param minConf + * @param minSup + * @param isTransaction + */ + public MSAprioriTool(String filePath, double minConf, double minSup, + boolean isTransaction) { + this.filePath = filePath; + this.minConf = minConf; + this.minSup = minSup; + this.isTransaction = isTransaction; + this.delta = 1.0; + this.fItem2Id = new HashMap<>(); + + readRDBMSData(filePath); + } + + /** + * 从文件中读取数据 + */ + private void readDataFile() { + String[] temp = null; + ArrayList dataArray; + + dataArray = readLine(filePath); + totalGoodsIDs = new ArrayList<>(); + + for (String[] array : dataArray) { + temp = new String[array.length - 1]; + System.arraycopy(array, 1, temp, 0, array.length - 1); + + // 将事务ID加入列表吧中 + totalGoodsIDs.add(temp); + } + } + + /** + * 从文件中逐行读数据 + * + * @param filePath + * 数据文件地址 + * @return + */ + private ArrayList readLine(String filePath) { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + 
+ return dataArray; + } + + /** + * 计算频繁项集 + */ + public void calFItems() { + FrequentItem fItem; + + computeLink(); + printFItems(); + + if (isTransaction) { + fItem = resultItem.get(resultItem.size() - 1); + // 取出最后一个频繁项集做关联规则的推导 + System.out.println("最后一个频繁项集做关联规则的推导结果:"); + printAttachRuls(fItem.getIdArray()); + } + } + + /** + * 输出频繁项集 + */ + private void printFItems() { + if (isTransaction) { + System.out.println("事务型数据频繁项集输出结果:"); + } else { + System.out.println("非事务(关系)型数据频繁项集输出结果:"); + } + + // 输出频繁项集 + for (int k = 1; k <= initFItemNum; k++) { + System.out.println("频繁" + k + "项集:"); + for (FrequentItem i : resultItem) { + if (i.getLength() == k) { + System.out.print("{"); + for (String t : i.getIdArray()) { + if (!isTransaction) { + // 如果原本是非事务型数据,需要重新做替换 + t = num2Attr.get(Integer.parseInt(t)); + } + + System.out.print(t + ","); + } + System.out.print("},"); + } + } + System.out.println(); + } + } + + /** + * 项集进行连接运算 + */ + private void computeLink() { + // 连接计算的终止数,k项集必须算到k-1子项集为止 + int endNum = 0; + // 当前已经进行连接运算到几项集,开始时就是1项集 + int currentNum = 1; + // 商品,1频繁项集映射图 + HashMap itemMap = new HashMap<>(); + FrequentItem tempItem; + // 初始列表 + ArrayList list = new ArrayList<>(); + // 经过连接运算后产生的结果项集 + resultItem = new ArrayList<>(); + resultItemID = new ArrayList<>(); + // 商品ID的种类 + ArrayList idType = new ArrayList<>(); + for (String[] a : totalGoodsIDs) { + for (String s : a) { + if (!idType.contains(s)) { + tempItem = new FrequentItem(new String[] { s }, 1); + idType.add(s); + resultItemID.add(new String[] { s }); + } else { + // 支持度计数加1 + tempItem = itemMap.get(s); + tempItem.setCount(tempItem.getCount() + 1); + } + itemMap.put(s, tempItem); + } + } + // 将初始频繁项集转入到列表中,以便继续做连接运算 + for (Map.Entry entry : itemMap.entrySet()) { + tempItem = entry.getValue(); + + // 判断1频繁项集是否满足支持度阈值的条件 + if (judgeFItem(tempItem.getIdArray())) { + list.add(tempItem); + } + } + + // 按照商品ID进行排序,否则连接计算结果将会不一致,将会减少 + Collections.sort(list); + resultItem.addAll(list); + + String[] array1; + String[] array2; + String[] resultArray; + ArrayList tempIds; + ArrayList resultContainer; + // 总共要算到endNum项集 + endNum = list.size() - 1; + initFItemNum = list.size() - 1; + + while (currentNum < endNum) { + resultContainer = new ArrayList<>(); + for (int i = 0; i < list.size() - 1; i++) { + tempItem = list.get(i); + array1 = tempItem.getIdArray(); + + for (int j = i + 1; j < list.size(); j++) { + tempIds = new ArrayList<>(); + array2 = list.get(j).getIdArray(); + + for (int k = 0; k < array1.length; k++) { + // 如果对应位置上的值相等的时候,只取其中一个值,做了一个连接删除操作 + if (array1[k].equals(array2[k])) { + tempIds.add(array1[k]); + } else { + tempIds.add(array1[k]); + tempIds.add(array2[k]); + } + } + + resultArray = new String[tempIds.size()]; + tempIds.toArray(resultArray); + + boolean isContain = false; + // 过滤不符合条件的的ID数组,包括重复的和长度不符合要求的 + if (resultArray.length == (array1.length + 1)) { + isContain = isIDArrayContains(resultContainer, + resultArray); + if (!isContain) { + resultContainer.add(resultArray); + } + } + } + } + + // 做频繁项集的剪枝处理,必须保证新的频繁项集的子项集也必须是频繁项集 + list = cutItem(resultContainer); + currentNum++; + } + } + + /** + * 对频繁项集做剪枝步骤,必须保证新的频繁项集的子项集也必须是频繁项集 + */ + private ArrayList cutItem(ArrayList resultIds) { + String[] temp; + // 忽略的索引位置,以此构建子集 + int igNoreIndex = 0; + FrequentItem tempItem; + // 剪枝生成新的频繁项集 + ArrayList newItem = new ArrayList<>(); + // 不符合要求的id + ArrayList deleteIdArray = new ArrayList<>(); + // 子项集是否也为频繁子项集 + boolean isContain = true; + + for (String[] array : resultIds) { + // 列举出其中的一个个的子项集,判断存在于频繁项集列表中 + temp = 
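+ /*
+ * Worked example of the join and prune steps: joining the frequent
+ * 2-itemsets {1,2} and {1,3} produces the candidate {1,2,3}; the
+ * candidate survives this pruning step only if every 2-item subset
+ * ({1,2}, {1,3} and {2,3}) is itself frequent. Under multiple minimum
+ * supports the candidate must additionally meet the smallest MIS value
+ * among its members, and the spread between the largest and smallest
+ * member MIS must stay within delta, as judgeFItem checks below.
+ */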
new String[array.length - 1]; + for (igNoreIndex = 0; igNoreIndex < array.length; igNoreIndex++) { + isContain = true; + for (int j = 0, k = 0; j < array.length; j++) { + if (j != igNoreIndex) { + temp[k] = array[j]; + k++; + } + } + + if (!isIDArrayContains(resultItemID, temp)) { + isContain = false; + break; + } + } + + if (!isContain) { + deleteIdArray.add(array); + } + } + + // 移除不符合条件的ID组合 + resultIds.removeAll(deleteIdArray); + + // 移除支持度计数不够的id集合 + int tempCount = 0; + boolean isSatisfied = false; + for (String[] array : resultIds) { + isSatisfied = judgeFItem(array); + + // 如果此频繁项集满足多支持度阈值限制条件和支持度差别限制条件,则添加入结果集中 + if (isSatisfied) { + tempItem = new FrequentItem(array, tempCount); + newItem.add(tempItem); + resultItemID.add(array); + resultItem.add(tempItem); + } + } + + return newItem; + } + + /** + * 判断列表结果中是否已经包含此数组 + * + * @param container + * ID数组容器 + * @param array + * 待比较数组 + * @return + */ + private boolean isIDArrayContains(ArrayList container, + String[] array) { + boolean isContain = true; + if (container.size() == 0) { + isContain = false; + return isContain; + } + + for (String[] s : container) { + // 比较的视乎必须保证长度一样 + if (s.length != array.length) { + continue; + } + + isContain = true; + for (int i = 0; i < s.length; i++) { + // 只要有一个id不等,就算不相等 + if (s[i] != array[i]) { + isContain = false; + break; + } + } + + // 如果已经判断是包含在容器中时,直接退出 + if (isContain) { + break; + } + } + + return isContain; + } + + /** + * 判断一个频繁项集是否满足条件 + * + * @param frequentItem + * 待判断频繁项集 + * @return + */ + private boolean judgeFItem(String[] frequentItem) { + boolean isSatisfied = true; + int id; + int count; + double tempMinSup; + // 最小的支持度阈值 + double minMis = Integer.MAX_VALUE; + // 最大的支持度阈值 + double maxMis = -Integer.MAX_VALUE; + + // 如果是事务型数据,用mis数组判断,如果不是统一用同样的最小支持度阈值判断 + if (isTransaction) { + // 寻找频繁项集中的最小支持度阈值 + for (int i = 0; i < frequentItem.length; i++) { + id = i + 1; + + if (mis[id] < minMis) { + minMis = mis[id]; + } + + if (mis[id] > maxMis) { + maxMis = mis[id]; + } + } + } else { + minMis = minSup; + maxMis = minSup; + } + + count = calSupportCount(frequentItem); + tempMinSup = 1.0 * count / totalGoodsIDs.size(); + // 判断频繁项集的支持度阈值是否超过最小的支持度阈值 + if (tempMinSup < minMis) { + isSatisfied = false; + } + + // 如果误差超过了最大支持度差别,也算不满足条件 + if (Math.abs(maxMis - minMis) > delta) { + isSatisfied = false; + } + + return isSatisfied; + } + + /** + * 统计候选频繁项集的支持度数,利用他的子集进行技术,无须扫描整个数据集 + * + * @param frequentItem + * 待计算频繁项集 + * @return + */ + private int calSupportCount(String[] frequentItem) { + int count = 0; + int[] ids; + String key; + String[] array; + ArrayList newIds; + + key = ""; + for (int i = 1; i < frequentItem.length; i++) { + key += frequentItem[i]; + } + + newIds = new ArrayList<>(); + // 找出所属的事务ID + ids = fItem2Id.get(key); + + // 如果没有找到子项集的事务id,则全盘扫描数据集 + if (ids == null || ids.length == 0) { + for (int j = 0; j < totalGoodsIDs.size(); j++) { + array = totalGoodsIDs.get(j); + if (isStrArrayContain(array, frequentItem)) { + count++; + newIds.add(j); + } + } + } else { + for (int index : ids) { + array = totalGoodsIDs.get(index); + if (isStrArrayContain(array, frequentItem)) { + count++; + newIds.add(index); + } + } + } + + ids = new int[count]; + for (int i = 0; i < ids.length; i++) { + ids[i] = newIds.get(i); + } + + key = frequentItem[0] + key; + // 将所求值存入图中,便于下次的计数 + fItem2Id.put(key, ids); + + return count; + } + + /** + * 根据给定的频繁项集输出关联规则 + * + * @param frequentItems + * 频繁项集 + */ + public void printAttachRuls(String[] frequentItem) { + // 关联规则前件,后件对 + Map, ArrayList> rules; 
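+ /*
+ * Worked example of rule confidence on testInput.txt: item 2 occurs
+ * in 7 of the 9 transactions and the pair {2,5} in 2 of them (T1 and
+ * T8), so conf({2} -> {5}) = 2/7 ~ 0.29, which fails minConf = 0.3,
+ * while conf({5} -> {2}) = 2/2 = 1.0, which passes. Confidence is
+ * count(prefix and suffix) / count(prefix), exactly the two counters
+ * accumulated in recusiveFindRules below.
+ */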
+ // 前件搜索历史 + Map, ArrayList> searchHistory; + ArrayList prefix; + ArrayList suffix; + + rules = new HashMap, ArrayList>(); + searchHistory = new HashMap<>(); + + for (int i = 0; i < frequentItem.length; i++) { + suffix = new ArrayList<>(); + for (int j = 0; j < frequentItem.length; j++) { + suffix.add(frequentItem[j]); + } + prefix = new ArrayList<>(); + + recusiveFindRules(rules, searchHistory, prefix, suffix); + } + + // 依次输出找到的关联规则 + for (Map.Entry, ArrayList> entry : rules + .entrySet()) { + prefix = entry.getKey(); + suffix = entry.getValue(); + + printRuleDetail(prefix, suffix); + } + } + + /** + * 根据前件后件,输出关联规则 + * + * @param prefix + * @param suffix + */ + private void printRuleDetail(ArrayList prefix, + ArrayList suffix) { + // {A}-->{B}的意思为在A的情况下发生B的概率 + System.out.print("{"); + for (String s : prefix) { + System.out.print(s + ", "); + } + System.out.print("}-->"); + System.out.print("{"); + for (String s : suffix) { + System.out.print(s + ", "); + } + System.out.println("}"); + } + + /** + * 递归扩展关联规则解 + * + * @param rules + * 关联规则结果集 + * @param history + * 前件搜索历史 + * @param prefix + * 关联规则前件 + * @param suffix + * 关联规则后件 + */ + private void recusiveFindRules( + Map, ArrayList> rules, + Map, ArrayList> history, + ArrayList prefix, ArrayList suffix) { + int count1; + int count2; + int compareResult; + // 置信度大小 + double conf; + String[] temp1; + String[] temp2; + ArrayList copyPrefix; + ArrayList copySuffix; + + // 如果后件只有1个,则函数返回 + if (suffix.size() == 1) { + return; + } + + for (String s : suffix) { + count1 = 0; + count2 = 0; + + copyPrefix = (ArrayList) prefix.clone(); + copyPrefix.add(s); + + copySuffix = (ArrayList) suffix.clone(); + // 将拷贝的后件移除添加的一项 + copySuffix.remove(s); + + compareResult = isSubSetInRules(history, copyPrefix); + if (compareResult == PREFIX_EQUAL) { + // 如果曾经已经被搜索过,则跳过 + continue; + } + + // 判断是否为子集,如果是子集则无需计算 + compareResult = isSubSetInRules(rules, copyPrefix); + if (compareResult == PREFIX_IS_SUB) { + rules.put(copyPrefix, copySuffix); + // 加入到搜索历史中 + history.put(copyPrefix, copySuffix); + recusiveFindRules(rules, history, copyPrefix, copySuffix); + continue; + } + + // 暂时合并为总的集合 + copySuffix.addAll(copyPrefix); + temp1 = new String[copyPrefix.size()]; + temp2 = new String[copySuffix.size()]; + copyPrefix.toArray(temp1); + copySuffix.toArray(temp2); + // 之后再次移除之前天剑的前件 + copySuffix.removeAll(copyPrefix); + + for (String[] a : totalGoodsIDs) { + if (isStrArrayContain(a, temp1)) { + count1++; + + // 在group1的条件下,统计group2的事件发生次数 + if (isStrArrayContain(a, temp2)) { + count2++; + } + } + } + + conf = 1.0 * count2 / count1; + if (conf > minConf) { + // 设置此前件条件下,能导出关联规则 + rules.put(copyPrefix, copySuffix); + } + + // 加入到搜索历史中 + history.put(copyPrefix, copySuffix); + recusiveFindRules(rules, history, copyPrefix, copySuffix); + } + } + + /** + * 判断当前的前件是否会关联规则的子集 + * + * @param rules + * 当前已经判断出的关联规则 + * @param prefix + * 待判断的前件 + * @return + */ + private int isSubSetInRules( + Map, ArrayList> rules, + ArrayList prefix) { + int result = PREFIX_NOT_SUB; + String[] temp1; + String[] temp2; + ArrayList tempPrefix; + + for (Map.Entry, ArrayList> entry : rules + .entrySet()) { + tempPrefix = entry.getKey(); + + temp1 = new String[tempPrefix.size()]; + temp2 = new String[prefix.size()]; + + tempPrefix.toArray(temp1); + prefix.toArray(temp2); + + // 判断当前构造的前件是否已经是存在前件的子集 + if (isStrArrayContain(temp2, temp1)) { + if (temp2.length == temp1.length) { + result = PREFIX_EQUAL; + } else { + result = PREFIX_IS_SUB; + } + } + + if (result == PREFIX_EQUAL) { + break; + } + } + + 
return result; + } + + /** + * 数组array2是否包含于array1中,不需要完全一样 + * + * @param array1 + * @param array2 + * @return + */ + private boolean isStrArrayContain(String[] array1, String[] array2) { + boolean isContain = true; + for (String s2 : array2) { + isContain = false; + for (String s1 : array1) { + // 只要s2字符存在于array1中,这个字符就算包含在array1中 + if (s2.equals(s1)) { + isContain = true; + break; + } + } + + // 一旦发现不包含的字符,则array2数组不包含于array1中 + if (!isContain) { + break; + } + } + + return isContain; + } + + /** + * 读关系表中的数据,并转化为事务数据 + * + * @param filePath + */ + private void readRDBMSData(String filePath) { + String str; + // 属性名称行 + String[] attrNames = null; + String[] temp; + String[] newRecord; + ArrayList datas = null; + + datas = readLine(filePath); + + // 获取首行 + attrNames = datas.get(0); + this.transactionDatas = new ArrayList<>(); + + // 去除首行数据 + for (int i = 1; i < datas.size(); i++) { + temp = datas.get(i); + + // 过滤掉首列id列 + for (int j = 1; j < temp.length; j++) { + str = ""; + // 采用属性名+属性值的形式避免数据的重复 + str = attrNames[j] + ":" + temp[j]; + temp[j] = str; + } + + newRecord = new String[attrNames.length - 1]; + System.arraycopy(temp, 1, newRecord, 0, attrNames.length - 1); + this.transactionDatas.add(newRecord); + } + + attributeReplace(); + // 将事务数转到totalGoodsID中做统一处理 + this.totalGoodsIDs = transactionDatas; + } + + /** + * 属性值的替换,替换成数字的形式,以便进行频繁项的挖掘 + */ + private void attributeReplace() { + int currentValue = 1; + String s; + // 属性名到数字的映射图 + attr2Num = new HashMap<>(); + num2Attr = new HashMap<>(); + + // 按照1列列的方式来,从左往右边扫描,跳过列名称行和id列 + for (int j = 0; j < transactionDatas.get(0).length; j++) { + for (int i = 0; i < transactionDatas.size(); i++) { + s = transactionDatas.get(i)[j]; + + if (!attr2Num.containsKey(s)) { + attr2Num.put(s, currentValue); + num2Attr.put(currentValue, s); + + transactionDatas.get(i)[j] = currentValue + ""; + currentValue++; + } else { + transactionDatas.get(i)[j] = attr2Num.get(s) + ""; + } + } + } + } +} diff --git a/Others/DataMining_MSApriori/testInput.txt b/Others/DataMining_MSApriori/testInput.txt new file mode 100644 index 0000000..9769e26 --- /dev/null +++ b/Others/DataMining_MSApriori/testInput.txt @@ -0,0 +1,9 @@ +T1 1 2 5 +T2 2 4 +T3 2 3 +T4 1 2 4 +T5 1 3 +T6 2 3 +T7 1 3 +T8 1 2 3 5 +T9 1 2 3 \ No newline at end of file diff --git a/Others/DataMining_MSApriori/testInput2.txt b/Others/DataMining_MSApriori/testInput2.txt new file mode 100644 index 0000000..ac50350 --- /dev/null +++ b/Others/DataMining_MSApriori/testInput2.txt @@ -0,0 +1,15 @@ +Rid Age Income Student CreditRating BuysComputer +1 Youth High No Fair No +2 Youth High No Excellent No +3 MiddleAged High No Fair Yes +4 Senior Medium No Fair Yes +5 Senior Low Yes Fair Yes +6 Senior Low Yes Excellent No +7 MiddleAged Low Yes Excellent Yes +8 Youth Medium No Fair No +9 Youth Low Yes Fair Yes +10 Senior Medium Yes Fair Yes +11 Youth Medium Yes Excellent Yes +12 MiddleAged Medium No Excellent Yes +13 MiddleAged High Yes Fair Yes +14 Senior Medium No Excellent No \ No newline at end of file diff --git a/Others/DataMining_RandomForest/CARTTool.java b/Others/DataMining_RandomForest/CARTTool.java new file mode 100644 index 0000000..d68aab4 --- /dev/null +++ b/Others/DataMining_RandomForest/CARTTool.java @@ -0,0 +1,511 @@ +package DataMining_RandomForest; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Queue; + +/** + * CART分类回归树算法工具类 + * + * @author lyq + * + */ +public class CARTTool { + // 类标号的值类型 + private final String YES = "Yes"; + private 
final String NO = "No"; + + // 所有属性的类型总数,在这里就是data源数据的列数 + private int attrNum; + private String filePath; + // 初始源数据,用一个二维字符数组存放模仿表格数据 + private String[][] data; + // 数据的属性行的名字 + private String[] attrNames; + // 每个属性的值所有类型 + private HashMap> attrValue; + + public CARTTool(ArrayList dataArray) { + attrValue = new HashMap<>(); + readData(dataArray); + } + + /** + * 根据随机选取的样本数据进行初始化 + * @param dataArray + * 已经读入的样本数据 + */ + public void readData(ArrayList dataArray) { + data = new String[dataArray.size()][]; + dataArray.toArray(data); + attrNum = data[0].length; + attrNames = data[0]; + } + + /** + * 首先初始化每种属性的值的所有类型,用于后面的子类熵的计算时用 + */ + public void initAttrValue() { + ArrayList tempValues; + + // 按照列的方式,从左往右找 + for (int j = 1; j < attrNum; j++) { + // 从一列中的上往下开始寻找值 + tempValues = new ArrayList<>(); + for (int i = 1; i < data.length; i++) { + if (!tempValues.contains(data[i][j])) { + // 如果这个属性的值没有添加过,则添加 + tempValues.add(data[i][j]); + } + } + + // 一列属性的值已经遍历完毕,复制到map属性表中 + attrValue.put(data[0][j], tempValues); + } + } + + /** + * 计算机基尼指数 + * + * @param remainData + * 剩余数据 + * @param attrName + * 属性名称 + * @param value + * 属性值 + * @param beLongValue + * 分类是否属于此属性值 + * @return + */ + public double computeGini(String[][] remainData, String attrName, + String value, boolean beLongValue) { + // 实例总数 + int total = 0; + // 正实例数 + int posNum = 0; + // 负实例数 + int negNum = 0; + // 基尼指数 + double gini = 0; + + // 还是按列从左往右遍历属性 + for (int j = 1; j < attrNames.length; j++) { + // 找到了指定的属性 + if (attrName.equals(attrNames[j])) { + for (int i = 1; i < remainData.length; i++) { + // 统计正负实例按照属于和不属于值类型进行划分 + if ((beLongValue && remainData[i][j].equals(value)) + || (!beLongValue && !remainData[i][j].equals(value))) { + if (remainData[i][attrNames.length - 1].equals(YES)) { + // 判断此行数据是否为正实例 + posNum++; + } else { + negNum++; + } + } + } + } + } + + total = posNum + negNum; + double posProbobly = (double) posNum / total; + double negProbobly = (double) negNum / total; + gini = 1 - posProbobly * posProbobly - negProbobly * negProbobly; + + // 返回计算基尼指数 + return gini; + } + + /** + * 计算属性划分的最小基尼指数,返回最小的属性值划分和最小的基尼指数,保存在一个数组中 + * + * @param remainData + * 剩余谁 + * @param attrName + * 属性名称 + * @return + */ + public String[] computeAttrGini(String[][] remainData, String attrName) { + String[] str = new String[2]; + // 最终该属性的划分类型值 + String spiltValue = ""; + // 临时变量 + int tempNum = 0; + // 保存属性的值划分时的最小的基尼指数 + double minGini = Integer.MAX_VALUE; + ArrayList valueTypes = attrValue.get(attrName); + // 属于此属性值的实例数 + HashMap belongNum = new HashMap<>(); + + for (String string : valueTypes) { + // 重新计数的时候,数字归0 + tempNum = 0; + // 按列从左往右遍历属性 + for (int j = 1; j < attrNames.length; j++) { + // 找到了指定的属性 + if (attrName.equals(attrNames[j])) { + for (int i = 1; i < remainData.length; i++) { + // 统计正负实例按照属于和不属于值类型进行划分 + if (remainData[i][j].equals(string)) { + tempNum++; + } + } + } + } + + belongNum.put(string, tempNum); + } + + double tempGini = 0; + double posProbably = 1.0; + double negProbably = 1.0; + for (String string : valueTypes) { + tempGini = 0; + + posProbably = 1.0 * belongNum.get(string) / (remainData.length - 1); + negProbably = 1 - posProbably; + + tempGini += posProbably + * computeGini(remainData, attrName, string, true); + tempGini += negProbably + * computeGini(remainData, attrName, string, false); + + if (tempGini < minGini) { + minGini = tempGini; + spiltValue = string; + } + } + + str[0] = spiltValue; + str[1] = minGini + ""; + + return str; + } + + public void buildDecisionTree(TreeNode node, String 
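+ /*
+ * Worked example of the Gini computation above, using the 14-row
+ * input.txt: 9 rows answer Yes and 5 answer No, so the Gini index of
+ * the whole set is 1 - (9/14)^2 - (5/14)^2 ~ 0.459. computeAttrGini
+ * then weighs the Gini of each binary split (value vs. not-value) by
+ * the fraction of rows falling on each side and keeps the value with
+ * the smallest weighted sum.
+ */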
parentAttrValue, + String[][] remainData, ArrayList remainAttr, + boolean beLongParentValue) { + // 属性划分值 + String valueType = ""; + // 划分属性名称 + String spiltAttrName = ""; + double minGini = Integer.MAX_VALUE; + double tempGini = 0; + // 基尼指数数组,保存了基尼指数和此基尼指数的划分属性值 + String[] giniArray; + + if (beLongParentValue) { + node.setParentAttrValue(parentAttrValue); + } else { + node.setParentAttrValue("!" + parentAttrValue); + } + + if (remainAttr.size() == 0) { + if (remainData.length > 1) { + ArrayList indexArray = new ArrayList<>(); + for (int i = 1; i < remainData.length; i++) { + indexArray.add(remainData[i][0]); + } + node.setDataIndex(indexArray); + } + // System.out.println("attr remain null"); + return; + } + + for (String str : remainAttr) { + giniArray = computeAttrGini(remainData, str); + tempGini = Double.parseDouble(giniArray[1]); + + if (tempGini < minGini) { + spiltAttrName = str; + minGini = tempGini; + valueType = giniArray[0]; + } + } + // 移除划分属性 + remainAttr.remove(spiltAttrName); + node.setAttrName(spiltAttrName); + + // 孩子节点,分类回归树中,每次二元划分,分出2个孩子节点 + TreeNode[] childNode = new TreeNode[2]; + String[][] rData; + + boolean[] bArray = new boolean[] { true, false }; + for (int i = 0; i < bArray.length; i++) { + // 二元划分属于属性值的划分 + rData = removeData(remainData, spiltAttrName, valueType, bArray[i]); + + boolean sameClass = true; + ArrayList indexArray = new ArrayList<>(); + for (int k = 1; k < rData.length; k++) { + indexArray.add(rData[k][0]); + // 判断是否为同一类的 + if (!rData[k][attrNames.length - 1] + .equals(rData[1][attrNames.length - 1])) { + // 只要有1个不相等,就不是同类型的 + sameClass = false; + break; + } + } + + childNode[i] = new TreeNode(); + if (!sameClass) { + // 创建新的对象属性,对象的同个引用会出错 + ArrayList rAttr = new ArrayList<>(); + for (String str : remainAttr) { + rAttr.add(str); + } + buildDecisionTree(childNode[i], valueType, rData, rAttr, + bArray[i]); + } else { + String pAtr = (bArray[i] ? valueType : "!" 
+ valueType); + childNode[i].setParentAttrValue(pAtr); + childNode[i].setDataIndex(indexArray); + } + } + + node.setChildAttrNode(childNode); + } + + /** + * 属性划分完毕,进行数据的移除 + * + * @param srcData + * 源数据 + * @param attrName + * 划分的属性名称 + * @param valueType + * 属性的值类型 + * @parame beLongValue 分类是否属于此值类型 + */ + private String[][] removeData(String[][] srcData, String attrName, + String valueType, boolean beLongValue) { + String[][] desDataArray; + ArrayList desData = new ArrayList<>(); + // 待删除数据 + ArrayList selectData = new ArrayList<>(); + selectData.add(attrNames); + + // 数组数据转化到列表中,方便移除 + for (int i = 0; i < srcData.length; i++) { + desData.add(srcData[i]); + } + + // 还是从左往右一列列的查找 + for (int j = 1; j < attrNames.length; j++) { + if (attrNames[j].equals(attrName)) { + for (int i = 1; i < desData.size(); i++) { + if (desData.get(i)[j].equals(valueType)) { + // 如果匹配这个数据,则移除其他的数据 + selectData.add(desData.get(i)); + } + } + } + } + + if (beLongValue) { + desDataArray = new String[selectData.size()][]; + selectData.toArray(desDataArray); + } else { + // 属性名称行不移除 + selectData.remove(attrNames); + // 如果是划分不属于此类型的数据时,进行移除 + desData.removeAll(selectData); + desDataArray = new String[desData.size()][]; + desData.toArray(desDataArray); + } + + return desDataArray; + } + + /** + * 构造分类回归树,并返回根节点 + * @return + */ + public TreeNode startBuildingTree() { + initAttrValue(); + + ArrayList remainAttr = new ArrayList<>(); + // 添加属性,除了最后一个类标号属性 + for (int i = 1; i < attrNames.length - 1; i++) { + remainAttr.add(attrNames[i]); + } + + TreeNode rootNode = new TreeNode(); + buildDecisionTree(rootNode, "", data, remainAttr, false); + setIndexAndAlpah(rootNode, 0, false); + showDecisionTree(rootNode, 1); + + return rootNode; + } + + /** + * 显示决策树 + * + * @param node + * 待显示的节点 + * @param blankNum + * 行空格符,用于显示树型结构 + */ + private void showDecisionTree(TreeNode node, int blankNum) { + System.out.println(); + for (int i = 0; i < blankNum; i++) { + System.out.print(" "); + } + System.out.print("--"); + // 显示分类的属性值 + if (node.getParentAttrValue() != null + && node.getParentAttrValue().length() > 0) { + System.out.print(node.getParentAttrValue()); + } else { + System.out.print("--"); + } + System.out.print("--"); + + if (node.getDataIndex() != null && node.getDataIndex().size() > 0) { + String i = node.getDataIndex().get(0); + System.out.print("【" + node.getNodeIndex() + "】类别:" + + data[Integer.parseInt(i)][attrNames.length - 1]); + System.out.print("["); + for (String index : node.getDataIndex()) { + System.out.print(index + ", "); + } + System.out.print("]"); + } else { + // 递归显示子节点 + System.out.print("【" + node.getNodeIndex() + ":" + + node.getAttrName() + "】"); + if (node.getChildAttrNode() != null) { + for (TreeNode childNode : node.getChildAttrNode()) { + showDecisionTree(childNode, 2 * blankNum); + } + } else { + System.out.print("【 Child Null】"); + } + } + } + + /** + * 为节点设置序列号,并计算每个节点的误差率,用于后面剪枝 + * + * @param node + * 开始的时候传入的是根节点 + * @param index + * 开始的索引号,从1开始 + * @param ifCutNode + * 是否需要剪枝 + */ + private void setIndexAndAlpah(TreeNode node, int index, boolean ifCutNode) { + TreeNode tempNode; + // 最小误差代价节点,即将被剪枝的节点 + TreeNode minAlphaNode = null; + double minAlpah = Integer.MAX_VALUE; + Queue nodeQueue = new LinkedList(); + + nodeQueue.add(node); + while (nodeQueue.size() > 0) { + index++; + // 从队列头部获取首个节点 + tempNode = nodeQueue.poll(); + tempNode.setNodeIndex(index); + if (tempNode.getChildAttrNode() != null) { + for (TreeNode childNode : tempNode.getChildAttrNode()) { + nodeQueue.add(childNode); + } + 
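+ /*
+ * computeAlpha below implements cost-complexity pruning: for an
+ * internal node t, alpha = (R(t) - R(T_t)) / (|leaves of T_t| - 1),
+ * where R(t) is the misclassification cost if t were collapsed into a
+ * single leaf and R(T_t) is the summed cost of its current leaves.
+ * The node with the smallest alpha is the cheapest subtree to prune.
+ */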
computeAlpha(tempNode); + if (tempNode.getAlpha() < minAlpah) { + minAlphaNode = tempNode; + minAlpah = tempNode.getAlpha(); + } else if (tempNode.getAlpha() == minAlpah) { + // 如果误差代价值一样,比较包含的叶子节点个数,剪枝有多叶子节点数的节点 + if (tempNode.getLeafNum() > minAlphaNode.getLeafNum()) { + minAlphaNode = tempNode; + } + } + } + } + + if (ifCutNode) { + // 进行树的剪枝,让其左右孩子节点为null + minAlphaNode.setChildAttrNode(null); + } + } + + /** + * 为非叶子节点计算误差代价,这里的后剪枝法用的是CCP代价复杂度剪枝 + * + * @param node + * 待计算的非叶子节点 + */ + private void computeAlpha(TreeNode node) { + double rt = 0; + double Rt = 0; + double alpha = 0; + // 当前节点的数据总数 + int sumNum = 0; + // 最少的偏差数 + int minNum = 0; + + ArrayList dataIndex; + ArrayList leafNodes = new ArrayList<>(); + + addLeafNode(node, leafNodes); + node.setLeafNum(leafNodes.size()); + for (TreeNode attrNode : leafNodes) { + dataIndex = attrNode.getDataIndex(); + + int num = 0; + sumNum += dataIndex.size(); + for (String s : dataIndex) { + // 统计分类数据中的正负实例数 + if (data[Integer.parseInt(s)][attrNames.length - 1].equals(YES)) { + num++; + } + } + minNum += num; + + // 取小数量的值部分 + if (1.0 * num / dataIndex.size() > 0.5) { + num = dataIndex.size() - num; + } + + rt += (1.0 * num / (data.length - 1)); + } + + //同样取出少偏差的那部分 + if (1.0 * minNum / sumNum > 0.5) { + minNum = sumNum - minNum; + } + + Rt = 1.0 * minNum / (data.length - 1); + alpha = 1.0 * (Rt - rt) / (leafNodes.size() - 1); + node.setAlpha(alpha); + } + + /** + * 筛选出节点所包含的叶子节点数 + * + * @param node + * 待筛选节点 + * @param leafNode + * 叶子节点列表容器 + */ + private void addLeafNode(TreeNode node, ArrayList leafNode) { + ArrayList dataIndex; + + if (node.getChildAttrNode() != null) { + for (TreeNode childNode : node.getChildAttrNode()) { + dataIndex = childNode.getDataIndex(); + if (dataIndex != null && dataIndex.size() > 0) { + // 说明此节点为叶子节点 + leafNode.add(childNode); + } else { + // 如果还是非叶子节点则继续递归调用 + addLeafNode(childNode, leafNode); + } + } + } + } + +} diff --git a/Others/DataMining_RandomForest/Client.java b/Others/DataMining_RandomForest/Client.java new file mode 100644 index 0000000..6139d3e --- /dev/null +++ b/Others/DataMining_RandomForest/Client.java @@ -0,0 +1,33 @@ +package DataMining_RandomForest; + +import java.text.MessageFormat; + +/** + * 随机森林算法测试场景 + * + * @author lyq + * + */ +public class Client { + public static void main(String[] args) { + String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt"; + String queryStr = "Age=Youth,Income=Low,Student=No,CreditRating=Fair"; + String resultClassType = ""; + // 决策树的样本占总数的占比率 + double sampleNumRatio = 0.4; + // 样本数据的采集特征数量占总特征的比例 + double featureNumRatio = 0.5; + + RandomForestTool tool = new RandomForestTool(filePath, sampleNumRatio, + featureNumRatio); + tool.constructRandomTree(); + + resultClassType = tool.judgeClassType(queryStr); + + System.out.println(); + System.out + .println(MessageFormat.format( + "查询属性描述{0},预测的分类结果为BuysCompute:{1}", queryStr, + resultClassType)); + } +} diff --git a/Others/DataMining_RandomForest/DecisionTree.java b/Others/DataMining_RandomForest/DecisionTree.java new file mode 100644 index 0000000..119254e --- /dev/null +++ b/Others/DataMining_RandomForest/DecisionTree.java @@ -0,0 +1,165 @@ +package DataMining_RandomForest; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +/** + * 决策树 + * + * @author lyq + * + */ +public class DecisionTree { + // 树的根节点 + TreeNode rootNode; + // 数据的属性列名称 + String[] featureNames; + // 这棵树所包含的数据 + ArrayList datas; + // 决策树构造的的工具类 + CARTTool tool; + + public DecisionTree(ArrayList datas) 
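+ /*
+ * A query string such as
+ * "Age=Youth,Income=Low,Student=No,CreditRating=Fair" is split on ','
+ * and '=' into attribute/value pairs, which decideClassType below
+ * matches against the tree: a branch labelled "High" matches the value
+ * High, while a branch labelled "!High" matches any value other than
+ * High.
+ */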
{ + this.datas = datas; + this.featureNames = datas.get(0); + + tool = new CARTTool(datas); + // 通过CART工具类进行决策树的构建,并返回树的根节点 + rootNode = tool.startBuildingTree(); + } + + /** + * 根据给定的数据特征描述进行类别的判断 + * + * @param features + * @return + */ + public String decideClassType(String features) { + String classType = ""; + // 查询属性组 + String[] queryFeatures; + // 在本决策树中对应的查询的属性值描述 + ArrayList featureStrs; + + featureStrs = new ArrayList<>(); + queryFeatures = features.split(","); + + String[] array; + for (String name : featureNames) { + for (String featureValue : queryFeatures) { + array = featureValue.split("="); + // 将对应的属性值加入到列表中 + if (array[0].equals(name)) { + featureStrs.add(array); + } + } + } + + // 开始从根据节点往下递归搜索 + classType = recusiveSearchClassType(rootNode, featureStrs); + + return classType; + } + + /** + * 递归搜索树,查询属性的分类类别 + * + * @param node + * 当前搜索到的节点 + * @param remainFeatures + * 剩余未判断的属性 + * @return + */ + private String recusiveSearchClassType(TreeNode node, + ArrayList remainFeatures) { + String classType = null; + + // 如果节点包含了数据的id索引,说明已经分类到底了 + if (node.getDataIndex() != null && node.getDataIndex().size() > 0) { + classType = judgeClassType(node.getDataIndex()); + + return classType; + } + + // 取出剩余属性中的一个匹配属性作为当前的判断属性名称 + String[] currentFeature = null; + for (String[] featureValue : remainFeatures) { + if (node.getAttrName().equals(featureValue[0])) { + currentFeature = featureValue; + break; + } + } + + for (TreeNode childNode : node.getChildAttrNode()) { + // 寻找子节点中属于此属性值的分支 + if (childNode.getParentAttrValue().equals(currentFeature[1])) { + remainFeatures.remove(currentFeature); + classType = recusiveSearchClassType(childNode, remainFeatures); + + // 如果找到了分类结果,则直接挑出循环 + break; + }else{ + //进行第二种情况的判断加上!符号的情况 + String value = childNode.getParentAttrValue(); + + if(value.charAt(0) == '!'){ + //去掉第一个!字符 + value = value.substring(1, value.length()); + + if(!value.equals(currentFeature[1])){ + remainFeatures.remove(currentFeature); + classType = recusiveSearchClassType(childNode, remainFeatures); + + break; + } + } + } + } + + return classType; + } + + /** + * 根据得到的数据行分类进行类别的决策 + * + * @param dataIndex + * 根据分类的数据索引号 + * @return + */ + public String judgeClassType(ArrayList dataIndex) { + // 结果类型值 + String resultClassType = ""; + String classType = ""; + int count = 0; + int temp = 0; + Map type2Num = new HashMap(); + + for (String index : dataIndex) { + temp = Integer.parseInt(index); + // 取最后一列的决策类别数据 + classType = datas.get(temp)[featureNames.length - 1]; + + if (type2Num.containsKey(classType)) { + // 如果类别已经存在,则使其计数加1 + count = type2Num.get(classType); + count++; + } else { + count = 1; + } + + type2Num.put(classType, count); + } + + // 选出其中类别支持计数最多的一个类别值 + count = -1; + for (Map.Entry entry : type2Num.entrySet()) { + if ((int) entry.getValue() > count) { + count = (int) entry.getValue(); + resultClassType = (String) entry.getKey(); + } + } + + return resultClassType; + } +} diff --git a/Others/DataMining_RandomForest/RandomForestTool.java b/Others/DataMining_RandomForest/RandomForestTool.java new file mode 100644 index 0000000..a244cd9 --- /dev/null +++ b/Others/DataMining_RandomForest/RandomForestTool.java @@ -0,0 +1,223 @@ +package DataMining_RandomForest; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +/** + * 随机森林算法工具类 + * + * @author lyq + * + */ +public class RandomForestTool { + // 测试数据文件地址 + 
private String filePath;
+ // fraction of the total data sampled for each decision tree
+ private double sampleNumRatio;
+ // fraction of the total features sampled for each decision tree
+ private double featureNumRatio;
+ // number of rows sampled per decision tree
+ private int sampleNum;
+ // number of feature columns sampled per decision tree
+ private int featureNum;
+ // number of trees in the forest: total rows / rows per tree
+ private int treeNum;
+ // random number generator
+ private Random random;
+ // attribute-name header row of the sample data
+ private String[] featureNames;
+ // the complete original data set
+ private ArrayList<String[]> totalDatas;
+ // the forest of decision trees
+ private ArrayList<DecisionTree> decisionForest;
+
+ public RandomForestTool(String filePath, double sampleNumRatio,
+ double featureNumRatio) {
+ this.filePath = filePath;
+ this.sampleNumRatio = sampleNumRatio;
+ this.featureNumRatio = featureNumRatio;
+
+ readDataFile();
+ }
+
+ /**
+ * Reads the data from file
+ */
+ private void readDataFile() {
+ File file = new File(filePath);
+ ArrayList<String[]> dataArray = new ArrayList<String[]>();
+
+ try {
+ BufferedReader in = new BufferedReader(new FileReader(file));
+ String str;
+ String[] tempArray;
+ while ((str = in.readLine()) != null) {
+ tempArray = str.split(" ");
+ dataArray.add(tempArray);
+ }
+ in.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ totalDatas = dataArray;
+ featureNames = totalDatas.get(0);
+ sampleNum = (int) ((totalDatas.size() - 1) * sampleNumRatio);
+ // exclude the id column and the class column when counting features
+ featureNum = (int) ((featureNames.length - 2) * featureNumRatio);
+ // exclude the header row when counting rows
+ treeNum = (totalDatas.size() - 1) / sampleNum;
+ }
+
+ /**
+ * Produces a single decision tree from a random sample
+ */
+ private DecisionTree produceDecisionTree() {
+ int temp = 0;
+ DecisionTree tree;
+ String[] tempData;
+ // randomly chosen row numbers of the sampled data
+ ArrayList<Integer> sampleRandomNum;
+ // randomly chosen column numbers of the sampled features
+ ArrayList<Integer> featureRandomNum;
+ ArrayList<String[]> datas;
+
+ sampleRandomNum = new ArrayList<>();
+ featureRandomNum = new ArrayList<>();
+ datas = new ArrayList<>();
+
+ // draw sampleNum distinct data rows, skipping the header row 0
+ for (int i = 0; i < sampleNum;) {
+ temp = random.nextInt(totalDatas.size());
+ if (temp == 0) {
+ continue;
+ }
+
+ if (!sampleRandomNum.contains(temp)) {
+ sampleRandomNum.add(temp);
+ i++;
+ }
+ }
+
+ // draw featureNum distinct feature columns, skipping the id column 0
+ // and the class-label column at the end
+ for (int i = 0; i < featureNum;) {
+ temp = random.nextInt(featureNames.length);
+ if (temp == 0 || temp == featureNames.length - 1) {
+ continue;
+ }
+
+ if (!featureRandomNum.contains(temp)) {
+ featureRandomNum.add(temp);
+ i++;
+ }
+ }
+
+ // header row of the sample: id column, chosen features, class column
+ String[] newRow = new String[featureNum + 2];
+ newRow[0] = featureNames[0];
+ for (int j = 0; j < featureRandomNum.size(); j++) {
+ newRow[j + 1] = featureNames[featureRandomNum.get(j)];
+ }
+ newRow[featureNum + 1] = featureNames[featureNames.length - 1];
+ datas.add(newRow);
+
+ // assemble the sampled rows, keeping only the chosen feature columns
+ for (int rowIndex : sampleRandomNum) {
+ tempData = totalDatas.get(rowIndex);
+ newRow = new String[featureNum + 2];
+ newRow[0] = tempData[0];
+ for (int j = 0; j < featureRandomNum.size(); j++) {
+ newRow[j + 1] = tempData[featureRandomNum.get(j)];
+ }
+ newRow[featureNum + 1] = tempData[featureNames.length - 1];
+ datas.add(newRow);
+ }
+
+ // renumber the id column so the ids of the sample stay consecutive,
+ // leaving the header row (position 0) untouched
+ temp = 0;
+ for (String[] array : datas) {
+ if (temp > 0) {
+ array[0] = temp + "";
+ }
+
+ temp++;
+ }
+
+ tree = new DecisionTree(datas);
+
+ return tree;
+ }
+
+ /**
+ * Builds the random forest
+ */
+ public void constructRandomTree() {
+ DecisionTree tree;
+ random = new Random();
+ decisionForest = new ArrayList<>();
+
+ System.out.println("The decision trees of the random forest:");
+ // build the trees and add them to the forest
+ for (int i = 0; i < treeNum; i++) {
+ System.out.println("\nDecision tree " + (i + 1));
+ tree = produceDecisionTree();
+ decisionForest.add(tree);
+ }
+ }
+
+ /**
+ * Decides the class label for the given attribute description
+ *
+ * @param features
+ * the known attribute description
+ * @return
+ */
+ public String judgeClassType(String features) {
+ // resulting class label
+ String resultClassType = "";
+ String classType = "";
+ int count = 0;
+ Map<String, Integer> type2Num = new HashMap<String, Integer>();
+
+ for (DecisionTree tree : decisionForest) {
+ classType = tree.decideClassType(features);
+ if (type2Num.containsKey(classType)) {
+ // the class label already exists, increment its count
+ count = type2Num.get(classType);
+ count++;
+ } else {
+ count = 1;
+ }
+
+ type2Num.put(classType, count);
+ }
+
+ // pick the class label with the highest vote count
+ count = -1;
+ for (Map.Entry<String, Integer> entry : type2Num.entrySet()) {
+ if (entry.getValue() > count) {
+ count = entry.getValue();
+ resultClassType = entry.getKey();
+ }
+ }
+
+ return resultClassType;
+ }
+}
diff --git a/Others/DataMining_RandomForest/TreeNode.java b/Others/DataMining_RandomForest/TreeNode.java
new file mode 100644
index 0000000..b118472
--- /dev/null
+++ b/Others/DataMining_RandomForest/TreeNode.java
@@ -0,0 +1,85 @@
+package DataMining_RandomForest;
+
+import java.util.ArrayList;
+
+/**
+ * Classification and regression tree node
+ *
+ * @author lyq
+ *
+ */
+public class TreeNode {
+ // attribute name of this node
+ private String attrName;
+ // index number of this node
+ private int nodeIndex;
+ // number of leaf nodes contained under this node
+ private int leafNum;
+ // error cost of this node, used for pruning
+ private double alpha;
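+ /*
+ * Note: buildDecisionTree stores either the split value itself or the
+ * value prefixed with '!' in the field below, since CART splits are
+ * binary (belongs-to-value vs. does-not-belong); consumers such as
+ * DecisionTree.decideClassType test for the leading '!'.
+ */
+ // attribute value of the parent split that leads to this node
+ private String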
diff --git a/Others/DataMining_RandomForest/TreeNode.java b/Others/DataMining_RandomForest/TreeNode.java
new file mode 100644
index 0000000..b118472
--- /dev/null
+++ b/Others/DataMining_RandomForest/TreeNode.java
@@ -0,0 +1,85 @@
+package DataMining_RandomForest;
+
+import java.util.ArrayList;
+
+/**
+ * Classification/regression tree node
+ * 
+ * @author lyq
+ * 
+ */
+public class TreeNode {
+	// attribute name of this node
+	private String attrName;
+	// index number of this node
+	private int nodeIndex;
+	// number of leaf nodes below this node
+	private int leafNum;
+	// error rate of this node
+	private double alpha;
+	// the attribute value of the parent branch leading to this node
+	private String parentAttrValue;
+	// child nodes
+	private TreeNode[] childAttrNode;
+	// indices of the data records held at this node
+	private ArrayList<String> dataIndex;
+
+	public String getAttrName() {
+		return attrName;
+	}
+
+	public void setAttrName(String attrName) {
+		this.attrName = attrName;
+	}
+
+	public int getNodeIndex() {
+		return nodeIndex;
+	}
+
+	public void setNodeIndex(int nodeIndex) {
+		this.nodeIndex = nodeIndex;
+	}
+
+	public double getAlpha() {
+		return alpha;
+	}
+
+	public void setAlpha(double alpha) {
+		this.alpha = alpha;
+	}
+
+	public String getParentAttrValue() {
+		return parentAttrValue;
+	}
+
+	public void setParentAttrValue(String parentAttrValue) {
+		this.parentAttrValue = parentAttrValue;
+	}
+
+	public TreeNode[] getChildAttrNode() {
+		return childAttrNode;
+	}
+
+	public void setChildAttrNode(TreeNode[] childAttrNode) {
+		this.childAttrNode = childAttrNode;
+	}
+
+	public ArrayList<String> getDataIndex() {
+		return dataIndex;
+	}
+
+	public void setDataIndex(ArrayList<String> dataIndex) {
+		this.dataIndex = dataIndex;
+	}
+
+	public int getLeafNum() {
+		return leafNum;
+	}
+
+	public void setLeafNum(int leafNum) {
+		this.leafNum = leafNum;
+	}
+}
diff --git a/Others/DataMining_RandomForest/input.txt b/Others/DataMining_RandomForest/input.txt
new file mode 100644
index 0000000..ac50350
--- /dev/null
+++ b/Others/DataMining_RandomForest/input.txt
@@ -0,0 +1,15 @@
+Rid Age Income Student CreditRating BuysComputer
+1 Youth High No Fair No
+2 Youth High No Excellent No
+3 MiddleAged High No Fair Yes
+4 Senior Medium No Fair Yes
+5 Senior Low Yes Fair Yes
+6 Senior Low Yes Excellent No
+7 MiddleAged Low Yes Excellent Yes
+8 Youth Medium No Fair No
+9 Youth Low Yes Fair Yes
+10 Senior Medium Yes Fair Yes
+11 Youth Medium Yes Excellent Yes
+12 MiddleAged Medium No Excellent Yes
+13 MiddleAged High Yes Fair Yes
+14 Senior Medium No Excellent No
\ No newline at end of file
diff --git a/Others/DataMining_TAN/AttrMutualInfo.java b/Others/DataMining_TAN/AttrMutualInfo.java
new file mode 100644
index 0000000..6caf12d
--- /dev/null
+++ b/Others/DataMining_TAN/AttrMutualInfo.java
@@ -0,0 +1,28 @@
+package DataMining_TAN;
+
+/**
+ * Mutual information between two attributes, measuring how strongly the
+ * attributes are associated
+ * 
+ * @author lyq
+ * 
+ */
+public class AttrMutualInfo implements Comparable<AttrMutualInfo> {
+	// mutual information value
+	Double value;
+	// the pair of associated attribute nodes
+	Node[] nodeArray;
+
+	public AttrMutualInfo(double value, Node node1, Node node2) {
+		this.value = value;
+
+		this.nodeArray = new Node[2];
+		this.nodeArray[0] = node1;
+		this.nodeArray[1] = node2;
+	}
+
+	@Override
+	public int compareTo(AttrMutualInfo o) {
+		// compare the other way round so that sorting is descending
+		return o.value.compareTo(this.value);
+	}
+
+}
diff --git a/Others/DataMining_TAN/Client.java b/Others/DataMining_TAN/Client.java
new file mode 100644
index 0000000..bd104bc
--- /dev/null
+++ b/Others/DataMining_TAN/Client.java
@@ -0,0 +1,36 @@
+package DataMining_TAN;
+
+/**
+ * TAN tree augmented naive Bayes algorithm
+ * 
+ * @author lyq
+ * 
+ */
+public class Client {
+	public static void main(String[] args) {
+		String filePath = "C:\\Users\\lyq\\Desktop\\icon\\input.txt";
+		// conditional query string
+		String queryStr;
+		// probability of the first class value
+		double classResult1;
+		// probability of the second class value
+		double classResult2;
+
+		TANTool tool = new TANTool(filePath);
+		queryStr = "OutLook=Sunny,Temperature=Hot,Humidity=High,Wind=Weak,PlayTennis=No";
+		classResult1 = tool.calHappenedPro(queryStr);
+
+		queryStr = "OutLook=Sunny,Temperature=Hot,Humidity=High,Wind=Weak,PlayTennis=Yes";
+		classResult2 = tool.calHappenedPro(queryStr);
+
+		System.out.println(String.format("Probability computed for class %s is %s",
+				"PlayTennis=No", classResult1));
+		System.out.println(String.format("Probability computed for class %s is %s",
+				"PlayTennis=Yes", classResult2));
+		if (classResult1 > classResult2) {
+			System.out.println("Classified as PlayTennis=No");
+		} else {
+			System.out.println("Classified as PlayTennis=Yes");
+		}
+	}
+}
diff --git a/Others/DataMining_TAN/Node.java b/Others/DataMining_TAN/Node.java
new file mode 100644
index 0000000..f3a3b51
--- /dev/null
+++ b/Others/DataMining_TAN/Node.java
@@ -0,0 +1,63 @@
+package DataMining_TAN;
+
+import java.util.ArrayList;
+
+/**
+ * Bayesian network node class
+ * 
+ * @author lyq
+ * 
+ */
+public class Node {
+	// unique node id, used later to orient the connecting edges
+	int id;
+	// name of the attribute this node stands for
+	String name;
+	// the nodes connected to this node
+	ArrayList<Node> connectedNodes;
+
+	public Node(int id, String name) {
+		this.id = id;
+		this.name = name;
+
+		// initialize members
+		this.connectedNodes = new ArrayList<>();
+	}
+
+	/**
+	 * Connect this node with the given target node
+	 * 
+	 * @param node
+	 *            the downstream node
+	 */
+	public void connectNode(Node node) {
+		// never connect a node to itself
+		if (this.id == node.id) {
+			return;
+		}
+
+		// add the target to this node's neighbour list
+		this.connectedNodes.add(node);
+		// and add this node to the target's neighbour list
+		node.connectedNodes.add(this);
+	}
+
+	/**
+	 * Check equality with the target node; two nodes with the same id are
+	 * considered equal
+	 * 
+	 * @param node
+	 *            the target node
+	 * @return
+	 */
+	public boolean isEqual(Node node) {
+		boolean isEqual;
+
+		isEqual = false;
+		// the same id means the same node
+		if (this.id == node.id) {
+			isEqual = true;
+		}
+
+		return isEqual;
+	}
+}
diff --git a/Others/DataMining_TAN/TANTool.java b/Others/DataMining_TAN/TANTool.java
new file mode 100644
index 0000000..56e90a6
--- /dev/null
+++ b/Others/DataMining_TAN/TANTool.java
@@ -0,0 +1,571 @@
+package DataMining_TAN;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+
+/**
+ * TAN tree augmented naive Bayes algorithm tool class
+ * 
+ * @author lyq
+ * 
+ */
+public class TANTool {
+	// path of the test data set
+	private String filePath;
+	// total number of attributes, one of which is the class attribute
+	private int attrNum;
+	// name of the class attribute
+	private String classAttrName;
+	// header line with the attribute names
+	private String[] attrNames;
+	// edge directions of the Bayesian network; values are node ids, i -> j
+	private int[][] edges;
+	// maps an attribute name to its column index
+	private HashMap<String, Integer> attr2Column;
+	// maps an attribute name to the set of its observed values
+	private HashMap<String, ArrayList<String>> attr2Values;
+	// all nodes of the Bayesian network
+	private ArrayList<Node> totalNodes;
+	// the complete test data
+	private ArrayList<String[]> totalDatas;
+
+	public TANTool(String filePath) {
+		this.filePath = filePath;
+
+		readDataFile();
+	}
+
+	/**
+	 * Read the data from the input file
+	 */
+	private void readDataFile() {
+		File file = new File(filePath);
+		ArrayList<String[]> dataArray = new ArrayList<String[]>();
+
+		try {
+			BufferedReader in = new BufferedReader(new FileReader(file));
+			String str;
+			String[] array;
+
+			while ((str = in.readLine()) != null) {
+				array = str.split(" ");
+				dataArray.add(array);
+			}
+			in.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+		this.totalDatas = dataArray;
+		this.attrNames = this.totalDatas.get(0);
+		this.attrNum = this.attrNames.length;
+		this.classAttrName = this.attrNames[attrNum - 1];
+
+		Node node;
+		this.edges = new int[attrNum][attrNum];
+		this.totalNodes = new ArrayList<>();
+		this.attr2Column = new HashMap<>();
+		this.attr2Values = new HashMap<>();
+
+		// the class attribute node gets the smallest id, 0
+		node = new Node(0, attrNames[attrNum - 1]);
+		this.totalNodes.add(node);
+		for (int i = 0; i < attrNames.length; i++) {
+			if (i < attrNum - 1) {
+				// create one Bayesian network node per condition attribute
+				node = new Node(i + 1, attrNames[i]);
+				this.totalNodes.add(node);
+			}
+
+			// record the column index of this attribute
+			this.attr2Column.put(attrNames[i], i);
+		}
+
+		String[] temp;
+		ArrayList<String> values;
+		// collect the observed values of every attribute
+		for (int i = 1; i < this.totalDatas.size(); i++) {
+			temp = this.totalDatas.get(i);
+
+			for (int j = 0; j < temp.length; j++) {
+				// check whether the map already knows this attribute name
+				if (this.attr2Values.containsKey(attrNames[j])) {
+					values = this.attr2Values.get(attrNames[j]);
+				} else {
+					values = new ArrayList<>();
+				}
+
+				if (!values.contains(temp[j])) {
+					// record the newly seen attribute value
+					values.add(temp[j]);
+				}
+
+				this.attr2Values.put(attrNames[j], values);
+			}
+		}
+	}
+
+	/**
+	 * Build the maximum weight spanning tree from the mutual information node
+	 * pairs; the first node inserted serves as the root
+	 * 
+	 * @param iArray
+	 */
+	private Node constructWeightTree(ArrayList<Node[]> iArray) {
+		Node node1;
+		Node node2;
+		Node root;
+		ArrayList<Node> existNodes;
+
+		existNodes = new ArrayList<>();
+
+		for (Node[] i : iArray) {
+			node1 = i[0];
+			node2 = i[1];
+
+			// connect the two nodes
+			node1.connectNode(node2);
+			// track the already connected nodes to avoid cycles
+			addIfNotExist(node1, existNodes);
+			addIfNotExist(node2, existNodes);
+
+			if (existNodes.size() == attrNum - 1) {
+				break;
+			}
+		}
+
+		// return the first node as the root
+		root = existNodes.get(0);
+		return root;
+	}
+
+	/**
+	 * Orient the edges of the tree so they point from the root towards the
+	 * other attribute nodes
+	 * 
+	 * @param currentNode
+	 *            the node currently being visited
+	 */
+	private void confirmGraphDirection(Node currentNode) {
+		int i;
+		int j;
+		ArrayList<Node> connectedNodes;
+
+		connectedNodes = currentNode.connectedNodes;
+
+		i = currentNode.id;
+		for (Node n : connectedNodes) {
+			j = n.id;
+
+			// check whether the edge between these two nodes is oriented yet
+			if (edges[i][j] == 0 && edges[j][i] == 0) {
+				// if not, orient it as i -> j
+				edges[i][j] = 1;
+
+				// continue the search recursively
+				confirmGraphDirection(n);
+			}
+		}
+	}
+
+	/**
+	 * Attach the class attribute node as parent of every attribute node
+	 */
+	private void addParentNode() {
+		// the class attribute node
+		Node parentNode;
+
+		parentNode = null;
+		for (Node n : this.totalNodes) {
+			if (n.id == 0) {
+				parentNode = n;
+				break;
+			}
+		}
+
+		for (Node child : this.totalNodes) {
+			parentNode.connectNode(child);
+
+			if (child.id != 0) {
+				// orient the edge from the class node to the child
+				this.edges[0][child.id] = 1;
+			}
+		}
+	}
+
+	/**
+	 * Add a node to the node collection if it is not already present
+	 * 
+	 * @param node
+	 *            the node to add
+	 * @param existNodes
+	 *            the list of already present nodes
+	 * @return
+	 */
+	public boolean addIfNotExist(Node node, ArrayList<Node> existNodes) {
+		boolean canAdd;
+
+		canAdd = true;
+		for (Node n : existNodes) {
+			// if the list already contains the node, adding fails
+			if (n.isEqual(node)) {
+				canAdd = false;
+				break;
+			}
+		}
+
+		if (canAdd) {
+			existNodes.add(node);
+		}
+
+		return canAdd;
+	}
+
+	/**
+	 * Compute the conditional probability of the given node
+	 * 
+	 * @param node
+	 *            the node whose posterior probability is computed
+	 * @param queryParam
+	 *            the attribute values of the query
+	 * @return
+	 */
+	private double calConditionPro(Node node, HashMap<String, String> queryParam) {
+		int id;
+		double pro;
+		String value;
+		String[] attrValue;
+
+		ArrayList<String[]> priorAttrInfos;
+		ArrayList<String[]> backAttrInfos;
+		ArrayList<Node> parentNodes;
+
+		pro = 1;
+		id = node.id;
+		parentNodes = new ArrayList<>();
+		priorAttrInfos = new ArrayList<>();
+		backAttrInfos = new ArrayList<>();
+
+		for (int i = 0; i < this.edges.length; i++) {
+			// look for the ids of the parent nodes
+			if (this.edges[i][id] == 1) {
+				for (Node temp : this.totalNodes) {
+					// find the node that carries this id
+					if (temp.id == i) {
+						parentNodes.add(temp);
+						break;
+					}
+				}
+			}
+		}
+
+		// first add the attribute/value pair of the node itself
+		value = queryParam.get(node.name);
+		attrValue = new String[2];
+		attrValue[0] = node.name;
+		attrValue[1] = value;
+		priorAttrInfos.add(attrValue);
+
+		// then add the pairs of the parent (conditioning) attributes one by one
+		for (Node p : parentNodes) {
+			value = queryParam.get(p.name);
+			attrValue = new String[2];
+			attrValue[0] = p.name;
+			attrValue[1] = value;
+
+			backAttrInfos.add(attrValue);
+		}
+
+		pro = queryConditionPro(priorAttrInfos, backAttrInfos);
+
+		return pro;
+	}
+	/**
+	 * Query a conditional probability by counting matching data rows
+	 * 
+	 * @param priorValues
+	 *            attribute/value pairs of the target event
+	 * @param backValues
+	 *            attribute/value pairs of the conditioning event
+	 * @return
+	 */
+	private double queryConditionPro(ArrayList<String[]> priorValues,
+			ArrayList<String[]> backValues) {
+		// whether the row satisfies the target condition
+		boolean hasPrior;
+		// whether the row satisfies the conditioning event
+		boolean hasBack;
+		int attrIndex;
+		double backPro;
+		double totalPro;
+		double pro;
+		String[] tempData;
+
+		pro = 0;
+		totalPro = 0;
+		backPro = 0;
+
+		// skip the attribute name line at row 0
+		for (int i = 1; i < this.totalDatas.size(); i++) {
+			tempData = this.totalDatas.get(i);
+
+			hasPrior = true;
+			hasBack = true;
+
+			// test the target condition
+			for (String[] array : priorValues) {
+				attrIndex = this.attr2Column.get(array[0]);
+
+				// does the row value match?
+				if (!tempData[attrIndex].equals(array[1])) {
+					hasPrior = false;
+					break;
+				}
+			}
+
+			// test the conditioning event
+			for (String[] array : backValues) {
+				attrIndex = this.attr2Column.get(array[0]);
+
+				// does the row value match?
+				if (!tempData[attrIndex].equals(array[1])) {
+					hasBack = false;
+					break;
+				}
+			}
+
+			// count the rows matching the condition, and among those the rows
+			// that also match the target
+			if (hasBack) {
+				backPro++;
+				if (hasPrior) {
+					totalPro++;
+				}
+			} else if (hasPrior && backValues.size() == 0) {
+				// with no conditioning event this is a plain probability
+				totalPro++;
+				backPro = 1.0;
+			}
+		}
+
+		if (backPro == 0) {
+			pro = 0;
+		} else {
+			// overall probability = joint count / count of the condition alone
+			pro = totalPro / backPro;
+		}
+
+		return pro;
+	}
+
+	/**
+	 * Compute the probability of the event described by the query parameters
+	 * 
+	 * @param queryParam
+	 *            the conditional query string
+	 * @return
+	 */
+	public double calHappenedPro(String queryParam) {
+		double result;
+		double temp;
+		// the class attribute value
+		String classAttrValue;
+		String[] array;
+		String[] array2;
+		HashMap<String, String> params;
+
+		result = 1;
+		params = new HashMap<>();
+
+		// split the query string into attribute/value parameters
+		array = queryParam.split(",");
+		for (String s : array) {
+			array2 = s.split("=");
+			params.put(array2[0], array2[1]);
+		}
+
+		classAttrValue = params.get(classAttrName);
+		// build the Bayesian network structure for this class value
+		constructBayesNetWork(classAttrValue);
+
+		for (Node n : this.totalNodes) {
+			temp = calConditionPro(n, params);
+
+			// slightly correct zero factors so they cannot wipe out the product
+			if (temp == 0) {
+				temp = 0.001;
+			}
+
+			// joint probability: multiply the conditional factors together
+			result *= temp;
+		}
+
+		return result;
+	}
+
+	/**
+	 * Build the tree shaped Bayesian network
+	 * 
+	 * @param value
+	 *            the class attribute value
+	 */
+	private void constructBayesNetWork(String value) {
+		Node rootNode;
+		ArrayList<AttrMutualInfo> mInfoArray;
+		// the mutual information node pairs
+		ArrayList<Node[]> iArray;
+
+		iArray = null;
+		rootNode = null;
+
+		// clear the old connections whenever the network is rebuilt
+		for (Node n : this.totalNodes) {
+			n.connectedNodes.clear();
+		}
+		this.edges = new int[attrNum][attrNum];
+
+		// extract the node pairs from the mutual information objects
+		iArray = new ArrayList<>();
+		mInfoArray = calAttrMutualInfoArray(value);
+		for (AttrMutualInfo v : mInfoArray) {
+			iArray.add(v.nodeArray);
+		}
+
+		// build the maximum weight spanning tree
+		rootNode = constructWeightTree(iArray);
+		// orient the edges of the undirected graph
+		confirmGraphDirection(rootNode);
+		// attach the class attribute node as parent of every attribute node
+		addParentNode();
+	}
+
+	/**
+	 * Compute the mutual information between all attribute pairs for the given
+	 * class value
+	 * 
+	 * @param value
+	 *            the class attribute value
+	 * @return
+	 */
+	private ArrayList<AttrMutualInfo> calAttrMutualInfoArray(String value) {
+		double iValue;
+		Node node1;
+		Node node2;
+		AttrMutualInfo mInfo;
+		ArrayList<AttrMutualInfo> mInfoArray;
+
+		mInfoArray = new ArrayList<>();
+
+		for (int i = 0; i < this.totalNodes.size() - 1; i++) {
+			node1 = this.totalNodes.get(i);
+			// skip the class attribute node
+			if (node1.id == 0) {
+				continue;
+			}
+
+			for (int j = i + 1; j < this.totalNodes.size(); j++) {
+				node2 = this.totalNodes.get(j);
+				// skip the class attribute node
+				if (node2.id == 0) {
+					continue;
+				}
+
+				// mutual information of the two attribute nodes
+				iValue = calMutualInfoValue(node1, node2, value);
+				mInfo = new AttrMutualInfo(iValue, node1, node2);
+				mInfoArray.add(mInfo);
+			}
+		}
+
+		// sort in descending order so high values are used first for the tree
+		Collections.sort(mInfoArray);
+
+		return mInfoArray;
+	}
+
+	/**
+	 * Compute the mutual information of two attribute nodes
+	 * 
+	 * @param node1
+	 *            first node
+	 * @param node2
+	 *            second node
+	 * @param value
+	 *            the class attribute value
+	 */
+	private double calMutualInfoValue(Node node1, Node node2, String value) {
+		double iValue;
+		double temp;
+		// the three conditional probabilities involved
+		double pXiXj;
+		double pXi;
+		double pXj;
+		String[] array1;
+		String[] array2;
+		ArrayList<String> attrValues1;
+		ArrayList<String> attrValues2;
+		ArrayList<String[]> priorValues;
+		// the conditioning event, here always the class value
+		ArrayList<String[]> backValues;
+
+		array1 = new String[2];
+		array2 = new String[2];
+		priorValues = new ArrayList<>();
+		backValues = new ArrayList<>();
+
+		iValue = 0;
+		array1[0] = classAttrName;
+		array1[1] = value;
+		// the conditioning attribute is always the class attribute
+		backValues.add(array1);
+
+		// fetch the observed value sets of both attributes
+		attrValues1 = this.attr2Values.get(node1.name);
+		attrValues2 = this.attr2Values.get(node2.name);
+
+		for (String v1 : attrValues1) {
+			for (String v2 : attrValues2) {
+				priorValues.clear();
+
+				array1 = new String[2];
+				array1[0] = node1.name;
+				array1[1] = v1;
+				priorValues.add(array1);
+
+				array2 = new String[2];
+				array2[0] = node2.name;
+				array2[1] = v2;
+				priorValues.add(array2);
+
+				// compute the three conditional probabilities
+				pXiXj = queryConditionPro(priorValues, backValues);
+
+				priorValues.clear();
+				priorValues.add(array1);
+				pXi = queryConditionPro(priorValues, backValues);
+
+				priorValues.clear();
+				priorValues.add(array2);
+				pXj = queryConditionPro(priorValues, backValues);
+
+				// if any count is zero the term is taken as zero directly
+				if (pXiXj == 0 || pXi == 0 || pXj == 0) {
+					temp = 0;
+				} else {
+					// the term contributed by this pair of attribute values
+					temp = pXiXj * Math.log(pXiXj / (pXi * pXj)) / Math.log(2);
+				}
+
+				// summing over all value pairs yields the mutual information
+				iValue += temp;
+			}
+		}
+
+		return iValue;
+	}
+}
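For reference, the quantity the loops in calMutualInfoValue accumulate is the class-conditional mutual information of the two attributes; with $C$ fixed to the queried class value $c$, the three probabilities are exactly the three queryConditionPro calls above:

$$I(X_i;X_j\mid C=c)=\sum_{x_i}\sum_{x_j} P(x_i,x_j\mid c)\,\log_2\frac{P(x_i,x_j\mid c)}{P(x_i\mid c)\,P(x_j\mid c)}$$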
diff --git a/Others/DataMining_TAN/input.txt b/Others/DataMining_TAN/input.txt
new file mode 100644
index 0000000..aea7074
--- /dev/null
+++ b/Others/DataMining_TAN/input.txt
@@ -0,0 +1,15 @@
+OutLook Temperature Humidity Wind PlayTennis
+Sunny Hot High Weak No
+Sunny Hot High Strong No
+Overcast Hot High Weak Yes
+Rainy Mild High Weak Yes
+Rainy Cool Normal Weak Yes
+Rainy Cool Normal Strong No
+Overcast Cool Normal Strong Yes
+Sunny Mild High Weak No
+Sunny Cool Normal Weak Yes
+Rainy Mild Normal Weak Yes
+Sunny Mild Normal Strong Yes
+Overcast Mild High Strong Yes
+Overcast Hot Normal Weak Yes
+Rainy Mild High Strong No
\ No newline at end of file
diff --git a/Others/DataMining_Viterbi/BaseNames.java b/Others/DataMining_Viterbi/BaseNames.java
new file mode 100644
index 0000000..cca0aaa
--- /dev/null
+++ b/Others/DataMining_Viterbi/BaseNames.java
@@ -0,0 +1,24 @@
+package DataMining_Viterbi;
+
+/**
+ * Definitions of the basic constants
+ * 
+ * @author lyq
+ * 
+ */
+public class BaseNames {
+	// day indices
+	public static final int DAY1 = 0;
+	public static final int DAY2 = 1;
+	public static final int DAY3 = 2;
+
+	// weather categories
+	public static final int WEATHER_SUNNY = 0;
+	public static final int WEATHER_CLOUDY = 1;
+	public static final int WEATHER_RAINY = 2;
+
+	// humidity categories
+	public static final int HUMIDITY_DRY = 0;
+	public static final int HUMIDITY_DRYISH = 1;
+	public static final int HUMIDITY_DAMP = 2;
+	public static final int HUMIDITY_SOGGY = 3;
+}
diff --git a/Others/DataMining_Viterbi/Client.java b/Others/DataMining_Viterbi/Client.java
new file mode 100644
index 0000000..577eabd
--- /dev/null
+++ b/Others/DataMining_Viterbi/Client.java
@@ -0,0 +1,31 @@
+package DataMining_Viterbi;
+
+/**
+ * The Viterbi algorithm
+ * 
+ * @author lyq
+ * 
+ */
+public class Client {
+	public static void main(String[] args) {
+		// path of the state transition matrix file
+		String stmFilePath;
+		// path of the confusion matrix file
+		String cfFilePath;
+		// the observed state sequence
+		String[] observeStates;
+		// the initial state probabilities
+		double[] initStatePro;
+		ViterbiTool tool;
+
+		stmFilePath = "C:\\Users\\lyq\\Desktop\\icon\\stmatrix.txt";
+		cfFilePath = "C:\\Users\\lyq\\Desktop\\icon\\humidity-matrix.txt";
+
+		initStatePro = new double[] { 0.63, 0.17, 0.20 };
+		observeStates = new String[] { "Dry", "Damp", "Soggy" };
+
+		tool = new ViterbiTool(stmFilePath, cfFilePath, initStatePro,
+				observeStates);
+		tool.calHMMObserve();
+	}
+}
"Damp", "Soggy" }; + + tool = new ViterbiTool(stmFilePath, cfFilePath, initStatePro, + observeStates); + tool.calHMMObserve(); + } +} diff --git a/Others/DataMining_Viterbi/ViterbiTool.java b/Others/DataMining_Viterbi/ViterbiTool.java new file mode 100644 index 0000000..6f1ade6 --- /dev/null +++ b/Others/DataMining_Viterbi/ViterbiTool.java @@ -0,0 +1,240 @@ +package DataMining_Viterbi; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +/** + * 维特比算法工具类 + * + * @author lyq + * + */ +public class ViterbiTool { + // 状态转移概率矩阵文件地址 + private String stmFilePath; + // 混淆矩阵文件地址 + private String confusionFilePath; + // 初始状态概率 + private double[] initStatePro; + // 观察到的状态序列 + public String[] observeStates; + // 状态转移矩阵值 + private double[][] stMatrix; + // 混淆矩阵值 + private double[][] confusionMatrix; + // 各个条件下的潜在特征概率值 + private double[][] potentialValues; + // 潜在特征 + private ArrayList potentialAttrs; + // 属性值列坐标映射图 + private HashMap name2Index; + // 列坐标属性值映射图 + private HashMap index2name; + + public ViterbiTool(String stmFilePath, String confusionFilePath, + double[] initStatePro, String[] observeStates) { + this.stmFilePath = stmFilePath; + this.confusionFilePath = confusionFilePath; + this.initStatePro = initStatePro; + this.observeStates = observeStates; + + initOperation(); + } + + /** + * 初始化数据操作 + */ + private void initOperation() { + double[] temp; + int index; + ArrayList smtDatas; + ArrayList cfDatas; + + smtDatas = readDataFile(stmFilePath); + cfDatas = readDataFile(confusionFilePath); + + index = 0; + this.stMatrix = new double[smtDatas.size()][]; + for (String[] array : smtDatas) { + temp = new double[array.length]; + for (int i = 0; i < array.length; i++) { + try { + temp[i] = Double.parseDouble(array[i]); + } catch (NumberFormatException e) { + temp[i] = -1; + } + } + + // 将转换后的值赋给数组中 + this.stMatrix[index] = temp; + index++; + } + + index = 0; + this.confusionMatrix = new double[cfDatas.size()][]; + for (String[] array : cfDatas) { + temp = new double[array.length]; + for (int i = 0; i < array.length; i++) { + try { + temp[i] = Double.parseDouble(array[i]); + } catch (NumberFormatException e) { + temp[i] = -1; + } + } + + // 将转换后的值赋给数组中 + this.confusionMatrix[index] = temp; + index++; + } + + this.potentialAttrs = new ArrayList<>(); + // 添加潜在特征属性 + for (String s : smtDatas.get(0)) { + this.potentialAttrs.add(s); + } + // 去除首列无效列 + potentialAttrs.remove(0); + + this.name2Index = new HashMap<>(); + this.index2name = new HashMap<>(); + + // 添加名称下标映射关系 + for (int i = 1; i < smtDatas.get(0).length; i++) { + this.name2Index.put(smtDatas.get(0)[i], i); + // 添加下标到名称的映射 + this.index2name.put(i, smtDatas.get(0)[i]); + } + + for (int i = 1; i < cfDatas.get(0).length; i++) { + this.name2Index.put(cfDatas.get(0)[i], i); + } + } + + /** + * 从文件中读取数据 + */ + private ArrayList readDataFile(String filePath) { + File file = new File(filePath); + ArrayList dataArray = new ArrayList(); + + try { + BufferedReader in = new BufferedReader(new FileReader(file)); + String str; + String[] tempArray; + while ((str = in.readLine()) != null) { + tempArray = str.split(" "); + dataArray.add(tempArray); + } + in.close(); + } catch (IOException e) { + e.getStackTrace(); + } + + return dataArray; + } + + /** + * 根据观察特征计算隐藏的特征概率矩阵 + */ + private void calPotencialProMatrix() { + String curObserveState; + // 观察特征和潜在特征的下标 + int osIndex; + int psIndex; + double temp; + double maxPro; + 
diff --git a/Others/DataMining_Viterbi/humidity-matrix.txt b/Others/DataMining_Viterbi/humidity-matrix.txt
new file mode 100644
index 0000000..ff41df6
--- /dev/null
+++ b/Others/DataMining_Viterbi/humidity-matrix.txt
@@ -0,0 +1,4 @@
+# Dry Dryish Damp Soggy
+Sunny 0.6 0.2 0.15 0.05
+Cloudy 0.25 0.25 0.25 0.25
+Rainy 0.05 0.10 0.35 0.50
\ No newline at end of file
diff --git a/Others/DataMining_Viterbi/stmatrix.txt b/Others/DataMining_Viterbi/stmatrix.txt
new file mode 100644
index 0000000..af66956
--- /dev/null
+++ b/Others/DataMining_Viterbi/stmatrix.txt
@@ -0,0 +1,4 @@
+# Sunny Cloudy Rainy
+Sunny 0.5 0.375 0.125
+Cloudy 0.25 0.125 0.625
+Rainy 0.25 0.375 0.375
\ No newline at end of file
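Plugging these two matrices and the initial distribution from Client into the recursion above gives a small worked check (values rounded) for the observation sequence Dry, Damp, Soggy:

$$\delta_1(\text{Sunny})=0.63\times 0.6=0.378,\quad \delta_1(\text{Cloudy})=0.17\times 0.25=0.0425,\quad \delta_1(\text{Rainy})=0.20\times 0.05=0.01$$

$$\delta_2(\text{Cloudy})=0.378\times 0.375\times 0.25\approx 0.0354,\qquad \delta_3(\text{Rainy})\approx 0.0354\times 0.625\times 0.50\approx 0.0111$$

These are the per-day maxima, so for this data the tool prints the hidden sequence Sunny, Cloudy, Rainy.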
diff --git a/README.md b/README.md
index ed399c2..7dd82f0 100644
--- a/README.md
+++ b/README.md
@@ -1,58 +1,142 @@
-# DataMiningAlgorithm
-Implementations of classic data mining algorithms with detailed comments
+# Data Mining Algorithms
+
+## Algorithm catalog
+#### The 18 classic DM algorithms
+Package | Directory | Algorithm |
+-----| ------ |--------|
+AssociationAnalysis | DataMining_Apriori | Apriori - association rule mining
+AssociationAnalysis | DataMining_FPTree | FPTree - frequent pattern tree
+BaggingAndBoosting | DataMining_AdaBoost | AdaBoost - bagging and boosting
+Classification | DataMining_CART | CART - classification and regression tree
+Classification | DataMining_ID3 | ID3 - decision tree classification
+Classification | DataMining_KNN | KNN - k-nearest-neighbour tool class
+Classification | DataMining_NaiveBayes | NaiveBayes - naive Bayes
+Clustering | DataMining_BIRCH | BIRCH - hierarchical clustering
+Clustering | DataMining_KMeans | KMeans - k-means
+GraphMining | DataMining_GSpan | GSpan - frequent subgraph mining
+IntegratedMining | DataMining_CBA | CBA - classification based on association rules
+LinkMining | DataMining_HITS | HITS - link analysis
+LinkMining | DataMining_PageRank | PageRank - web page importance/ranking
+RoughSets | DataMining_RoughSets | RoughSets - rough set attribute reduction
+SequentialPatterns | DataMining_GSP | GSP - sequential pattern analysis
+SequentialPatterns | DataMining_PrefixSpan | PrefixSpan - sequential pattern analysis
+StatisticalLearning | DataMining_EM | EM - expectation maximization
+StatisticalLearning | DataMining_SVM | SVM - support vector machine

-Implementations of the 18 classic data mining algorithms, covering decision classification, clustering, link mining, association mining, pattern mining and more; each entry is followed by a link to a blog post on the algorithm, which will hopefully help with studying.
+#### Other classic DM algorithms
+Package | Directory | Algorithm |
+-----| ------ |--------|
+Others | DataMining_ACO | ACO - ant colony optimization
+Others | DataMining_BayesNetwork | BayesNetwork - Bayesian network
+Others | DataMining_CABDDCC | CABDDCC - divisive clustering based on connected graphs
+Others | DataMining_Chameleon | Chameleon - two-phase merge clustering
+Others | DataMining_DBSCAN | DBSCAN - density-based clustering
+Others | DataMining_GA | GA - genetic algorithm
+Others | DataMining_GA_Maze | GA_Maze - genetic algorithm applied to maze solving
+Others | DataMining_KDTree | KDTree - k-dimensional key data search tool class
+Others | DataMining_MSApriori | MSApriori - Apriori with multiple minimum supports
+Others | DataMining_RandomForest | RandomForest - random forest
+Others | DataMining_TAN | TAN - tree augmented naive Bayes
+Others | DataMining_Viterbi | Viterbi - Viterbi algorithm

-1. C4.5. Like ID3, C4.5 is a decision-tree classification algorithm; it is an improvement of ID3. ID3 makes its split decisions by information gain, while C4.5 uses the gain ratio.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/42395865
+## The 18 classic DM algorithms
+Implementations of the 18 classic data mining algorithms, covering decision classification, clustering, link mining, association mining, pattern mining and more; each entry ends with a link to a blog post on the algorithm, which will hopefully help with studying.
+Some other classic DM algorithms have since been added under the Others package, touching on clustering, classification, graph algorithms, search algorithms and so on, without a finer categorisation.

-2. CART. CART stands for classification and regression tree. It is a binary classifier that uses the Gini index, a quantity similar to entropy, for its split decisions; after the tree is grown it still has to be pruned, and my own implementation uses the cost-complexity pruning algorithm.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/42558235
+* ### C4.5
+Like ID3, C4.5 is a decision-tree classification algorithm; it is an improvement of ID3. ID3 makes its split decisions by information gain, while C4.5 uses the gain ratio. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/42395865)

-3. KNN (k-nearest neighbours). Given already-trained data and a new test point, look at the classes of the points closest to the test point; the class holding the majority among those neighbours is assigned to the test point. Different neighbours may also be given different weights: closer points weigh more, farther points naturally less.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/42613011
+* ### CART
+CART stands for classification and regression tree. It is a binary classifier that uses the Gini index, a quantity similar to entropy, for its split decisions; after the tree is grown it still has to be pruned, and my own implementation uses the cost-complexity pruning algorithm. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/42558235)

-4. Naive Bayes. The naive Bayes algorithm is one of the simpler classifiers in the Bayesian family. It relies on the important Bayes theorem, which in one sentence is the rule for converting between conditional probabilities.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/42680161
+* ### KNN
+The k-nearest-neighbour algorithm. Given already-trained data and a new test point, look at the classes of the points closest to the test point; the class holding the majority among those neighbours is assigned to the test point. Different neighbours may also be given different weights: closer points weigh more, farther points naturally less. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/42613011)

-5. SVM (support vector machine). The support vector machine is a method for classifying both linear and non-linear data; non-linear data can be handled by mapping it to a linear case through a kernel function. A key step is the search for the maximum-margin hyperplane.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/42780439
+* ### Naive Bayes
+The naive Bayes algorithm is one of the simpler classifiers in the Bayesian family. It relies on the important Bayes theorem, which in one sentence is the rule for converting between conditional probabilities. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/42680161)
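The "conversion between conditional probabilities" that the Naive Bayes entry refers to is Bayes' theorem; written out once, with the naive independence assumption giving the product form the classifier actually uses:

$$P(C\mid X)=\frac{P(X\mid C)\,P(C)}{P(X)},\qquad P(C\mid x_1,\dots,x_n)\;\propto\;P(C)\prod_{k=1}^{n}P(x_k\mid C)$$

The TAN and BayesNetwork entries below relax exactly this independence assumption.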
-6. EM (expectation maximization). The expectation maximization algorithm splits into two steps, an E-step (expectation) and an M-step (maximization). It is an algorithmic framework that, after each round of computation, moves closer to the maximum likelihood or maximum a posteriori estimate of the statistical model's parameters.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/42921789
+* ### SVM
+The support vector machine is a method for classifying both linear and non-linear data; non-linear data can be handled by mapping it to a linear case through a kernel function. A key step is the search for the maximum-margin hyperplane. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/42780439)

-7. Apriori. Apriori is an association rule mining algorithm: frequent itemsets are mined through join and prune operations, and association rules are then derived from the frequent itemsets; the exported rules must satisfy a minimum confidence requirement.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43059211
+* ### EM
+The expectation maximization algorithm splits into two steps, an E-step (expectation) and an M-step (maximization). It is an algorithmic framework that, after each round of computation, moves closer to the maximum likelihood or maximum a posteriori estimate of the statistical model's parameters. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/42921789)

-8. FP-Tree (frequent pattern tree). This algorithm is also known as FP-growth. It overcomes Apriori's drawback of generating too many candidate sets by recursively building the frequent pattern tree and then mining the tree; the remaining steps match Apriori.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43234309
+* ### Apriori
+Apriori is an association rule mining algorithm: frequent itemsets are mined through join and prune operations, and association rules are then derived from the frequent itemsets; the exported rules must satisfy a minimum confidence requirement. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43059211)

-9. PageRank (web page importance/ranking). PageRank originated at Google; its core idea is to use a page's in-links as the criterion for the page's quality. If a page contains several links pointing outwards, its PR value is split evenly among them; PageRank can also fall victim to link-spam attacks.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43311943
+* ### FP-Tree
+The frequent pattern tree algorithm, also known as FP-growth. It overcomes Apriori's drawback of generating too many candidate sets by recursively building the frequent pattern tree and then mining the tree; the remaining steps match Apriori. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43234309)

-10. HITS. HITS is another link analysis algorithm, partly similar in principle to PageRank. It introduces the notions of authority and hub values and is influenced by the user's query; it is generally used for link analysis of small-scale data, which also makes it easier to attack.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43311943
+* ### PageRank
+Web page importance/ranking algorithm. PageRank originated at Google; its core idea is to use a page's in-links as the criterion for the page's quality. If a page contains several links pointing outwards, its PR value is split evenly among them; PageRank can also fall victim to link-spam attacks. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43311943)

-11. K-Means. K-Means is a clustering algorithm; k denotes the number of clusters, so fixing it at the start is crucial. The algorithm first assumes k cluster centres, assigns points by a distance formula, then takes the mean of each cluster as the new centre, looping until convergence.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43373159
+* ### HITS
+HITS is another link analysis algorithm, partly similar in principle to PageRank. It introduces the notions of authority and hub values and is influenced by the user's query; it is generally used for link analysis of small-scale data, which also makes it easier to attack. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43311943)

-12. BIRCH. BIRCH is built around the CF clustering-feature tree. Working through this tree, BIRCH scans the database and builds an initial in-memory CF tree, which can be seen as a multi-level compression of the data.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43532111
+* ### K-Means
+K-Means is a clustering algorithm; k denotes the number of clusters, so fixing it at the start is crucial. The algorithm first assumes k cluster centres, assigns points by a distance formula, then takes the mean of each cluster as the new centre, looping until convergence. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43373159)
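As a companion to the K-Means entry, a self-contained sketch of the assign/update loop it describes; plain 1-D data and a fixed k = 2, unrelated to the repository's DataMining_KMeans code:

```java
import java.util.Arrays;

/** Minimal k-means on 1-D points: assign to the nearest centre, re-average, repeat. */
public class KMeansSketch {
    public static void main(String[] args) {
        double[] points = { 1.0, 1.2, 0.8, 5.0, 5.2, 4.8 };
        double[] centers = { 0.0, 1.0 };          // k = 2, arbitrary starting centres
        int[] label = new int[points.length];

        for (int iter = 0; iter < 10; iter++) {   // fixed iteration cap instead of a convergence test
            // assignment step: nearest centre by absolute distance
            for (int i = 0; i < points.length; i++) {
                label[i] = Math.abs(points[i] - centers[0])
                        <= Math.abs(points[i] - centers[1]) ? 0 : 1;
            }
            // update step: each centre becomes the mean of its cluster
            for (int c = 0; c < centers.length; c++) {
                double sum = 0;
                int n = 0;
                for (int i = 0; i < points.length; i++) {
                    if (label[i] == c) { sum += points[i]; n++; }
                }
                if (n > 0) centers[c] = sum / n;
            }
        }
        System.out.println(Arrays.toString(centers)); // converges to roughly [1.0, 5.0]
    }
}
```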
-13. AdaBoost. AdaBoost is a boosting algorithm: training the data several times yields several complementary classifiers, which are then combined into one more accurate classifier.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43635115
+* ### BIRCH
+BIRCH is built around the CF clustering-feature tree. Working through this tree, BIRCH scans the database and builds an initial in-memory CF tree, which can be seen as a multi-level compression of the data. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43532111)

-14. GSP. GSP is a sequential pattern mining algorithm of the Apriori family; it also performs join and prune operations, but the pruning test additionally involves time constraints and similar conditions.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43699083
+* ### AdaBoost
+AdaBoost is a boosting algorithm: training the data several times yields several complementary classifiers, which are then combined into one more accurate classifier. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43635115)
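The "combination" in the AdaBoost entry is concrete: in the usual two-class formulation with labels $y_i\in\{-1,+1\}$, round $m$ trains a weak classifier $h_m$ with weighted error $\varepsilon_m$, and

$$\alpha_m=\tfrac{1}{2}\ln\frac{1-\varepsilon_m}{\varepsilon_m},\qquad w_i\leftarrow \frac{w_i\,e^{-\alpha_m y_i h_m(x_i)}}{Z_m},\qquad H(x)=\operatorname{sign}\Big(\sum_m \alpha_m h_m(x)\Big)$$

Misclassified samples gain weight, so the next round's classifier complements the previous ones; $Z_m$ just renormalizes the weights.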
-15. PreFixSpan. PreFixSpan is another sequential pattern mining algorithm. It produces no candidate sets: starting from an initial prefix pattern, elements of the suffix pattern are moved into the prefix, and the mining recurses on.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43766253
+* ### GSP
+GSP is a sequential pattern mining algorithm of the Apriori family; it also performs join and prune operations, but the pruning test additionally involves time constraints and similar conditions. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43699083)

-16. CBA (classification based on association rules). CBA is an integrated mining algorithm because it is built on top of association rule mining: classification decisions are made under the existing association rule theory, and the data is merely transformed into a transaction-like form when the algorithm starts.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43818787
+* ### PreFixSpan
+PreFixSpan is another sequential pattern mining algorithm. It produces no candidate sets: starting from an initial prefix pattern, elements of the suffix pattern are moved into the prefix, and the mining recurses on. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43766253)

-17. RoughSets. Rough set theory is a relatively novel data mining idea. Here it is used for attribute reduction: ineffective attributes are removed by comparing upper and lower approximation sets, and the resulting rules are output.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43876001
+* ### CBA
+Classification based on association rules. CBA is an integrated mining algorithm because it is built on top of association rule mining: classification decisions are made under the existing association rule theory, and the data is merely transformed into a transaction-like form when the algorithm starts. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43818787)

-18. gSpan. gSpan belongs to the field of graph mining and is mainly used for frequent subgraph mining; compared with other graph algorithms, subgraph mining is their precondition or base algorithm. gSpan uses DFS codes, edge five-tuples and rightmost-path subgraph extension, which makes it rather abstract and complex.
-Detailed introduction: http://blog.csdn.net/androidlushangderen/article/details/43924273
+* ### RoughSets
+The rough sets algorithm. Rough set theory is a relatively novel data mining idea. Here it is used for attribute reduction: ineffective attributes are removed by comparing upper and lower approximation sets, and the resulting rules are output. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43876001)
+
+* ### GSpan
+gSpan belongs to the field of graph mining and is mainly used for frequent subgraph mining; compared with other graph algorithms, subgraph mining is their precondition or base algorithm. gSpan uses DFS codes, edge five-tuples and rightmost-path subgraph extension, which makes it rather abstract and complex. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/43924273)
+
+## Algorithms under the Others directory:
+
+* ### GA
+The genetic algorithm applies knowledge of biological evolution theory to searching for the optimal solution of a problem. Its evolutionary process consists of selection, crossover and mutation operations; selection is the crucial step, passing the fitter genomes on to the next generation. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44041499)
+
+* ### DbScan
+Density-based spatial clustering. As a special clustering algorithm, dbScan makes up for shortcomings of other algorithms: clustering by spatial density, it can discover clusters of arbitrary shape. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44311309)
+
+* ### GA_Maze
+An application of the genetic algorithm to maze solving. Finding the path to the maze exit is recast as a genetic algorithm problem: by building a fitness function tailored to it and encoding the gene moves as directions, the problem is solved rather neatly. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44656809)
+
+* ### CABDDCC
+A divisive clustering algorithm based on connected graphs, also a hierarchical clustering algorithm. It has two main phases: the first builds the connected graph, the second splits it, finally yielding the clustering result. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44463997)
+
+* ### Chameleon
+A two-phase merge clustering algorithm, the opposite of CABDDCC: the final result is formed by merging small clusters. The first phase uses the k-nearest-neighbour idea to build small-scale connected graphs; the second uses RI (relative interconnectivity) and RC (relative closeness) to pick the best pair of clusters to merge. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44569077)
+
+* ### RandomForest
+The random forest algorithm. The idea is decision trees plus bagging: the trees are CART classification and regression trees, and combining the weak classifiers of the individual trees yields a final strong classifier. Each subtree is built from a randomly drawn number of sample rows and a random subset of the attributes, which avoids overfitting. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44756943)
+
+* ### KDTree
+K-dimension tree: a partition tree over multi-dimensional space for dividing and searching data in many dimensions. Mainly used for key information search, it resembles binary search carried into space and greatly improves search efficiency; target lookup uses DFS descent plus backtracking for the nearest-point search. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/44985259)
+
+* ### MS-Apriori
+Apriori with multiple minimum supports, an upgraded Apriori that fixes shortcomings of the original. It adds a support difference constraint and optimizes the support counting so the whole data set need not be rescanned; when generating association rules, some confidence computations can be skipped using subset relations. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/45082337)
+
+* ### ACO
+The ant colony algorithm. Like GA it borrows a law of nature: it is a probabilistic algorithm for finding optimal paths in a graph, inspired by the way ants spread pheromone along the paths they discover while searching for food. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/45395491)
+
+* ### BayesNetwork
+The Bayesian network algorithm removes naive Bayes' mandatory event-independence assumption by using a DAG (directed acyclic graph), allowing events to keep certain dependencies. Each node in the network stands for an attribute and each edge carries the corresponding conditional probability value; evaluating the network yields precise classification results. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/46683729)
+
+* ### TAN
+Tree augmented naive Bayes, also called the strengthened naive Bayes algorithm. On top of the original naive Bayes assumptions it allows direct associations between condition attributes, forming a tree structure. [Detailed introduction](http://blog.csdn.net/androidlushangderen/article/details/46763427)
+
+* ### Viterbi
+The Viterbi algorithm. Given a hidden Markov model and an observation sequence, it recovers the hidden state sequence, where each hidden state is also influenced by the previous one.
+
+## How to use the algorithms
+Each algorithm comes in three parts: the main algorithm program, the calling program and the input data. To use one:
+* Convert your test data into the same format as the given input file.
+* Then run it the way the Client test program calls the tool class.
+* You may also adapt the algorithm program itself to fit your own scenario.
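Following those three steps, adapting a tool class usually only means swapping the input files and the query; a sketch against the ViterbiTool API from this commit, where the relative file paths, the class name and the longer observation sequence are illustrative assumptions:

```java
package DataMining_Viterbi;

/**
 * Hypothetical driver: same matrices as the stock Client, but a different
 * observation sequence, as suggested by the usage notes above.
 */
public class CustomClient {
	public static void main(String[] args) {
		// assumed local copies of stmatrix.txt / humidity-matrix.txt from this commit
		String stmFilePath = "data/stmatrix.txt";
		String cfFilePath = "data/humidity-matrix.txt";

		// initial weather distribution: Sunny, Cloudy, Rainy
		double[] initStatePro = new double[] { 0.63, 0.17, 0.20 };
		// any sequence over the humidity categories of the confusion matrix
		String[] observeStates = new String[] { "Soggy", "Soggy", "Dry", "Dryish" };

		ViterbiTool tool = new ViterbiTool(stmFilePath, cfFilePath,
				initStatePro, observeStates);
		// prints the observed sequence and the most likely weather per day
		tool.calHMMObserve();
	}
}
```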