Deployed to maven

chen0040 · chen0040 · commit b8a68bf66667 · 2017-05-25T16:51:48.000+08:00
diff --git a/README.md b/README.md
@@ -1,7 +1,63 @@
 # java-sequential-pattern-mining
-Package provides java implementation of sequential pattern mining algorithms
+This package provides a Java implementation of the GSP (Generalized Sequential Pattern) sequential pattern mining algorithm
 
 [![Build Status](https://travis-ci.org/chen0040/java-sequential-pattern-mining.svg?branch=master)](https://travis-ci.org/chen0040/java-sequential-pattern-mining) [![Coverage Status](https://coveralls.io/repos/github/chen0040/java-sequential-pattern-mining/badge.svg?branch=master)](https://coveralls.io/github/chen0040/java-sequential-pattern-mining?branch=master) 
 
 
+# Overview of GSP
+The implementation of the algorithm is based on Srikant & Agrawal, 1996.
+
+The algorithm makes multiple passes over the data. The first pass determines the support of each item, that
+is, the number of data-sequences that include the item. At the end of the first pass, the algorithm knows
+which items are frequent, that is, have minimum support. Each such item yields a 1-element frequent sequence
+consisting of that item.
+
+Each subsequent pass starts with a seed set: the frequent sequences found in the previous pass. The seed set
+is used to generate new potentially frequent sequences, called candidate sequences. Each candidate sequence
+has one more item than a seed sequence; so all the candidate sequences in a pass will have the same number of
+items. The support for these candidate sequences is found during the pass over the data. At the end of the
+pass, the algorithm determines which of the candidate sequences are actually frequent. These frequent candidates
+become the seed for the next pass.
+
+# Install
+
+Add the following dependency to your POM file:
+
+```xml
+<dependency>
+  <groupId>com.github.chen0040</groupId>
+  <artifactId>java-sequential-pattern-mining</artifactId>
+  <version>1.0.1</version>
+</dependency>
+```
+
+# Usage
+
+The sample code below illustrates how to use GSP to find the frequent sequential patterns in a simple sequence database.
+
+```java
+List<Sequence> database = new ArrayList<>();
+
+// Below are 4 sequences of transactions stored in the database
+/*
+S1 	(1), (1 2 3), (1 3), (4), (3 6)
+S2 	(1 4), (3), (2 3), (1 5)
+S3 	(5 6), (1 2), (4 6), (3), (2)
+S4 	(5), (7), (1 6), (3), (2), (3)
+*/
+
+database.add(Sequence.make("1", "1,2,3", "1,3", "4", "3,6"));
+database.add(Sequence.make("1,4", "3", "2,3", "1,5"));
+database.add(Sequence.make("5,6", "1,2", "4,6", "3", "2"));
+database.add(Sequence.make("5", "7", "1,6", "3", "2", "3"));
+
+GSP method = new GSP();
+method.setMinSupportLevel(2);
+List<String> uniqueItems = new MetaData(database).getUniqueItems();
+Sequences result = method.minePatterns(database, uniqueItems, -1);
+
+result.getSequences().stream().forEach(sequence -> {
+ System.out.println("sequence: " + sequence);
+});
+```
 
diff --git a/pom.xml b/pom.xml
@@ -14,7 +14,7 @@
     <licenses>
         <license>
             <name>MIT</name>
-            <url>https://github.com/chen0040/java-sequential-pattern-mining/LICENSE</url>
+            <url>https://github.com/chen0040/java-sequential-pattern-mining/blob/master/LICENSE</url>
             <comments>MIT License</comments>
         </license>
     </licenses>
@@ -40,8 +40,8 @@
     </issueManagement>
 
 
-    <name>Frequent Pattern Mining Algorithms</name>
-    <description>Frequent pattern mining algorithms package</description>
+    <name>General Sequential Pattern Mining</name>
+    <description>Java implementation of GSP for frequent pattern mining</description>
     <url>https://github.com/chen0040/java-sequential-pattern-mining</url>
 
     <distributionManagement>
diff --git a/src/main/java/com/github/chen0040/spm/AbstractSequentialAssocRuleMiner.java b/src/main/java/com/github/chen0040/spm/AbstractSequentialAssocRuleMiner.java
@@ -3,7 +3,6 @@
 
 import com.github.chen0040.spm.data.Sequence;
 import com.github.chen0040.spm.data.Sequences;
-import sun.reflect.generics.reflectiveObjects.NotImplementedException;
 
 import java.util.List;
 
@@ -33,6 +32,6 @@ public  Sequences minePatterns(Iterable<? extends Sequence> database, List<Strin
 
 
    public Sequences findMaxPatterns(Iterable<? extends Sequence> database, List<String> uniqueItems) {
-       throw new NotImplementedException();
+       return null;
    }
 }
diff --git a/src/main/java/com/github/chen0040/spm/apriori/GSP.java b/src/main/java/com/github/chen0040/spm/apriori/GSP.java
@@ -13,19 +13,6 @@
 
 /**
  * Created by xschen on 8/2/2015.
- * implementation of the algorithm is based on Srikant & Agrawal, 1996
- *
- * The algorithm makes multiple passes over the data. The first pass determines the support of each item, that
- * is, the numbre of data-sequences that include the item. At the end of the first pas, the algorithm knows
- * which items are frequent, that is, have minimum support. Each such item yields a 1-element frequent sequence
- * consisting of that item.
- *
- * Each subsequent pass starts with a seed set: the frequent sequences found in the previous pass. The seed set
- * is used to generate new potentially frequent sequences, called candidate sequences. Each candidate sequence
- * has one more item than a seed sequence; so all the candidate sequences in a pass will have the same number of
- * items. The support for these candidate sequences is found during the pass over the data. At the end of the
- * pass, the algorithm determines which of the candidate sequences are actually frequent. These frequent candidates
- * become the seed for the next pass.
  */
 public class GSP extends AbstractSequentialAssocRuleMiner {
 
diff --git a/src/main/java/com/github/chen0040/spm/data/Sequence.java b/src/main/java/com/github/chen0040/spm/data/Sequence.java
@@ -24,6 +24,23 @@ public Sequence(){
 
    }
 
+   public static Sequence make(String... args){
+
+      List<String> texts = new ArrayList<>();
+      for(String items : args){
+         texts.add(items);
+      }
+
+      List<ItemSetWithTimeId> itemSets = texts.stream().map(text -> new ItemSetWithTimeId(text.split(","))).collect(Collectors.toList());
+      Sequence sequence = new Sequence();
+      for(ItemSetWithTimeId element : itemSets) {
+         sequence.addElement(element);
+      }
+
+      return sequence;
+
+   }
+
 
    public List<ItemSetWithTimeId> getElements(){
       return elements;
diff --git a/src/test/java/com/github/chen0040/spm/apriori/GSPUnitTest.java b/src/test/java/com/github/chen0040/spm/apriori/GSPUnitTest.java
@@ -1,7 +1,6 @@
 package com.github.chen0040.spm.apriori;
 
 
-import com.github.chen0040.spm.data.ItemSetWithTimeId;
 import com.github.chen0040.spm.data.MetaData;
 import com.github.chen0040.spm.data.Sequence;
 import com.github.chen0040.spm.data.Sequences;
@@ -11,9 +10,6 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import java.util.stream.Collectors;
-
-import static org.testng.Assert.*;
 
 
 /**
@@ -37,7 +33,7 @@ public class GSPUnitTest {
 
 
    public List<Sequence> createSimpleSequentialData(){
-      List<Sequence> result = new ArrayList<>();
+      List<Sequence> database = new ArrayList<>();
 
       /*
       S1 	(1), (1 2 3), (1 3), (4), (3 6)
@@ -46,29 +42,14 @@ public List<Sequence> createSimpleSequentialData(){
       S4 	(5), (7), (1 6), (3), (2), (3)
       */
 
-      result.add(seq("1", "1,2,3", "1,3", "4", "3,6"));
-      result.add(seq("1,4", "3", "2,3", "1,5"));
-      result.add(seq("5,6", "1,2", "4,6", "3", "2"));
-      result.add(seq("5", "7", "1,6", "3", "2", "3"));
+      database.add(Sequence.make("1", "1,2,3", "1,3", "4", "3,6"));
+      database.add(Sequence.make("1,4", "3", "2,3", "1,5"));
+      database.add(Sequence.make("5,6", "1,2", "4,6", "3", "2"));
+      database.add(Sequence.make("5", "7", "1,6", "3", "2", "3"));
 
-      return result;
+      return database;
 
    }
 
-   private Sequence seq(String... args){
-
-      List<String> texts = new ArrayList<>();
-      for(String items : args){
-         texts.add(items);
-      }
-
-      List<ItemSetWithTimeId> itemSets = texts.stream().map(text -> new ItemSetWithTimeId(text.split(","))).collect(Collectors.toList());
-      Sequence sequence = new Sequence();
-      for(ItemSetWithTimeId element : itemSets) {
-         sequence.addElement(element);
-      }
 
-      return sequence;
-
-   }
 }

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,6 @@`
`3`	`3`
`4`	`4`	`import com.github.chen0040.spm.data.Sequence;`
`5`	`5`	`import com.github.chen0040.spm.data.Sequences;`
`6`		`-import sun.reflect.generics.reflectiveObjects.NotImplementedException;`
`7`	`6`
`8`	`7`	`import java.util.List;`
`9`	`8`
`@@ -33,6 +32,6 @@ public Sequences minePatterns(Iterable<? extends Sequence> database, List<Strin`
`33`	`32`
`34`	`33`
`35`	`34`	`public Sequences findMaxPatterns(Iterable<? extends Sequence> database, List<String> uniqueItems) {`
`36`		`- throw new NotImplementedException();`
	`35`	`+ return null;`
`37`	`36`	`}`
`38`	`37`	`}`