Skip to content

Commit b8a68bf

Browse files
committed
Deployed to maven
1 parent 84fdefd commit b8a68bf

File tree

6 files changed

+84
-44
lines changed

6 files changed

+84
-44
lines changed

README.md

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,63 @@
11
# java-sequential-pattern-mining
2-
Package provides java implementation of sequential pattern mining algorithms
2+
This package provides a Java implementation of the sequential pattern mining algorithm GSP.
33

44
[![Build Status](https://travis-ci.org/chen0040/java-sequential-pattern-mining.svg?branch=master)](https://travis-ci.org/chen0040/java-sequential-pattern-mining) [![Coverage Status](https://coveralls.io/repos/github/chen0040/java-sequential-pattern-mining/badge.svg?branch=master)](https://coveralls.io/github/chen0040/java-sequential-pattern-mining?branch=master)
55

66

7+
# Overview of GSP
8+
The implementation of the algorithm is based on Srikant & Agrawal (1996), "Mining Sequential Patterns: Generalizations and Performance Improvements".
9+
10+
The algorithm makes multiple passes over the data. The first pass determines the support of each item, that
11+
is, the number of data-sequences that include the item. At the end of the first pass, the algorithm knows
12+
which items are frequent, that is, have minimum support. Each such item yields a 1-element frequent sequence
13+
consisting of that item.
14+
15+
Each subsequent pass starts with a seed set: the frequent sequences found in the previous pass. The seed set
16+
is used to generate new potentially frequent sequences, called candidate sequences. Each candidate sequence
17+
has one more item than a seed sequence; so all the candidate sequences in a pass will have the same number of
18+
items. The support for these candidate sequences is found during the pass over the data. At the end of the
19+
pass, the algorithm determines which of the candidate sequences are actually frequent. These frequent candidates
20+
become the seed for the next pass.
21+
22+
# Install
23+
24+
Add the following dependency to your POM file:
25+
26+
```xml
27+
<dependency>
28+
<groupId>com.github.chen0040</groupId>
29+
<artifactId>java-sequential-pattern-mining</artifactId>
30+
<version>1.0.1</version>
31+
</dependency>
32+
```
33+
34+
# Usage
35+
36+
The sample code below illustrates how to use GSP to find the frequent sequential patterns in a simple sequence database.
37+
38+
```java
39+
List<Sequence> database = new ArrayList<>();
40+
41+
// Below are the 4 sequences of transactions stored in the database
42+
/*
43+
S1 (1), (1 2 3), (1 3), (4), (3 6)
44+
S2 (1 4), (3), (2 3), (1 5)
45+
S3 (5 6), (1 2), (4 6), (3), (2)
46+
S4 (5), (7), (1 6), (3), (2), (3)
47+
*/
48+
49+
database.add(Sequence.make("1", "1,2,3", "1,3", "4", "3,6"));
50+
database.add(Sequence.make("1,4", "3", "2,3", "1,5"));
51+
database.add(Sequence.make("5,6", "1,2", "4,6", "3", "2"));
52+
database.add(Sequence.make("5", "7", "1,6", "3", "2", "3"));
53+
54+
GSP method = new GSP();
55+
method.setMinSupportLevel(2);
56+
List<String> uniqueItems = new MetaData(database).getUniqueItems();
57+
Sequences result = method.minePatterns(database, uniqueItems, -1);
58+
59+
result.getSequences().stream().forEach(sequence -> {
60+
System.out.println("sequence: " + sequence);
61+
});
62+
```
763

pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
<licenses>
1515
<license>
1616
<name>MIT</name>
17-
<url>https://github.com/chen0040/java-sequential-pattern-mining/LICENSE</url>
17+
<url>https://github.com/chen0040/java-sequential-pattern-mining/blob/master/LICENSE</url>
1818
<comments>MIT License</comments>
1919
</license>
2020
</licenses>
@@ -40,8 +40,8 @@
4040
</issueManagement>
4141

4242

43-
<name>Frequent Pattern Mining Algorithms</name>
44-
<description>Frequent pattern mining algorithms package</description>
43+
<name>General Sequential Pattern Mining</name>
44+
<description>Java implementation of GSP for frequent pattern mining</description>
4545
<url>https://github.com/chen0040/java-sequential-pattern-mining</url>
4646

4747
<distributionManagement>

src/main/java/com/github/chen0040/spm/AbstractSequentialAssocRuleMiner.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import com.github.chen0040.spm.data.Sequence;
55
import com.github.chen0040.spm.data.Sequences;
6-
import sun.reflect.generics.reflectiveObjects.NotImplementedException;
76

87
import java.util.List;
98

@@ -33,6 +32,6 @@ public Sequences minePatterns(Iterable<? extends Sequence> database, List<Strin
3332

3433

3534
public Sequences findMaxPatterns(Iterable<? extends Sequence> database, List<String> uniqueItems) {
36-
throw new NotImplementedException();
35+
return null;
3736
}
3837
}

src/main/java/com/github/chen0040/spm/apriori/GSP.java

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,6 @@
1313

1414
/**
1515
* Created by xschen on 8/2/2015.
16-
* implementation of the algorithm is based on Srikant & Agrawal, 1996
17-
*
18-
* The algorithm makes multiple passes over the data. The first pass determines the support of each item, that
19-
* is, the numbre of data-sequences that include the item. At the end of the first pas, the algorithm knows
20-
* which items are frequent, that is, have minimum support. Each such item yields a 1-element frequent sequence
21-
* consisting of that item.
22-
*
23-
* Each subsequent pass starts with a seed set: the frequent sequences found in the previous pass. The seed set
24-
* is used to generate new potentially frequent sequences, called candidate sequences. Each candidate sequence
25-
* has one more item than a seed sequence; so all the candidate sequences in a pass will have the same number of
26-
* items. The support for these candidate sequences is found during the pass over the data. At the end of the
27-
* pass, the algorithm determines which of the candidate sequences are actually frequent. These frequent candidates
28-
* become the seed for the next pass.
2916
*/
3017
public class GSP extends AbstractSequentialAssocRuleMiner {
3118

src/main/java/com/github/chen0040/spm/data/Sequence.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,23 @@ public Sequence(){
2424

2525
}
2626

27+
public static Sequence make(String... args){
28+
29+
List<String> texts = new ArrayList<>();
30+
for(String items : args){
31+
texts.add(items);
32+
}
33+
34+
List<ItemSetWithTimeId> itemSets = texts.stream().map(text -> new ItemSetWithTimeId(text.split(","))).collect(Collectors.toList());
35+
Sequence sequence = new Sequence();
36+
for(ItemSetWithTimeId element : itemSets) {
37+
sequence.addElement(element);
38+
}
39+
40+
return sequence;
41+
42+
}
43+
2744

2845
public List<ItemSetWithTimeId> getElements(){
2946
return elements;
Lines changed: 6 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package com.github.chen0040.spm.apriori;
22

33

4-
import com.github.chen0040.spm.data.ItemSetWithTimeId;
54
import com.github.chen0040.spm.data.MetaData;
65
import com.github.chen0040.spm.data.Sequence;
76
import com.github.chen0040.spm.data.Sequences;
@@ -11,9 +10,6 @@
1110

1211
import java.util.ArrayList;
1312
import java.util.List;
14-
import java.util.stream.Collectors;
15-
16-
import static org.testng.Assert.*;
1713

1814

1915
/**
@@ -37,7 +33,7 @@ public class GSPUnitTest {
3733

3834

3935
public List<Sequence> createSimpleSequentialData(){
40-
List<Sequence> result = new ArrayList<>();
36+
List<Sequence> database = new ArrayList<>();
4137

4238
/*
4339
S1 (1), (1 2 3), (1 3), (4), (3 6)
@@ -46,29 +42,14 @@ public List<Sequence> createSimpleSequentialData(){
4642
S4 (5), (7), (1 6), (3), (2), (3)
4743
*/
4844

49-
result.add(seq("1", "1,2,3", "1,3", "4", "3,6"));
50-
result.add(seq("1,4", "3", "2,3", "1,5"));
51-
result.add(seq("5,6", "1,2", "4,6", "3", "2"));
52-
result.add(seq("5", "7", "1,6", "3", "2", "3"));
45+
database.add(Sequence.make("1", "1,2,3", "1,3", "4", "3,6"));
46+
database.add(Sequence.make("1,4", "3", "2,3", "1,5"));
47+
database.add(Sequence.make("5,6", "1,2", "4,6", "3", "2"));
48+
database.add(Sequence.make("5", "7", "1,6", "3", "2", "3"));
5349

54-
return result;
50+
return database;
5551

5652
}
5753

58-
private Sequence seq(String... args){
59-
60-
List<String> texts = new ArrayList<>();
61-
for(String items : args){
62-
texts.add(items);
63-
}
64-
65-
List<ItemSetWithTimeId> itemSets = texts.stream().map(text -> new ItemSetWithTimeId(text.split(","))).collect(Collectors.toList());
66-
Sequence sequence = new Sequence();
67-
for(ItemSetWithTimeId element : itemSets) {
68-
sequence.addElement(element);
69-
}
7054

71-
return sequence;
72-
73-
}
7455
}

0 commit comments

Comments
 (0)