@@ -8,36 +8,49 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg
88
99=== "Python"
1010 ```python
11+ fs = ...
12+ credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
13+ account_details_fg = fs.get_feature_group(name="account_details", version=1)
14+ merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
15+
1116 # create a query
12- feature_join = rain_fg .select_all() \
13- .join(temperature_fg .select_all(), on=[ "date", "location_id "] ) \
14- .join(location_fg .select_all())
17+ selected_features = credit_card_transactions_fg .select_all() \
18+ .join(account_details_fg .select_all(), on=["cc_num "]) \
19+ .join(merchant_details_fg .select_all())
1520
1621 # save the query to feature view
1722 feature_view = fs.create_feature_view(
18- name='rain_dataset',
19- query=feature_join
23+ version=1,
24+ name='credit_card_fraud',
25+ labels=["is_fraud"],
26+ query=selected_features
2027 )
2128
2229 # retrieve the query back from the feature view
23- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
30+ feature_view = fs.get_feature_view("credit_card_fraud", version=1)
2431 query = feature_view.query
2532 ```
2633
2734=== "Scala"
2835 ```scala
36+
37+ val fs = ...
38+ val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1)
39+ val accountDetailsFg = fs.getFeatureGroup("account_details", 1)
40+ val merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
41+
2942 // create a query
30- val featureJoin = (rainFg .selectAll()
31- .join(temperatureFg .selectAll(), on=Seq("date", "location_id "))
32- .join(locationFg .selectAll()))
43+ val selectedFeatures = (creditCardTransactionsFg .selectAll()
44+ .join(accountDetailsFg .selectAll(), on=Seq("cc_num "))
45+ .join(merchantDetailsFg .selectAll()))
3346
3447 val featureView = featureStore.createFeatureView()
35- .name("rain_dataset ")
36- .query(featureJoin )
48+ .name("credit_card_fraud ")
49+ .query(selectedFeatures )
3750 .build();
3851
3952 // retrieve the query back from the feature view
40- val featureView = fs.getFeatureView(“rain_dataset ”, 1)
53+ val featureView = fs.getFeatureView("credit_card_fraud", 1)
4154 val query = featureView.getQuery()
4255 ```
4356
@@ -53,18 +66,18 @@ Selecting features from a feature group is a lazy operation, returning a query w
5366
5467=== "Python"
5568 ```python
56- rain_fg = fs.get_feature_group("rain_fg ")
69+ credit_card_transactions_fg = fs.get_feature_group("credit_card_transactions ")
5770
5871 # Returns Query
59- feature_join = rain_fg .select(["location_id ", "weekly_rainfall "])
72+ selected_features = credit_card_transactions_fg .select(["amount ", "latitude", "longitude "])
6073 ```
6174
6275=== "Scala"
6376 ```Scala
64- val rainFg = fs.getFeatureGroup("rain_fg ")
77+ val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions ")
6578
6679 // Returns Query
67- val featureJoin = rainFg .select(Seq("location_id ", "weekly_rainfall "))
80+ val selectedFeatures = creditCardTransactionsFg .select(Seq("amount ", "latitude", "longitude "))
6881 ```
6982
7083#### Join
@@ -75,35 +88,103 @@ By default, Hopsworks will use the maximal matching subset of the primary keys o
7588=== "Python"
7689 ```python
7790 # Returns Query
78- feature_join = rain_fg .join(temperature_fg )
91+ selected_features = credit_card_transactions_fg .join(account_details_fg )
7992 ```
8093
8194=== "Scala"
8295 ```Scala
8396 // Returns Query
84- val featureJoin = rainFg .join(temperatureFg )
97+ val selectedFeatures = creditCardTransactionsFg .join(accountDetailsFg )
8598 ```
8699More complex joins are possible by selecting subsets of features from the joined feature groups and by specifying a join key and type.
87- Possible join types are "inner", "left" or "right". Furthermore, it is possible to specify different features for the join key of the left and right feature group.
88- The join key lists should contain the names of the features to join on.
100+ Possible join types are "inner", "left" or "right". By default, `join_type` is `"left"`. Furthermore, it is possible to specify different
101+ features for the join key of the left and right feature group. The join key lists should contain the names of the features to join on.
89102
90103=== "Python"
91104 ```python
92- feature_join = rain_fg .select_all() \
93- .join(temperature_fg .select_all(), on=[ "date", "location_id "] ) \
94- .join(location_fg .select_all(), left_on=[ "location_id "] , right_on=[ "id"] , join_type="left ")
105+ selected_features = credit_card_transactions_fg .select_all() \
106+ .join(account_details_fg .select_all(), on=[ "cc_num "] ) \
107+ .join(merchant_details_fg .select_all(), left_on=[ "merchant_id "] , right_on=[ "id"] , join_type="inner ")
95108 ```
96109
97110=== "Scala"
98111 ```scala
99- val featureJoin = (rainFg.selectAll()
100- .join(temperatureFg.selectAll(), Seq("date", "location_id"))
101- .join(locationFg.selectAll(), Seq("location_id"), Seq("id"), "left"))
112+ val selectedFeatures = (creditCardTransactionsFg.selectAll()
113+ .join(accountDetailsFg.selectAll(), Seq("cc_num"))
114+ .join(merchantDetailsFg.selectAll(), Seq("merchant_id"), Seq("id"), "inner"))
115+ ```
116+
117+ ### Data modeling in Hopsworks
118+
119+ Since v4.0, the Hopsworks feature selection API supports both Star and Snowflake Schema data models.
120+
121+ #### Star schema data model
122+
123+ When choosing the Star Schema data model, all tables are children of the parent (the leftmost) feature group, which holds all
124+ the foreign keys for its child feature groups.
125+
126+ <p align="center">
127+ <figure>
128+ <img src="../../../../assets/images/guides/fs/feature_view/star.png" alt="Star schema data model">
129+ <figcaption>Star schema data model</figcaption>
130+ </figure>
131+ </p>
132+
133+ === "Python"
134+ ```python
135+ selected_features = credit_card_transactions.select_all() \
136+ .join(aggregated_cc_transactions.select_all()) \
137+ .join(account_details.select_all()) \
138+ .join(merchant_details.select_all()) \
139+ .join(cc_issuer_details.select_all())
102140 ```
103141
104- !!! error "Nested Joins"
105- The API currently does not support nested joins. That is joins of joins.
106- You can fall back to Spark DataFrames to cover these cases. However, if you have to use joins of joins, most likely there is potential to optimise your feature group structure.
142+ In online inference, when you want to retrieve features in your online model, you have to provide all foreign key values,
143+ known as the serving_keys, from the parent feature group to retrieve your precomputed feature values using the feature view.
144+
145+ === "Python"
146+ ```python
147+ feature_vector = feature_view.get_feature_vector({
148+ "cc_num": "1234 5555 3333 8888",
149+ "issuer_id": 20440455,
150+ "merchant_id": 44208484,
151+ "account_id": 84403331
152+ })
153+ ```
154+
155+ #### Snowflake schema
156+ Hopsworks also provides the possibility to define a feature view that consists of a nested tree of children (up to a depth of 20)
157+ from the root (leftmost) feature group. This is called the Snowflake Schema data model, where you need to build nested tables (subtrees) using joins, and then join the
158+ subtrees to their parents iteratively until you reach the root node (the leftmost feature group in the feature selection):
159+
160+ <p align="center">
161+ <figure>
162+ <img src="../../../../assets/images/guides/fs/feature_view/snowflake.png" alt="Snowflake schema data model">
163+ <figcaption>Snowflake schema data model</figcaption>
164+ </figure>
165+ </p>
166+
167+ === "Python"
168+ ```python
169+ nested_selection = aggregated_cc_transactions.select_all() \
170+ .join(account_details.select_all()) \
171+ .join(cc_issuer_details.select_all())
172+
173+ selected_features = credit_card_transactions.select_all() \
174+ .join(nested_selection) \
175+ .join(merchant_details.select_all())
176+ ```
177+
178+ Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group)
179+ to retrieve the precomputed features:
180+
181+ === "Python"
182+ ```python
183+ feature_vector = feature_view.get_feature_vector({
184+ "cc_num": "1234 5555 3333 8888",
185+ "merchant_id": 44208484,
186+ })
187+ ```
107188
108189#### Filter
109190
@@ -114,48 +195,48 @@ For the Scala part of the API, equivalent methods are available in the `Feature`
114195
115196=== "Python"
116197 ```python
117- filtered_rain = rain_fg .filter(rain_fg.location_id == 10 )
198+ filtered_credit_card_transactions = credit_card_transactions_fg .filter(credit_card_transactions_fg.category == "Grocery" )
118199 ```
119200
120201=== "Scala"
121202 ```scala
122- val filteredRain = rainFg .filter(rainFg .getFeature("location_id ").eq(10 ))
203+ val filteredCreditCardTransactions = creditCardTransactionsFg .filter(creditCardTransactionsFg .getFeature("category ").eq("Grocery" ))
123204 ```
124205
125206Filters are fully compatible with joins:
126207
127208=== "Python"
128209 ```python
129- feature_join = rain_fg .select_all() \
130- .join(temperature_fg .select_all(), on=[ "date", "location_id "] ) \
131- .join(location_fg .select_all(), left_on=[ "location_id "] , right_on=[ "id"] , join_type="left" ) \
132- .filter((rain_fg.location_id == 10 ) | (rain_fg.location_id == 20 ))
210+ selected_features = credit_card_transactions_fg .select_all() \
211+ .join(account_details_fg .select_all(), on=[ "cc_num "] ) \
212+ .join(merchant_details_fg .select_all(), left_on=[ "merchant_id "] , right_on=[ "id"] ) \
213+ .filter((credit_card_transactions_fg.category == "Grocery" ) | (credit_card_transactions_fg.category == "Restaurant/Cafeteria" ))
133214 ```
134215
135216=== "Scala"
136217 ```scala
137- val featureJoin = (rainFg .selectAll()
138- .join(temperatureFg .selectAll(), Seq("date", "location_id "))
139- .join(locationFg .selectAll(), Seq("location_id "), Seq("id"), "left")
140- .filter(rainFg .getFeature("location_id ").eq(10 ).or(rainFg .getFeature("location_id ").eq(20 ))))
218+ val selectedFeatures = (creditCardTransactionsFg .selectAll()
219+ .join(accountDetailsFg .selectAll(), Seq("cc_num "))
220+ .join(merchantDetailsFg .selectAll(), Seq("merchant_id "), Seq("id"), "left")
221+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Grocery" ).or(creditCardTransactionsFg .getFeature("category ").eq("Restaurant/Cafeteria" ))))
141222 ```
142223
143224The filters can be applied at any point of the query:
144225
145226=== "Python"
146227 ```python
147- feature_join = rain_fg .select_all() \
148- .join(temperature_fg .select_all().filter(temperature_fg .avg_temp >= 22), on=[ "date", "location_id "] ) \
149- .join(location_fg .select_all(), left_on=[ "location_id "] , right_on=[ "id"] , join_type="left" ) \
150- .filter(rain_fg.location_id == 10 )
228+ selected_features = credit_card_transactions_fg .select_all() \
229+ .join(account_details_fg .select_all().filter(account_details_fg .avg_temp >= 22), on=[ "cc_num "] ) \
230+ .join(merchant_details_fg .select_all(), left_on=[ "merchant_id "] , right_on=[ "id"] ) \
231+ .filter(credit_card_transactions_fg.category == "Grocery" )
151232 ```
152233
153234=== "Scala"
154235 ```scala
155- val featureJoin = (rainFg .selectAll()
156- .join(temperatureFg .selectAll().filter(temperatureFg .getFeature("avg_temp").ge(22)), Seq("date", "location_id "))
157- .join(locationFg .selectAll(), Seq("location_id "), Seq("id"), "left")
158- .filter(rainFg .getFeature("location_id ").eq(10 )))
236+ val selectedFeatures = (creditCardTransactionsFg .selectAll()
237+ .join(accountDetailsFg .selectAll().filter(accountDetailsFg .getFeature("avg_temp").ge(22)), Seq("cc_num "))
238+ .join(merchantDetailsFg .selectAll(), Seq("merchant_id "), Seq("id"), "left")
239+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Grocery" )))
159240 ```
160241
161242#### Joins and/or Filters on feature view query
@@ -166,23 +247,23 @@ However, this operation will not update the metadata and persist the updated que
166247=== "Python"
167248 ```python
168249 fs = ...
169- wind_speed_fg = fs.get_feature_group(name="wind_speed_fg ", version=1)
170- rain_fg = fs.get_feature_group(name="rain_fg ", version=1)
171- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
250+ merchant_details_fg = fs.get_feature_group(name="merchant_details ", version=1)
251+ credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions ", version=1)
252+ feature_view = fs.get_feature_view("credit_card_fraud", version=1)
172253 feature_view.query \
173- .join(wind_speed_fg .select_all()) \
174- .filter((rain_fg.location_id == 54 )
254+ .join(merchant_details_fg .select_all()) \
255+ .filter(credit_card_transactions_fg.category == "Cash Withdrawal")
175256 ```
176257
177258=== "Scala"
178259 ```scala
179260 val fs = ...
180- val windSpeedFg = fs.getFeatureGroup("wind_speed_fg ", 1)
181- val rainFg = fs.getFeatureGroup("rain_fg ", 1)
182- val featureView = fs.getFeatureView(“rain_dataset ”, 1)
261+ val merchantDetailsFg = fs.getFeatureGroup("merchant_details ", 1)
262+ val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions ", 1)
263+ val featureView = fs.getFeatureView("credit_card_fraud", 1)
183264 featureView.getQuery()
184- .join(windSpeedFg .selectAll())
185- .filter(rainFg .getFeature("location_id ").eq(54 ))
265+ .join(merchantDetailsFg .selectAll())
266+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Cash Withdrawal" ))
186267 ```
187268
188269!!! warning
@@ -192,45 +273,46 @@ However, this operation will not update the metadata and persist the updated que
192273=== "Python"
193274 ```python
194275 fs = ...
195- wind_speed_fg = fs.get_feature_group(name="wind_speed_fg", version=1)
196- solar_irradiance_fg = fs.get_feature_group(name="solar_irradiance_fg", version=1)
197- rain_fg = fs.get_feature_group(name="rain_fg", version=1)
276+
277+ merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
278+ account_details_fg = fs.get_feature_group(name="account_details", version=1)
279+ credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
198280
199281 # fetch new feature view and its query instance
200- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
282+ feature_view = fs.get_feature_view("credit_card_fraud", version=1)
201283
202- # apply join/filter logic based on location and wind speed
203- feature_view.query.join(wind_speed_fg .select_all()) \
204- .filter((rain_fg.location_id == 54 )
284+ # apply join/filter logic based on purchase type
285+ feature_view.query.join(merchant_details_fg .select_all()) \
286+ .filter(credit_card_transactions_fg.category == "Cash Withdrawal")
205287
206- # to apply new logic independent of location and wind speed from above
288+ # to apply new logic independent of purchase type from above
207289 # re-fetch new feature view and its query instance
208- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
290+ feature_view = fs.get_feature_view("credit_card_fraud", version=1)
209291
210- # apply new join/filter logic based on solar irradiance
211- feature_view.query.join(solar_irradiance_fg .select_all()) \
212- .filter(solar_irradiance_fg.location_id == 28 )
292+ # apply new join/filter logic based on account details
293+ feature_view.query.join(merchant_details_fg .select_all()) \
294+ .filter(account_details_fg.gender == "F" )
213295 ```
214296
215297=== "Scala"
216298 ```scala
217299 fs = ...
218- windSpeedFg = fs.getFeatureGroup("wind_speed_fg ", 1)
219- solarIrradianceFg = fs.getFeatureGroup("solar_irradiance_fg ", 1)
220- rainFg = fs.getFeatureGroup("rain_fg ", 1)
300+ merchantDetailsFg = fs.getFeatureGroup("merchant_details ", 1)
301+ accountDetailsFg = fs.getFeatureGroup("account_details ", 1)
302+ creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions ", 1)
221303
222304 // fetch new feature view and its query instance
223- val featureView = fs.getFeatureView(“rain_dataset ”, version=1)
305+ val featureView = fs.getFeatureView("credit_card_fraud", 1)
224306
225- // apply join/filter logic based on location and wind speed
226- featureView.getQuery.join(windSpeedFg .selectAll())
227- .filter(rainFg .getFeature("location_id ").eq(54 ))
307+ // apply join/filter logic based on purchase type
308+ featureView.getQuery.join(merchantDetailsFg .selectAll())
309+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Cash Withdrawal" ))
228310
229- // to apply new logic independent of location and wind speed from above
311+ // to apply new logic independent of purchase type from above
230312 // re-fetch new feature view and its query instance
231- val featureView = fs.getFeatureView(“rain_dataset ”, 1)
313+ val featureView = fs.getFeatureView("credit_card_fraud", 1)
232314
233- // apply new join/filter logic based on solar irradiance
234- featureView.getQuery.join(solarIrradianceFg .selectAll())
235- .filter(solarIrradianceFg .getFeature("location_id ").eq(28 ))
315+ // apply new join/filter logic based on account details
316+ featureView.getQuery.join(merchantDetailsFg .selectAll())
317+ .filter(accountDetailsFg .getFeature("gender ").eq("F" ))
236318 ```
0 commit comments