Added snippets for 120_Proximity_Matching

clintongormley · clintongormley · commit aeed981e7393 · 2014-03-14T15:41:33.000+01:00
diff --git a/120_Proximity_Matching/05_Phrase_matching.asciidoc b/120_Proximity_Matching/05_Phrase_matching.asciidoc
@@ -16,6 +16,7 @@ GET /my_index/my_type/_search
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/05_Match_phrase_query.json
 
 Like the `match` query, the `match_phrase` query first analyzes the query
 string to produce a list of terms. It then searches for all the terms, but
@@ -38,6 +39,7 @@ The `match_phrase` query can also be written as a `match` query with type
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/05_Match_phrase_query.json
 
 ****
 
@@ -51,6 +53,7 @@ also the _position_ or order of each term in the original string:
 GET /_analyze?analyzer=standard
 Quick brown fox
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/05_Term_positions.json
 
 This returns:
 
diff --git a/120_Proximity_Matching/10_Slop.asciidoc b/120_Proximity_Matching/10_Slop.asciidoc
@@ -20,6 +20,7 @@ GET /my_index/my_type/_search
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/10_Slop.json
 
 The `slop` parameter tells the `match_phrase` query how far apart terms are
 allowed to be while still considering the document a match. By ``how far
diff --git a/120_Proximity_Matching/15_Multi_value_fields.asciidoc b/120_Proximity_Matching/15_Multi_value_fields.asciidoc
@@ -10,6 +10,7 @@ PUT /my_index/groups/1
     "names": [ "John Abraham", "Lincoln Smith"]
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/15_Multi_value_fields.json
 
 Then run a phrase query for `"Abraham Lincoln"`:
 
@@ -24,6 +25,7 @@ GET /my_index/groups/_search
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/15_Multi_value_fields.json
 
 Surprisingly, our document matches, even though `"Abraham"` and `"Lincoln"`
 belong to two different people in the `names` array. The reason for this comes
@@ -61,6 +63,8 @@ PUT /my_index/_mapping/groups <2>
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/15_Multi_value_fields.json
+
 <1> First delete the `groups` mapping and all documents of that type.
 <2> Then create a new `groups` mapping with the correct values.
 
diff --git a/120_Proximity_Matching/20_Scoring.asciidoc b/120_Proximity_Matching/20_Scoring.asciidoc
@@ -25,6 +25,8 @@ POST /my_index/my_type/_search
    }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/20_Scoring.json
+
 <1> Note the high `slop` value
 
 [source,js]
@@ -33,19 +35,20 @@ POST /my_index/my_type/_search
   "hits": [
      {
         "_id":      "3",
-        "_score":   0.75,
+        "_score":   0.75, <1>
         "_source": {
            "title": "The quick brown fox jumps over the quick dog"
         }
      },
      {
         "_id":      "2",
-        "_score":   0.28347334,
+        "_score":   0.28347334, <2>
         "_source": {
            "title": "The quick brown fox jumps over the lazy dog"
         }
      }
   ]
 }
 --------------------------------------------------
-
+<1> Higher score because `quick` and `dog` are close together.
+<2> Lower score because `quick` and `dog` are further apart.
diff --git a/120_Proximity_Matching/25_Relevance.asciidoc b/120_Proximity_Matching/25_Relevance.asciidoc
@@ -16,7 +16,7 @@ that we should combine them using the `bool` query.
 
 We can use a simple `match` query as a `must` clause. This is the query that
 will determine which documents are included in our result set -- we can trim
-the long tail with the `minimum_must_match` parameter.  Then we can add other
+the long tail with the `minimum_should_match` parameter.  Then we can add other
 more specific queries as `should` clauses -- every one that matches will
 increase the relevance of the matching docs.
 
@@ -29,13 +29,13 @@ GET /my_index/my_type/_search
       "must": {
         "match": { <1>
           "title": {
-            "query":              "quick brown fox",
-            "minimum_must_match": "30%"
+            "query":                "quick brown fox",
+            "minimum_should_match": "30%"
           }
         }
       },
       "should": {
-        "match_phrase": <2>
+        "match_phrase": { <2>
           "title": {
             "query": "quick brown fox",
             "slop":  50
@@ -46,6 +46,8 @@ GET /my_index/my_type/_search
   }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/25_Relevance.json
+
 <1> The `must` clause includes or excludes documents from the result set.
 <2> The `should` clause increases the relevance score of those documents that
     match.
diff --git a/120_Proximity_Matching/30_Performance.asciidoc b/120_Proximity_Matching/30_Performance.asciidoc
@@ -58,16 +58,16 @@ GET /my_index/my_type/_search
     "query": {
         "match": {  <1>
             "title": {
-                "query":              "quick brown fox",
-                "minimum_must_match": "30%"
+                "query":                "quick brown fox",
+                "minimum_should_match": "30%"
             }
         }
     },
     "rescore": {
         "window_size": 50, <2>
         "query": {         <3>
             "rescore_query": {
-                "match_phrase":
+                "match_phrase": {
                     "title": {
                         "query": "quick brown fox",
                         "slop":  50
@@ -78,6 +78,8 @@ GET /my_index/my_type/_search
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/30_Performance.json
+
 <1> The `match` query decides which results will be included in the final
     result set and ranks results according to TF/IDF.
 <2> The `window_size` is the number of top results to rescore, per shard.
diff --git a/120_Proximity_Matching/35_Shingles.asciidoc b/120_Proximity_Matching/35_Shingles.asciidoc
@@ -92,6 +92,7 @@ PUT /my_index
     }
 }
 --------------------------------------------------
+// SENSE: 120_Proximity_Matching/35_Shingles.json
 
 <1> See <<relevance-is-broken>>.
 <2> The default min/max shingle size is `2` so we don't really need to set
diff --git a/snippets/120_Proximity_Matching/05_Match_phrase_query.json b/snippets/120_Proximity_Matching/05_Match_phrase_query.json
@@ -0,0 +1,40 @@
+# Delete the `my_index` index
+DELETE /my_index
+
+# Create `my_index` with a single primary shard
+PUT /my_index
+{ "settings": { "number_of_shards": 1 }}
+
+# Index some example docs
+POST /my_index/my_type/_bulk
+{ "index": { "_id": 1 }}
+{ "title": "The quick brown fox" }
+{ "index": { "_id": 2 }}
+{ "title": "The quick brown fox jumps over the lazy dog" }
+{ "index": { "_id": 3 }}
+{ "title": "The quick brown fox jumps over the quick dog" }
+{ "index": { "_id": 4 }}
+{ "title": "Brown fox brown dog" }
+
+# match_phrase query
+GET /my_index/my_type/_search
+{
+  "query": {
+    "match_phrase": {
+      "title": "quick brown fox"
+    }
+  }
+}
+
+# match query, type phrase
+GET /my_index/my_type/_search
+{
+  "query": {
+    "match": {
+      "title": {
+        "type": "phrase",
+        "query": "quick brown fox"
+      }
+    }
+  }
+}
diff --git a/snippets/120_Proximity_Matching/05_Term_positions.json b/snippets/120_Proximity_Matching/05_Term_positions.json
@@ -0,0 +1,3 @@
+# Term positions
+GET /_analyze?text=Quick brown fox
+
diff --git a/snippets/120_Proximity_Matching/10_Slop.json b/snippets/120_Proximity_Matching/10_Slop.json
@@ -0,0 +1,44 @@
+# Delete the `my_index` index
+DELETE /my_index
+
+# Create `my_index` with a single primary shard
+PUT /my_index
+{ "settings": { "number_of_shards": 1 }}
+
+# Index some example docs
+POST /my_index/my_type/_bulk
+{ "index": { "_id": 1 }}
+{ "title": "The quick brown fox" }
+{ "index": { "_id": 2 }}
+{ "title": "The quick brown fox jumps over the lazy dog" }
+{ "index": { "_id": 3 }}
+{ "title": "The quick brown fox jumps over the quick dog" }
+{ "index": { "_id": 4 }}
+{ "title": "Brown fox brown dog" }
+
+
+# Phrase query - doesn't match
+GET /my_index/my_type/_search
+{
+  "query": {
+    "match_phrase": {
+      "title": {
+        "query": "quick fox"
+      }
+    }
+  }
+}
+
+
+# Proximity query with slop - matches
+GET /my_index/my_type/_search
+{
+  "query": {
+    "match_phrase": {
+      "title": {
+        "query": "quick fox",
+        "slop": 1
+      }
+    }
+  }
+}
diff --git a/snippets/120_Proximity_Matching/15_Multi_value_fields.json b/snippets/120_Proximity_Matching/15_Multi_value_fields.json
@@ -0,0 +1,68 @@
+# Delete the `my_index` index
+DELETE /my_index
+
+# Create `my_index` with a single primary shard
+PUT /my_index
+{ "settings": { "number_of_shards": 1 }}
+
+# Index an example doc
+PUT /my_index/groups/1
+{
+  "names": [
+    "John Abraham",
+    "Lincoln Smith"
+  ]
+}
+
+# Phrase "Abraham Lincoln" matches!
+GET /my_index/groups/_search
+{
+    "query": {
+        "match_phrase": {
+            "names": "Abraham Lincoln"
+        }
+    }
+}
+
+# Delete `groups` mapping and data
+DELETE /my_index/groups/
+
+# Map `names` to use position_offset_gap
+PUT /my_index/_mapping/groups
+{
+  "properties": {
+    "names": {
+      "type": "string",
+      "position_offset_gap": 100
+    }
+  }
+}
+
+# Reindex document
+PUT /my_index/groups/1
+{
+  "names": [
+    "John Abraham",
+    "Lincoln Smith"
+  ]
+}
+
+# Phrase "Abraham Lincoln" no longer matches
+GET /my_index/groups/_search
+{
+  "query": {
+    "match_phrase": {
+      "names": "Abraham Lincoln"
+    }
+  }
+}
+
+# But phrase "John Abraham" does
+GET /my_index/groups/_search
+{
+  "query": {
+    "match_phrase": {
+      "names": "John Abraham"
+    }
+  }
+}
diff --git a/snippets/120_Proximity_Matching/20_Scoring.json b/snippets/120_Proximity_Matching/20_Scoring.json
@@ -0,0 +1,30 @@
+# Delete the `my_index` index
+DELETE /my_index
+
+# Create `my_index` with a single primary shard
+PUT /my_index
+{ "settings": { "number_of_shards": 1 }}
+
+# Index some example docs
+POST /my_index/my_type/_bulk
+{ "index": { "_id": 1 }}
+{ "title": "The quick brown fox" }
+{ "index": { "_id": 2 }}
+{ "title": "The quick brown fox jumps over the lazy dog" }
+{ "index": { "_id": 3 }}
+{ "title": "The quick brown fox jumps over the quick dog" }
+{ "index": { "_id": 4 }}
+{ "title": "Brown fox brown dog" }
+
+# High slop value
+POST /my_index/my_type/_search
+{
+  "query": {
+    "match_phrase": {
+      "title": {
+        "query": "quick dog",
+        "slop": 50
+      }
+    }
+  }
+}
diff --git a/snippets/120_Proximity_Matching/25_Relevance.json b/snippets/120_Proximity_Matching/25_Relevance.json
@@ -0,0 +1,42 @@
+# Delete the `my_index` index
+DELETE /my_index
+
+# Create `my_index` with a single primary shard
+PUT /my_index
+{ "settings": { "number_of_shards": 1 }}
+
+# Index some example docs
+POST /my_index/my_type/_bulk
+{ "index": { "_id": 1 }}
+{ "title": "The quick brown fox" }
+{ "index": { "_id": 2 }}
+{ "title": "The quick brown fox jumps over the lazy dog" }
+{ "index": { "_id": 3 }}
+{ "title": "The quick brown fox jumps over the quick dog" }
+{ "index": { "_id": 4 }}
+{ "title": "Brown fox brown dog" }
+
+# Combine phrase with match query to boost relevance
+GET /my_index/my_type/_search
+{
+  "query": {
+    "bool": {
+      "must": {
+        "match": {
+          "title": {
+            "query": "quick brown fox",
+            "minimum_should_match": "30%"
+          }
+        }
+      },
+      "should": {
+        "match_phrase": {
+          "title": {
+            "query": "quick brown fox",
+            "slop": 50
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/snippets/120_Proximity_Matching/30_Performance.json b/snippets/120_Proximity_Matching/30_Performance.json
diff --git a/snippets/120_Proximity_Matching/35_Shingles.json b/snippets/120_Proximity_Matching/35_Shingles.json

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ GET /my_index/my_type/_search`
`16`	`16`	`}`
`17`	`17`	`}`
`18`	`18`	`--------------------------------------------------`
	`19`	`+// SENSE: 120_Proximity_Matching/05_Match_phrase_query.json`
`19`	`20`
`20`	`21`	Like the `match` query, the `match_phrase` query first analyzes the query
`21`	`22`	`string to produce a list of terms. It then searches for all the terms, but`
@@ -38,6 +39,7 @@ The `match_phrase` query can also be written as a `match` query with type
`38`	`39`	`}`
`39`	`40`	`}`
`40`	`41`	`--------------------------------------------------`
	`42`	`+// SENSE: 120_Proximity_Matching/05_Match_phrase_query.json`
`41`	`43`
`42`	`44`	`****`
`43`	`45`
`@@ -51,6 +53,7 @@ also the _position_ or order of each term in the original string:`
`51`	`53`	`GET /_analyze?analyzer=standard`
`52`	`54`	`Quick brown fox`
`53`	`55`	`--------------------------------------------------`
	`56`	`+// SENSE: 120_Proximity_Matching/05_Term_positions.json`
`54`	`57`
`55`	`58`	`This returns:`
`56`	`59`
Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@ GET /my_index/my_type/_search`
`20`	`20`	`}`
`21`	`21`	`}`
`22`	`22`	`--------------------------------------------------`
	`23`	`+// SENSE: 120_Proximity_Matching/10_Slop.json`
`23`	`24`
`24`	`25`	The `slop` parameter tells the `match_phrase` query how far apart terms are
`25`	`26`	allowed to be while still considering the document a match. By ``how far
Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ PUT /my_index/groups/1`
`10`	`10`	`"names": [ "John Abraham", "Lincoln Smith"]`
`11`	`11`	`}`
`12`	`12`	`--------------------------------------------------`
	`13`	`+// SENSE: 120_Proximity_Matching/15_Multi_value_fields.json`
`13`	`14`
`14`	`15`	Then run a phrase query for `"Abraham Lincoln"`:
`15`	`16`
`@@ -24,6 +25,7 @@ GET /my_index/groups/_search`
`24`	`25`	`}`
`25`	`26`	`}`
`26`	`27`	`--------------------------------------------------`
	`28`	`+// SENSE: 120_Proximity_Matching/15_Multi_value_fields.json`
`27`	`29`
`28`	`30`	Surprisingly our document matches, even though `"Abraham"` and `"Lincoln"`
`29`	`31`	belong to two different people in the `names` array. The reason for this comes
`@@ -61,6 +63,8 @@ PUT /my_index/_mapping/groups <2>`
`61`	`63`	`}`
`62`	`64`	`}`
`63`	`65`	`--------------------------------------------------`
	`66`	`+// SENSE: 120_Proximity_Matching/15_Multi_value_fields.json`
	`67`	`+`
`64`	`68`	<1> First delete the `groups` mapping and all documents of that type.
`65`	`69`	<2> Then create a new `groups` mapping with the correct values.
`66`	`70`
Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,8 @@ POST /my_index/my_type/_search`
`25`	`25`	`}`
`26`	`26`	`}`
`27`	`27`	`--------------------------------------------------`
	`28`	`+// SENSE: 120_Proximity_Matching/20_Scoring.json`
	`29`	`+`
`28`	`30`	<1> Note the high `slop` value
`29`	`31`
`30`	`32`	`[source,js]`
`@@ -33,19 +35,20 @@ POST /my_index/my_type/_search`
`33`	`35`	`"hits": [`
`34`	`36`	`{`
`35`	`37`	`"_id": "3",`
`36`		`- "_score": 0.75,`
	`38`	`+ "_score": 0.75, <1>`
`37`	`39`	`"_source": {`
`38`	`40`	`"title": "The quick brown fox jumps over the quick dog"`
`39`	`41`	`}`
`40`	`42`	`},`
`41`	`43`	`{`
`42`	`44`	`"_id": "2",`
`43`		`- "_score": 0.28347334,`
	`45`	`+ "_score": 0.28347334, <2>`
`44`	`46`	`"_source": {`
`45`	`47`	`"title": "The quick brown fox jumps over the lazy dog"`
`46`	`48`	`}`
`47`	`49`	`}`
`48`	`50`	`]`
`49`	`51`	`}`
`50`	`52`	`--------------------------------------------------`
`51`		`-`
	`53`	+<1> Higher score because `quick` and `dog` are close together.
	`54`	+<2> Lower score because `quick` and `dog` are further apart.
Original file line number	Diff line number	Diff line change
`@@ -92,6 +92,7 @@ PUT /my_index`
`92`	`92`	`}`
`93`	`93`	`}`
`94`	`94`	`--------------------------------------------------`
	`95`	`+// SENSE: 120_Proximity_Matching/35_Shingles.json`
`95`	`96`
`96`	`97`	`<1> See <<relevance-is-broken>>.`
`97`	`98`	<2> The default min/max shingle size is `2` so we don't really need to set
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Term positions`
	`2`	`+GET /_analyze?text=Quick brown fox`
	`3`	`+`