From 7b9464e97279a052cfdf115672880f89606421b3 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 21 Apr 2015 09:07:13 -0400 Subject: [PATCH 001/200] Made loadStr a public methor Fixes #18 --- README.md | 15 +++++++++++ src/PHPHtmlParser/Dom.php | 56 +++++++++++++++++++-------------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 2bd8264a..4089b242 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,21 @@ $html = $dom->outerHtml; As long as the Connector object implements the `PHPHtmlParser\CurlInterface` interface properly it will use that object to get the content of the url instead of the default `PHPHtmlParser\Curl` class. +Loading Strings +--------------- + +Loading a string directly, with out the checks in `load()` is also easely done. + +```php +use PHPHtmlParser\Dom; + +$dom = new Dom; +$dom->loadStr('String', []) +$html = $dom->outerHtml; +``` + +If the string is to long, depending on your file system, the `load()` method will throw a warning. If this happens you can just call the above method to bypass the `is_file()` check in the `load()` method. + Options ------- diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index e1f4b085..91855d0f 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -160,6 +160,34 @@ public function loadFromUrl($url, $options = [], CurlInterface $curl = null) return $this->loadStr($content, $options); } + /** + * Parsers the html of the given string. Used for load(), loadFromFile(), + * and loadFromUrl(). 
+ * + * @param string $str + * @param array $option + * @chainable + */ + public function loadStr($str, $option) + { + $this->options = new Options; + $this->options->setOptions($this->globalOptions) + ->setOptions($option); + + $this->rawSize = strlen($str); + $this->raw = $str; + + $html = $this->clean($str); + + $this->size = strlen($str); + $this->content = new Content($html); + + $this->parse(); + $this->detectCharset(); + + return $this; + } + /** * Sets a global options array to be used by all load calls. * @@ -291,34 +319,6 @@ public function getElementsByClass($class) return $this->find('.'.$class); } - /** - * Parsers the html of the given string. Used for load(), loadFromFile(), - * and loadFromUrl(). - * - * @param string $str - * @param array $option - * @chainable - */ - protected function loadStr($str, $option) - { - $this->options = new Options; - $this->options->setOptions($this->globalOptions) - ->setOptions($option); - - $this->rawSize = strlen($str); - $this->raw = $str; - - $html = $this->clean($str); - - $this->size = strlen($str); - $this->content = new Content($html); - - $this->parse(); - $this->detectCharset(); - - return $this; - } - /** * Checks if the load methods have been called. 
* From f400ff77e2e3dd27e33ec8df6c0c13837e6ecde6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 21 Apr 2015 09:25:58 -0400 Subject: [PATCH 002/200] Fixed some documentation inconsistencies fixes #15 --- src/PHPHtmlParser/Dom/Tag.php | 4 ++-- tests/Node/TagTest.php | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index ca9fde93..e2699630 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -46,7 +46,7 @@ public function __construct($name) public function __get($key) { - return $this->getAttribute($key); + return$this->getAttribute($key); } public function __set($key, $value) @@ -133,7 +133,7 @@ public function setAttributes(array $attr) { foreach ($attr as $key => $value) { - $this->attr[$key] = $value; + $this->setAttribute($key, $value); } return $this; diff --git a/tests/Node/TagTest.php b/tests/Node/TagTest.php index cf29cf80..73109f43 100644 --- a/tests/Node/TagTest.php +++ b/tests/Node/TagTest.php @@ -25,6 +25,25 @@ public function testSetAttributes() $this->assertEquals('http://google.com', $tag->getAttribute('href')['value']); } + public function testSetAttributeNoArray() + { + $tag = new Tag('a'); + $tag->setAttribute('href', 'http://google.com'); + $this->assertEquals('http://google.com', $tag->getAttribute('href')['value']); + } + + public function testSetAttributesNoDoubleArray() + { + $attr = [ + 'href' => 'http://google.com', + 'class' => 'funtimes', + ]; + + $tag = new Tag('a'); + $tag->setAttributes($attr); + $this->assertEquals('funtimes', $tag->class['value']); + } + public function testNoise() { $tag = new Tag('a'); @@ -45,6 +64,13 @@ public function testGetAttributeMagic() $this->assertEquals('http://google.com', $tag->href['value']); } + public function testSetAttributeMagic() + { + $tag = new Tag('a'); + $tag->href = 'http://google.com'; + $this->assertEquals('http://google.com', 
$tag->href['value']); + } + public function testMakeOpeningTag() { $attr = [ From 1a11a7e81ecc40069635efec471441f328a494c7 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 21 Apr 2015 09:45:43 -0400 Subject: [PATCH 003/200] Fixed bug when no attribute tags are last tag (with out space). fixes #16 --- src/PHPHtmlParser/Dom.php | 7 +++++-- tests/DomTest.php | 14 ++++++++++++++ tests/Node/HtmlTest.php | 23 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 91855d0f..7780b586 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -372,7 +372,7 @@ protected function clean($str) return $str; } - /** + /** null, 'doubleQuote' => true, ]; - $this->content->rewind(1); + if ($this->content->char() != '>') + { + $this->content->rewind(1); + } } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 0ad677d6..626914bb 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -123,6 +123,20 @@ public function testLoadClosingTagClearSelfClosingTag() $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); } + public function testLoadNoValueAttribute() + { + $dom = new Dom; + $dom->load('
Main content here
'); + $this->assertEquals('
Main content here
', $dom->innerHtml); + } + + public function testLoadNoValueAttributeBefore() + { + $dom = new Dom; + $dom->load('
Main content here
'); + $this->assertEquals('
Main content here
', $dom->innerHtml); + } + public function testLoadUpperCase() { $dom = new Dom; diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index df9022d5..ef1e4d49 100644 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -224,6 +224,29 @@ public function testOuterHtmlMagic() $this->assertEquals('', $parent->outerHtml); } + public function testOuterHtmlNoValueAttribute() + { + $parent = new HtmlNode('div'); + $parent->setAttribute('class', [ + 'value' => 'all', + 'doubleQuote' => true, + ]); + $childa = new HtmlNode('a'); + $childa->setAttribute('href', [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ]); + $childa->setAttribute('ui-view', null); + $childbr = new HtmlNode('br'); + $childbr->getTag()->selfClosing(); + + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals('', $parent->outerHtml); + } + public function testText() { $a = new Tag('a'); From 44a07cdeb46c524cfc01f2aac5087093ace47249 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 21 Apr 2015 09:46:34 -0400 Subject: [PATCH 004/200] Version 1.6.5 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4089b242..e71c3f30 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 1.6.4 +Version 1.6.5 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) From 022356b39fa8bf69c3c00b18375ddcdbd7847e58 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 16 Aug 2015 17:01:59 -0400 Subject: [PATCH 005/200] Updated the dev version of phpunit --- composer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer.json b/composer.json index 369aada4..5f985092 100644 --- a/composer.json 
+++ b/composer.json @@ -18,7 +18,7 @@ "paquettg/string-encode": "0.1.0" }, "require-dev": { - "phpunit/phpunit": "4.4.*", + "phpunit/phpunit": "4.8.*", "satooshi/php-coveralls": "0.6.*", "mockery/mockery": "0.9.*" }, From 91c41e78a47f524e941d71415a6ef915d4140feb Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 16 Aug 2015 19:06:15 -0400 Subject: [PATCH 006/200] Added child selector fixes #24 --- src/PHPHtmlParser/Dom.php | 2 +- src/PHPHtmlParser/Selector.php | 91 ++++++++++++++++++++++++++++------ tests/SelectorTest.php | 46 +++++++++++++++++ 3 files changed, 122 insertions(+), 17 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 7780b586..b52a6cad 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -372,7 +372,7 @@ protected function clean($str) return $str; } - /**]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; protected $selectors = []; @@ -51,9 +51,17 @@ public function find($node) if (count($selector) == 0) continue; + $options = []; foreach ($selector as $rule) { - $nodes = $this->seek($nodes, $rule); + if ($rule['alterNext']) + { + $options[] = $this->alterNext($rule); + continue; + } + $nodes = $this->seek($nodes, $rule, $options); + // clear the options + $options = []; } // this is the final set of nodes @@ -81,11 +89,18 @@ protected function parseSelectorString($selector) foreach ($matches as $match) { // default values - $tag = strtolower(trim($match[1])); - $operator = '='; - $key = null; - $value = null; - $noKey = false; + $tag = strtolower(trim($match[1])); + $operator = '='; + $key = null; + $value = null; + $noKey = false; + $alterNext = false; + + // check for elements that alter the behavior of the next element + if ($tag == '>') + { + $alterNext = true; + } // check for id selector if ( ! 
empty($match[2])) @@ -123,11 +138,12 @@ protected function parseSelectorString($selector) } $result[] = [ - 'tag' => $tag, - 'key' => $key, - 'value' => $value, - 'operator' => $operator, - 'noKey' => $noKey, + 'tag' => $tag, + 'key' => $key, + 'value' => $value, + 'operator' => $operator, + 'noKey' => $noKey, + 'alterNext' => $alterNext, ]; if (trim($match[7]) == ',') { @@ -149,9 +165,10 @@ protected function parseSelectorString($selector) * * @param array $nodes * @param array $rule + * @param array $options * @recursive */ - protected function seek(array $nodes, array $rule) + protected function seek(array $nodes, array $rule, array $options) { // XPath index if ( ! empty($rule['tag']) AND ! empty($rule['key']) AND @@ -173,6 +190,8 @@ protected function seek(array $nodes, array $rule) return []; } + $options = $this->flattenOptions($options); + $return = []; foreach ($nodes as $node) { @@ -256,7 +275,7 @@ protected function seek(array $nodes, array $rule) { $check = $this->match($rule['operator'], $rule['value'], $class); } - if ($check) + if ($check) break; } } @@ -294,10 +313,12 @@ protected function seek(array $nodes, array $rule) } } - if (count($children) > 0) + if ((! isset($options['checkGrandChildren']) || + $options['checkGrandChildren']) + && count($children) > 0) { // we have children that failed but are not leaves. - $matches = $this->seek($children, $rule); + $matches = $this->seek($children, $rule, $options); foreach ($matches as $match) { $return[] = $match; @@ -339,4 +360,42 @@ protected function match($operator, $pattern, $value) } return false; } + + /** + * Attempts to figure out what the alteration will be for + * the next element. + * + * @param array $rule + * @return array + */ + protected function alterNext($rule) + { + $options = []; + if ($rule['tag'] == '>') + { + $options['checkGrandChildren'] = false; + } + + return $options; + } + + /** + * Flattens the option array. 
+ * + * @param array $optionsArray + * @return array + */ + protected function flattenOptions(array $optionsArray) + { + $options = []; + foreach ($optionsArray as $optionArray) + { + foreach ($optionArray as $key => $option) + { + $options[$key] = $option; + } + } + + return $options; + } } diff --git a/tests/SelectorTest.php b/tests/SelectorTest.php index 9cd1ab2f..fe06568a 100644 --- a/tests/SelectorTest.php +++ b/tests/SelectorTest.php @@ -157,4 +157,50 @@ public function testFindXpathKeySelector() $selector = new Selector('div[1]'); $this->assertEquals($parent->id(), $selector->find($parent)[0]->id()); } + + public function testFindChildMultipleLevelsDeep() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('ul')); + $child2 = new HtmlNode(new Tag('li')); + $root->addChild($parent); + $parent->addChild($child1); + $child1->addChild($child2); + + $selector = new Selector('div li'); + $this->assertEquals(1, count($selector->find($root))); + } + + public function testFindAllChildren() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('ul')); + $child2 = new HtmlNode(new Tag('span')); + $child3 = new HtmlNode(new Tag('ul')); + $root->addChild($parent); + $parent->addChild($child1); + $child2->addChild($child3); + $parent->addChild($child2); + + $selector = new Selector('div ul'); + $this->assertEquals(2, count($selector->find($root))); + } + + public function testFindChildUsingChildSelector() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('ul')); + $child2 = new HtmlNode(new Tag('span')); + $child3 = new HtmlNode(new Tag('ul')); + $root->addChild($parent); + $parent->addChild($child1); + $child2->addChild($child3); + $parent->addChild($child2); + + $selector = new Selector('div > ul'); + $this->assertEquals(1, count($selector->find($root))); + } } From 
c550d797971c2877031abbb32d862b12a8460cee Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 16 Aug 2015 19:29:00 -0400 Subject: [PATCH 007/200] Replaced preg_replace with mb_ereg_replace --- src/PHPHtmlParser/Dom.php | 20 ++++++++++---------- src/PHPHtmlParser/Dom/TextNode.php | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index b52a6cad..cbc7e307 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -344,30 +344,30 @@ protected function clean($str) $str = str_replace(["\r\n", "\r", "\n"], ' ', $str); // strip the doctype - $str = preg_replace("''is", '', $str); + $str = mb_eregi_replace("", '', $str); // strip out comments - $str = preg_replace("''is", '', $str); + $str = mb_eregi_replace("", '', $str); // strip out cdata - $str = preg_replace("''is", '', $str); + $str = mb_eregi_replace("", '', $str); // strip out +

....

'); + $this->assertEquals('....', $dom->getElementsByTag('p')[1]->innerHtml); + } } From 222f59c8d59c00914ac3db4c982d9ddcbaee1512 Mon Sep 17 00:00:00 2001 From: Martin Date: Sat, 12 Sep 2015 09:01:25 +0200 Subject: [PATCH 011/200] Update HtmlNode.php typos --- src/PHPHtmlParser/Dom/HtmlNode.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 9bef3f00..ce2c2459 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -50,7 +50,7 @@ public function __construct($tag) * Gets the inner html of this node. * * @return string - * @throws UnkownChildTypeException + * @throws UnknownChildTypeException */ public function innerHtml() { From 2f78ee05eeb5c05780e52543b800e9bcaadaef9a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 13 Sep 2015 09:42:34 -0400 Subject: [PATCH 012/200] Added feature to allow array usage of html node. fixes #26 --- src/PHPHtmlParser/Dom/AbstractNode.php | 35 +++++++++++++++++++++ src/PHPHtmlParser/Dom/ArrayNode.php | 42 ++++++++++++++++++++++++++ src/PHPHtmlParser/Dom/HtmlNode.php | 12 +++++++- 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 src/PHPHtmlParser/Dom/ArrayNode.php diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 1690e9ca..189d2249 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -210,6 +210,41 @@ public function getChild($id) return $this->children[$id]['node']; } + /** + * Returns a new array of child nodes + * + * @return array + */ + public function getChildren() + { + $nodes = []; + try + { + $child = $this->firstChild(); + do + { + $nodes[] = $child; + $child = $this->nextChild($child->id()); + } while ( ! 
is_null($child)); + } + catch (ChildNotFoundException $e) + { + // we are done looking for children + } + + return $nodes; + } + + /** + * Counts children + * + * @return int + */ + public function countChildren() + { + return count($this->children); + } + /** * Adds a child node to this node and returns the id of the child for this * parent. diff --git a/src/PHPHtmlParser/Dom/ArrayNode.php b/src/PHPHtmlParser/Dom/ArrayNode.php new file mode 100644 index 00000000..3e341b3c --- /dev/null +++ b/src/PHPHtmlParser/Dom/ArrayNode.php @@ -0,0 +1,42 @@ +getIteratorArray()); + } + + /** + * Returns the count of the iterator array. + * + * @return int + */ + public function count() + { + return count($this->getIteratorArray()); + } + +} diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index ce2c2459..98a53286 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -4,7 +4,7 @@ use PHPHtmlParser\Exceptions\UnknownChildTypeException; use PHPHtmlParser\Exceptions\ChildNotFoundException; -class HtmlNode extends AbstractNode { +class HtmlNode extends ArrayNode { /** * Remembers what the innerHtml was if it was scaned previously. @@ -202,4 +202,14 @@ protected function clear() $this->outerHtml = null; $this->text = null; } + + /** + * Returns all children of this html node. 
+ * + * @return array + */ + protected function getIteratorArray() + { + return $this->getChildren(); + } } From 2b9651cec5b87aee0c34a40493eb4ac03628cfb7 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 13 Sep 2015 09:43:07 -0400 Subject: [PATCH 013/200] Added tests for the new array access --- tests/Node/ChildrenTest.php | 20 ++++++++++++ tests/Node/HtmlTest.php | 63 +++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/tests/Node/ChildrenTest.php b/tests/Node/ChildrenTest.php index bb90eab8..86397a5c 100644 --- a/tests/Node/ChildrenTest.php +++ b/tests/Node/ChildrenTest.php @@ -41,4 +41,24 @@ public function testPreviousSibling() $child2->setParent($parent); $this->assertEquals($child->id(), $child2->previousSibling()->id()); } + + public function testGetChildren() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child->setParent($parent); + $child2->setParent($parent); + $this->assertEquals($child->id(), $parent->getChildren()[0]->id()); + } + + public function testCountChildren() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child->setParent($parent); + $child2->setParent($parent); + $this->assertEquals(2, $parent->countChildren()); + } } diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index ef1e4d49..8bd58019 100644 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -1,5 +1,6 @@ setAttribute('class', 'foo'); $this->assertEquals('foo', $node->getAttribute('class')); } + + public function testCountable() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + 
$parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals(count($parent->getChildren()), count($parent)); + } + + public function testIterator() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $children = 0; + foreach ($parent as $child) { + ++$children; + } + $this->assertEquals(2, $children); + } } From 5e554f297f66bd44f58dc6a4d45d0eea5e7805f8 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 13 Sep 2015 09:45:07 -0400 Subject: [PATCH 014/200] Version 1.6.7 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 465f0896..ca785c70 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 1.6.6 +Version 1.6.7 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) From 31f1b09484e534519648b8ef9411e38d8832428f Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 13 Sep 2015 14:33:10 -0400 Subject: [PATCH 015/200] Updated composer --- composer.json | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/composer.json b/composer.json index 5f985092..4ba19ec1 100644 --- a/composer.json +++ b/composer.json @@ -15,17 +15,18 @@ ], "require": { "php": ">=5.4", - "paquettg/string-encode": "0.1.0" + "paquettg/string-encode": "~0.1.0" }, "require-dev": { - 
"phpunit/phpunit": "4.8.*", - "satooshi/php-coveralls": "0.6.*", - "mockery/mockery": "0.9.*" + "phpunit/phpunit": "~4.8.0", + "satooshi/php-coveralls": "~0.6.0", + "mockery/mockery": "~0.9.0" }, "autoload": { "psr-0": { "PHPHtmlParser": "src/" } }, - "minimum-stability": "dev" + "minimum-stability": "dev", + "prefer-stable": true } From 1e0e1c997e30809fdde76c2572edfc71d6b83162 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 13 Sep 2015 15:09:03 -0400 Subject: [PATCH 016/200] Updated in code documentation --- src/PHPHtmlParser/Content.php | 4 +-- src/PHPHtmlParser/Dom.php | 36 ++++++++++--------- src/PHPHtmlParser/Dom/AbstractNode.php | 17 +++++---- src/PHPHtmlParser/Dom/ArrayNode.php | 1 - src/PHPHtmlParser/Dom/Collection.php | 26 +++++++++++--- src/PHPHtmlParser/Dom/HtmlNode.php | 10 +++--- src/PHPHtmlParser/Dom/MockNode.php | 2 +- src/PHPHtmlParser/Dom/Tag.php | 9 ++--- .../Exceptions/EmptyCollectionException.php | 4 +++ src/PHPHtmlParser/Options.php | 9 ++++- src/PHPHtmlParser/Selector.php | 10 +++--- src/PHPHtmlParser/StaticDom.php | 6 ++-- 12 files changed, 84 insertions(+), 50 deletions(-) create mode 100644 src/PHPHtmlParser/Exceptions/EmptyCollectionException.php diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 92e015d2..392ef7d0 100644 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -72,7 +72,7 @@ public function char($char = null) * Moves the current position forward. * * @param int $count - * @chainable + * @return $this */ public function fastForward($count) { @@ -84,7 +84,7 @@ public function fastForward($count) * Moves the current position backward. 
* * @param int $count - * @chainable + * @return $this */ public function rewind($count) { diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 37e347b1..856a9011 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -108,8 +108,8 @@ public function __get($name) * Attempts to load the dom from any resource, string, file, or URL. * * @param string $str - * @param array $option - * @chainable + * @param array $options + * @return $this */ public function load($str, $options = []) { @@ -131,8 +131,8 @@ public function load($str, $options = []) * Loads the dom from a document file/url * * @param string $file - * @param array $option - * @chainable + * @param array $options + * @return $this */ public function loadFromFile($file, $options = []) { @@ -144,9 +144,9 @@ public function loadFromFile($file, $options = []) * the content from a url. * * @param string $url - * @param array $option + * @param array $options * @param CurlInterface $curl - * @chainable + * @return $this */ public function loadFromUrl($url, $options = [], CurlInterface $curl = null) { @@ -166,7 +166,7 @@ public function loadFromUrl($url, $options = [], CurlInterface $curl = null) * * @param string $str * @param array $option - * @chainable + * @return $this */ public function loadStr($str, $option) { @@ -192,7 +192,7 @@ public function loadStr($str, $option) * Sets a global options array to be used by all load calls. * * @param array $options - * @chainable + * @return $this */ public function setOptions(array $options) { @@ -218,7 +218,7 @@ public function find($selector, $nth = null) * be self closing. * * @param string|array $tag - * @chainable + * @return $this */ public function addSelfClosingTag($tag) { @@ -238,7 +238,7 @@ public function addSelfClosingTag($tag) * always be self closing. 
* * @param string|array $tag - * @chainable + * @return $this */ public function removeSelfClosingTag($tag) { @@ -253,7 +253,7 @@ public function removeSelfClosingTag($tag) /** * Sets the list of self closing tags to empty. * - * @chainable + * @return $this */ public function clearSelfClosingTags() { @@ -264,7 +264,7 @@ public function clearSelfClosingTags() /** * Simple wrapper function that returns the first child. * - * @return Node + * @return \PHPHtmlParser\Dom\AbstractNode */ public function firstChild() { @@ -275,7 +275,7 @@ public function firstChild() /** * Simple wrapper function that returns the last child. * - * @return AbstractNode + * @return \PHPHtmlParser\Dom\AbstractNode */ public function lastChild() { @@ -287,7 +287,8 @@ public function lastChild() * Simple wrapper function that returns an element by the * id. * - * @return AbstractNode + * @param string $id + * @return \PHPHtmlParser\Dom\AbstractNode */ public function getElementById($id) { @@ -298,7 +299,8 @@ public function getElementById($id) /** * Simple wrapper function that returns all elements by * tag name. - * + * + * @param string $name * @return array */ public function getElementsByTag($name) @@ -311,6 +313,7 @@ public function getElementsByTag($name) * Simple wrapper function that returns all elements by * class name. * + * @param string $class * @return array */ public function getElementsByClass($class) @@ -442,7 +445,8 @@ protected function parse() * Attempt to parse a tag out of the content. 
* * @return array - */ + * @throws StrictException + */ protected function parseTag() { $return = [ diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 189d2249..46040c67 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -19,7 +19,7 @@ abstract class AbstractNode { /** * Contains the tag name/type * - * @var string + * @var \PHPHtmlParser\Dom\Tag */ protected $tag; @@ -40,7 +40,7 @@ abstract class AbstractNode { /** * Contains the parent Node. * - * @var Node + * @var AbstractNode */ protected $parent = null; @@ -121,7 +121,7 @@ public function id() /** * Returns the parent of node. * - * @return Node + * @return AbstractNode */ public function getParent() { @@ -132,7 +132,7 @@ public function getParent() * Sets the parent node. * * @param AbstractNode $parent - * @chainable + * @return $this * @throws CircularException */ public function setParent(AbstractNode $parent) @@ -251,12 +251,11 @@ public function countChildren() * * @param AbstractNode $child * @return bool - * @throws CircularExceptionException + * @throws CircularException */ public function addChild(AbstractNode $child) { - $key = null; - $newKey = 0; + $key = null; // check integrity if ($this->isAncestor($child->id())) @@ -302,7 +301,7 @@ public function addChild(AbstractNode $child) * Removes the child by id. 
* * @param int $id - * @chainable + * @return $this */ public function removeChild($id) { @@ -533,7 +532,7 @@ public function getAttribute($key) * * @param string $key * @param string $value - * @chainable + * @return $this */ public function setAttribute($key, $value) { diff --git a/src/PHPHtmlParser/Dom/ArrayNode.php b/src/PHPHtmlParser/Dom/ArrayNode.php index 3e341b3c..e37ff719 100644 --- a/src/PHPHtmlParser/Dom/ArrayNode.php +++ b/src/PHPHtmlParser/Dom/ArrayNode.php @@ -4,7 +4,6 @@ use Countable; use ArrayIterator; use IteratorAggregate; -use PHPHtmlParser\Exceptions\IncorrectChildMethodException; /** * Dom node object which will allow users to use it as diff --git a/src/PHPHtmlParser/Dom/Collection.php b/src/PHPHtmlParser/Dom/Collection.php index 45c4f320..adf59bb1 100644 --- a/src/PHPHtmlParser/Dom/Collection.php +++ b/src/PHPHtmlParser/Dom/Collection.php @@ -5,6 +5,7 @@ use ArrayAccess; use ArrayIterator; use IteratorAggregate; +use PHPHtmlParser\Exceptions\EmptyCollectionException; class Collection implements IteratorAggregate, ArrayAccess, Countable { @@ -22,7 +23,8 @@ class Collection implements IteratorAggregate, ArrayAccess, Countable { * @param string $method * @param array $arguments * @return mixed; - */ + * @throws EmptyCollectionException + */ public function __call($method, $arguments) { $node = reset($this->collection); @@ -30,6 +32,10 @@ public function __call($method, $arguments) { return call_user_func_array([$node, $method], $arguments); } + else + { + throw new EmptyCollectionException('The collection does not contain any Nodes.'); + } } /** @@ -38,7 +44,8 @@ public function __call($method, $arguments) * * @param mixed $key * @return mixed - */ + * @throws EmptyCollectionException + */ public function __get($key) { $node = reset($this->collection); @@ -46,6 +53,10 @@ public function __get($key) { return $node->$key; } + else + { + throw new EmptyCollectionException('The collection does not contain any Nodes.'); + } } /** @@ -53,7 +64,8 @@ 
public function __get($key) * the collection. * * @return string - */ + * @throws EmptyCollectionException + */ public function __toString() { $node = reset($this->collection); @@ -61,6 +73,10 @@ public function __toString() { return (string) $node; } + else + { + throw new EmptyCollectionException('The collection does not contain any Nodes.'); + } } /** @@ -126,9 +142,9 @@ public function offsetUnset($offset) * Gets a node at the given offset, or null * * @param mixed $offset - * @return $offset + * @return mixed */ - public function offsetGet($offset) + public function offsetGet($offset) { return isset($this->collection[$offset]) ? $this->collection[$offset] : null; } diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 98a53286..ba442367 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -7,19 +7,19 @@ class HtmlNode extends ArrayNode { /** - * Remembers what the innerHtml was if it was scaned previously. + * Remembers what the innerHtml was if it was scanned previously. */ protected $innerHtml = null; /** - * Remembers what the outerHtml was if it was scaned previously. + * Remembers what the outerHtml was if it was scanned previously. * * @var string */ protected $outerHtml = null; /** - * Remembers what the text was if it was scaned previously. + * Remembers what the text was if it was scanned previously. * * @var string */ @@ -35,7 +35,9 @@ class HtmlNode extends ArrayNode { /** * Sets up the tag of this node. - */ + * + * @param $tag + */ public function __construct($tag) { if ( ! $tag instanceof Tag) diff --git a/src/PHPHtmlParser/Dom/MockNode.php b/src/PHPHtmlParser/Dom/MockNode.php index 2d7256fe..66780e21 100644 --- a/src/PHPHtmlParser/Dom/MockNode.php +++ b/src/PHPHtmlParser/Dom/MockNode.php @@ -4,7 +4,7 @@ /** * This mock object is used solely for testing the abstract * class Node with out any potential side effects caused - * by testing a supperclass of Node. 
+ * by testing a supper class of Node. * * This object is not to be used for any other reason. */ diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index e2699630..1f70ca7b 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -67,7 +67,7 @@ public function name() /** * Sets the tag to be self closing. * - * @chainable + * @return $this */ public function selfClosing() { @@ -93,7 +93,8 @@ public function setEncoding(Encode $encode) /** * Sets the noise for this tag (if any) * - * @chainable + * @param $noise + * @return $this */ public function noise($noise) { @@ -106,7 +107,7 @@ public function noise($noise) * * @param string $key * @param mixed $value - * @chainable + * @return $this */ public function setAttribute($key, $value) { @@ -127,7 +128,7 @@ public function setAttribute($key, $value) * Sets the attributes for this tag * * @param array $attr - * @chainable + * @return $this */ public function setAttributes(array $attr) { diff --git a/src/PHPHtmlParser/Exceptions/EmptyCollectionException.php b/src/PHPHtmlParser/Exceptions/EmptyCollectionException.php new file mode 100644 index 00000000..a9cf2cb5 --- /dev/null +++ b/src/PHPHtmlParser/Exceptions/EmptyCollectionException.php @@ -0,0 +1,4 @@ +selectors as $selector) @@ -166,6 +167,7 @@ protected function parseSelectorString($selector) * @param array $nodes * @param array $rule * @param array $options + * @return array * @recursive */ protected function seek(array $nodes, array $rule, array $options) diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index 0f053cf2..634ab7bd 100644 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -55,7 +55,7 @@ class_alias(__CLASS__, $className); * new object. * * @param string $str - * @chainable + * @return $this */ public static function load($str) { @@ -69,7 +69,7 @@ public static function load($str) * new object. 
* @param string $file - * @chainable + * @return $this */ public static function loadFromFile($file) { @@ -84,7 +84,7 @@ public static function loadFromFile($file) * * @param string $url * @param CurlInterface $curl - * @chainable + * @return $this */ public static function loadFromUrl($url, CurlInterface $curl = null) { From 61fcb736584df7766192720508bdf32be2432409 Mon Sep 17 00:00:00 2001 From: akond Date: Wed, 16 Sep 2015 20:30:59 +0300 Subject: [PATCH 017/200] Refactoring out isChild method. --- src/PHPHtmlParser/Dom/AbstractNode.php | 27 +++++++++++++++++++++++--- tests/Node/ChildrenTest.php | 14 +++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 46040c67..06dd6df5 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -358,13 +358,13 @@ public function previousChild($id) } /** - * Checks if the given node id is a decendant of the + * Checks if the given node id is a child of the * current node. * * @param int $id * @return bool */ - public function isDescendant($id) + public function isChild ($id) { foreach ($this->children as $childId => $child) { @@ -372,7 +372,28 @@ public function isDescendant($id) { return true; } - elseif ($child['node']->hasChildren()) + } + + return false; + } + + /** + * Checks if the given node id is a decendant of the + * current node. 
+ * + * @param int $id + * @return bool + */ + public function isDescendant($id) + { + if ($this->isChild ($id)) + { + return true; + } + + foreach ($this->children as $childId => $child) + { + if ($child['node']->hasChildren()) { if ($child['node']->isDescendant($id)) { diff --git a/tests/Node/ChildrenTest.php b/tests/Node/ChildrenTest.php index 86397a5c..65fe0f35 100644 --- a/tests/Node/ChildrenTest.php +++ b/tests/Node/ChildrenTest.php @@ -61,4 +61,18 @@ public function testCountChildren() $child2->setParent($parent); $this->assertEquals(2, $parent->countChildren()); } + + public function testIsChild () + { + $parent = new Node; + $child1 = new Node; + $child2 = new Node; + + $child1->setParent($parent); + $child2->setParent($child1); + + $this->assertTrue ($parent->isChild ($child1->id ())); + $this->assertTrue ($parent->isDescendant ($child2->id ())); + $this->assertFalse ($parent->isChild ($child2->id ())); + } } From 8057f8e1e18409a1a353efde406152c72eaa9c87 Mon Sep 17 00:00:00 2001 From: Lukas Rosenstock Date: Sun, 25 Oct 2015 19:09:15 +0100 Subject: [PATCH 018/200] Added options: cleanupInput, removeScripts and removeStyles --- .gitignore | 2 + src/PHPHtmlParser/Dom.php | 50 +++++++++++++++--------- src/PHPHtmlParser/Options.php | 7 +++- tests/Options/CleanupTest.php | 73 +++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 21 deletions(-) create mode 100644 .gitignore create mode 100644 tests/Options/CleanupTest.php diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7579f743 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor +composer.lock diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 856a9011..06dc80e9 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -8,7 +8,7 @@ use stringEncode\Encode; class Dom { - + /** * The charset we would like the output to be in. 
* @@ -43,7 +43,7 @@ class Dom { * @var int */ protected $rawSize; - + /** * The size of the document after it is cleaned. * @@ -59,7 +59,7 @@ class Dom { protected $globalOptions = []; /** - * A persistent option object to be used for all options in the + * A persistent option object to be used for all options in the * parsing of the file. * * @var Options @@ -232,7 +232,7 @@ public function addSelfClosingTag($tag) } return $this; } - + /** * Removes the tag (or tags in an array) from the list of tags that will * always be self closing. @@ -297,7 +297,7 @@ public function getElementById($id) } /** - * Simple wrapper function that returns all elements by + * Simple wrapper function that returns all elements by * tag name. * * @param string $name @@ -343,6 +343,12 @@ protected function isLoaded() */ protected function clean($str) { + if ($this->options->get('cleanupInput') != true) + { + // skip entire cleanup step + return $str; + } + // clean out the \n\r $str = str_replace(["\r\n", "\r", "\n"], ' ', $str); @@ -351,24 +357,30 @@ protected function clean($str) // strip out comments $str = mb_eregi_replace("", '', $str); - + // strip out cdata $str = mb_eregi_replace("", '', $str); - + // strip out -

....

'); - $this->assertEquals('....', $dom->getElementsByTag('p')[1]->innerHtml); - } - - public function testMultipleDoubleQuotes() - { - $dom = new Dom; - $dom->load('Hello'); - $this->assertEquals('This is a "test" of double quotes', $dom->getElementsByTag('a')[0]->title); - } - - public function testMultipleSingleQuotes() - { - $dom = new Dom; - $dom->load("Hello"); - $this->assertEquals("Ain't this the best", $dom->getElementsByTag('a')[0]->title); - } - - public function testBeforeClosingTag() - { - $dom = new Dom; - $dom->load("
"); - $this->assertEquals("
", (string) $dom); - } - - public function testCodeTag() - { - $dom = new Dom; - $dom->load('hello$foo = "bar";'); - $this->assertEquals('hello$foo = "bar";', (string) $dom); - } - - public function testDeleteNode() - { - $dom = new Dom; - $dom->load('

Hey bro, click here
:)

'); - $a = $dom->find('a')[0]; - $a->delete(); - unset($a); - $this->assertEquals('

Hey bro,
:)

', (string) $dom); - } + } + + public function testLoadFromUrl() + { + $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); + $curl->shouldReceive('get') + ->once() + ->with('http://google.com') + ->andReturn(file_get_contents('tests/files/small.html')); + + $dom = new Dom; + $dom->loadFromUrl('http://google.com', [], $curl); + $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); + } + + public function testToStringMagic() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + $this->assertEquals('

Hey bro, click here
:)

', (string) $dom); + } + + public function testGetMagic() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + $this->assertEquals('

Hey bro, click here
:)

', $dom->innerHtml); + } + + public function testFirstChild() + { + $dom = new Dom; + $dom->load('

Hey bro, click here


'); + $this->assertEquals('

Hey bro, click here

', $dom->firstChild()->outerHtml); + } + + public function testLastChild() + { + $dom = new Dom; + $dom->load('

Hey bro, click here


'); + $this->assertEquals('
', $dom->lastChild()->outerHtml); + } + + public function testGetElementById() + { + $dom = new Dom; + $dom->load('

Hey bro, click here


'); + $this->assertEquals('click here', $dom->getElementById('78')->outerHtml); + } + + public function testGetElementsByTag() + { + $dom = new Dom; + $dom->load('

Hey bro, click here


'); + $this->assertEquals('

Hey bro, click here

', $dom->getElementsByTag('p')[0]->outerHtml); + } + + public function testGetElementsByClass() + { + $dom = new Dom; + $dom->load('

Hey bro, click here


'); + $this->assertEquals('

Hey bro, click here

', $dom->getElementsByClass('all')[0]->innerHtml); + } + + public function testEnforceEncoding() + { + $dom = new Dom; + $dom->load('tests/files/horrible.html', [ + 'enforceEncoding' => 'UTF-8', + ]); + $this->assertNotEquals('', $dom->find('table input', 1)->outerHtml); + } + + public function testScriptCleanerScriptTag() + { + $dom = new Dom; + $dom->load(' +

.....

+ +

....

'); + $this->assertEquals('....', $dom->getElementsByTag('p')[1]->innerHtml); + } + + public function testMultipleDoubleQuotes() + { + $dom = new Dom; + $dom->load('Hello'); + $this->assertEquals('This is a "test" of double quotes', $dom->getElementsByTag('a')[0]->title); + } + + public function testMultipleSingleQuotes() + { + $dom = new Dom; + $dom->load("Hello"); + $this->assertEquals("Ain't this the best", $dom->getElementsByTag('a')[0]->title); + } + + public function testBeforeClosingTag() + { + $dom = new Dom; + $dom->load("
"); + $this->assertEquals("
", (string) $dom); + } + + public function testCodeTag() + { + $dom = new Dom; + $dom->load('hello$foo = "bar";'); + $this->assertEquals('hello$foo = "bar";', (string) $dom); + } + + public function testDeleteNode() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + $a = $dom->find('a')[0]; + $a->delete(); + unset($a); + $this->assertEquals('

Hey bro,
:)

', (string) $dom); + } } diff --git a/tests/Node/ChildrenTest.php b/tests/Node/ChildrenTest.php index 5824c9c6..e8b822da 100644 --- a/tests/Node/ChildrenTest.php +++ b/tests/Node/ChildrenTest.php @@ -4,115 +4,115 @@ class NodeChildTest extends PHPUnit_Framework_TestCase { - public function testGetParent() - { - $parent = new Node; - $child = new Node; - $child->setParent($parent); - $this->assertEquals($parent->id(), $child->getParent()->id()); - } + public function testGetParent() + { + $parent = new Node; + $child = new Node; + $child->setParent($parent); + $this->assertEquals($parent->id(), $child->getParent()->id()); + } - public function testSetParentTwice() - { - $parent = new Node; - $parent2 = new Node; - $child = new Node; - $child->setParent($parent); - $child->setParent($parent2); - $this->assertEquals($parent2->id(), $child->getParent()->id()); - } + public function testSetParentTwice() + { + $parent = new Node; + $parent2 = new Node; + $child = new Node; + $child->setParent($parent); + $child->setParent($parent2); + $this->assertEquals($parent2->id(), $child->getParent()->id()); + } - public function testNextSibling() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child->setParent($parent); - $child2->setParent($parent); - $this->assertEquals($child2->id(), $child->nextSibling()->id()); - } + public function testNextSibling() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child->setParent($parent); + $child2->setParent($parent); + $this->assertEquals($child2->id(), $child->nextSibling()->id()); + } - /** - * @expectedException PHPHtmlParser\Exceptions\ChildNotFoundException - */ - public function testNextSiblingNotFound() - { - $parent = new Node; - $child = new Node; - $child->setParent($parent); - $child->nextSibling(); - } + /** + * @expectedException PHPHtmlParser\Exceptions\ChildNotFoundException + */ + public function testNextSiblingNotFound() + { + $parent = new Node; + $child = new Node; + 
$child->setParent($parent); + $child->nextSibling(); + } - /** - * @expectedException PHPHtmlParser\Exceptions\ParentNotFoundException - */ - public function testNextSiblingNoParent() - { - $child = new Node; - $child->nextSibling(); - } + /** + * @expectedException PHPHtmlParser\Exceptions\ParentNotFoundException + */ + public function testNextSiblingNoParent() + { + $child = new Node; + $child->nextSibling(); + } - public function testPreviousSibling() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child->setParent($parent); - $child2->setParent($parent); - $this->assertEquals($child->id(), $child2->previousSibling()->id()); - } + public function testPreviousSibling() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child->setParent($parent); + $child2->setParent($parent); + $this->assertEquals($child->id(), $child2->previousSibling()->id()); + } - /** - * @expectedException PHPHtmlParser\Exceptions\ChildNotFoundException - */ - public function testPreviousSiblingNotFound() - { - $parent = new Node; - $node = new Node; - $node->setParent($parent); - $node->previousSibling(); - } + /** + * @expectedException PHPHtmlParser\Exceptions\ChildNotFoundException + */ + public function testPreviousSiblingNotFound() + { + $parent = new Node; + $node = new Node; + $node->setParent($parent); + $node->previousSibling(); + } - /** - * @expectedException PHPHtmlParser\Exceptions\ParentNotFoundException - */ - public function testPreviousSiblingNoParent() - { - $child = new Node; - $child->previousSibling(); - } + /** + * @expectedException PHPHtmlParser\Exceptions\ParentNotFoundException + */ + public function testPreviousSiblingNoParent() + { + $child = new Node; + $child->previousSibling(); + } - public function testGetChildren() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child->setParent($parent); - $child2->setParent($parent); - $this->assertEquals($child->id(), $parent->getChildren()[0]->id()); - 
} + public function testGetChildren() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child->setParent($parent); + $child2->setParent($parent); + $this->assertEquals($child->id(), $parent->getChildren()[0]->id()); + } - public function testCountChildren() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child->setParent($parent); - $child2->setParent($parent); - $this->assertEquals(2, $parent->countChildren()); - } + public function testCountChildren() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child->setParent($parent); + $child2->setParent($parent); + $this->assertEquals(2, $parent->countChildren()); + } - public function testIsChild () - { - $parent = new Node; - $child1 = new Node; - $child2 = new Node; + public function testIsChild () + { + $parent = new Node; + $child1 = new Node; + $child2 = new Node; - $child1->setParent($parent); - $child2->setParent($child1); + $child1->setParent($parent); + $child2->setParent($child1); - $this->assertTrue ($parent->isChild ($child1->id ())); - $this->assertTrue ($parent->isDescendant ($child2->id ())); - $this->assertFalse ($parent->isChild ($child2->id ())); - } + $this->assertTrue ($parent->isChild ($child1->id ())); + $this->assertTrue ($parent->isDescendant ($child2->id ())); + $this->assertFalse ($parent->isChild ($child2->id ())); + } } diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index 9a4b456d..e6998305 100644 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -8,452 +8,452 @@ class NodeHtmlTest extends PHPUnit_Framework_TestCase { - public function testInnerHtml() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new 
HtmlNode($a); - $childbr = new HtmlNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $this->assertEquals("link
", $parent->innerHtml()); - } - - public function testInnerHtmlTwice() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new HtmlNode($a); - $childa->setAttribute('href', [ + public function testInnerHtml() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals("link
", $parent->innerHtml()); + } + + public function testInnerHtmlTwice() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childa->setAttribute('href', [ 'value' => 'http://google.com', 'doubleQuote' => false, - ]); - $childbr = new HtmlNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $inner = $parent->innerHtml(); - $this->assertEquals($inner, $parent->innerHtml()); - } - - /** - * @expectedException PHPHtmlParser\Exceptions\UnknownChildTypeException - */ - public function testInnerHtmlUnkownChild() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new HtmlNode($a); - $childbr = new MockNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $inner = $parent->innerHtml(); - $this->assertEquals($inner, $parent->innerHtml()); - } - - public function testInnerHtmlMagic() - { - $parent = new HtmlNode('div'); - $parent->tag->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $childa = new HtmlNode('a'); - $childa->getTag()->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $childbr = new HtmlNode('br'); - $childbr->getTag()->selfClosing(); - - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $this->assertEquals("link
", $parent->innerHtml); - } - - public function testOuterHtml() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new HtmlNode($a); - $childbr = new HtmlNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $this->assertEquals('', $parent->outerHtml()); - } - - public function testOuterHtmlTwice() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new HtmlNode($a); - $childbr = new HtmlNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $outer = $parent->outerHtml(); - $this->assertEquals($outer, $parent->outerHtml()); - } - - public function testOuterHtmlEmpty() - { - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $node = new HtmlNode($a); - - $this->assertEquals("", $node->OuterHtml()); - } - - public function testOuterHtmlMagic() - { - $parent = new HtmlNode('div'); - $parent->getTag()->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $childa = new HtmlNode('a'); - $childa->getTag()->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $childbr = new HtmlNode('br'); - $childbr->getTag()->selfClosing(); - - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new 
TextNode('link')); - - $this->assertEquals('', $parent->outerHtml); - } - - public function testOuterHtmlNoValueAttribute() - { - $parent = new HtmlNode('div'); - $parent->setAttribute('class', [ - 'value' => 'all', - 'doubleQuote' => true, - ]); - $childa = new HtmlNode('a'); - $childa->setAttribute('href', [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ]); - $childa->setAttribute('ui-view', null); - $childbr = new HtmlNode('br'); - $childbr->getTag()->selfClosing(); - - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $this->assertEquals('', $parent->outerHtml); - } - - public function testText() - { - $a = new Tag('a'); - $node = new HtmlNode($a); - $node->addChild(new TextNode('link')); - - $this->assertEquals('link', $node->text()); - } - - public function testTextTwice() - { - $a = new Tag('a'); - $node = new HtmlNode($a); - $node->addChild(new TextNode('link')); - - $text = $node->text(); - $this->assertEquals($text, $node->text()); - } - - public function testTextNone() - { - $a = new Tag('a'); - $node = new HtmlNode($a); - - $this->assertEmpty($node->text()); - } - - public function testTextMagic() - { - $node = new HtmlNode('a'); - $node->addChild(new TextNode('link')); - - $this->assertEquals('link', $node->text); - } - - public function testTextLookInChildren() - { - $p = new HtmlNode('p'); - $a = new HtmlNode('a'); - $a->addChild(new TextNode('click me')); - $p->addChild(new TextNode('Please ')); - $p->addChild($a); - $p->addChild(new TextNode('!')); - $node = new HtmlNode('div'); - $node->addChild($p); - - $this->assertEquals('Please click me!', $node->text(true)); - } - - public function testTextLookInChildrenAndNoChildren() - { - $p = new HtmlNode('p'); - $a = new HtmlNode('a'); - $a->addChild(new TextNode('click me')); - $p->addChild(new TextNode('Please ')); - $p->addChild($a); - $p->addChild(new TextNode('!')); - - $p->text; - $p->text(true); - - $this->assertEquals('Please 
click me!', $p->text(true)); - } - - public function testGetAttribute() - { - $node = new HtmlNode('a'); - $node->getTag()->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - 'class' => [ - 'value' => 'outerlink rounded', - 'doubleQuote' => true, - ], - ]); - - $this->assertEquals('outerlink rounded', $node->getAttribute('class')); - } - - public function testGetAttributeMagic() - { - $node = new HtmlNode('a'); - $node->getTag()->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - 'class' => [ - 'value' => 'outerlink rounded', - 'doubleQuote' => true, - ], - ]); - - $this->assertEquals('http://google.com', $node->href); - } - - public function testGetAttributes() - { - $node = new HtmlNode('a'); - $node->getTag()->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - 'class' => [ - 'value' => 'outerlink rounded', - 'doubleQuote' => true, - ], - ]); - - $this->assertEquals('outerlink rounded', $node->getAttributes()['class']); - } - - public function testSetAttribute() - { - $node = new HtmlNode('a'); - $node->setAttribute('class', 'foo'); - $this->assertEquals('foo', $node->getAttribute('class')); - } - - public function testRemoveAttribute() - { - $node = new HtmlNode('a'); - $node->setAttribute('class', 'foo'); - $node->removeAttribute('class'); - $this->assertnull($node->getAttribute('class')); - } - - public function testRemoveAllAttributes() - { - $node = new HtmlNode('a'); - $node->setAttribute('class', 'foo'); - $node->setAttribute('href', 'http://google.com'); - $node->removeAllAttributes(); - $this->assertEquals(0, count($node->getAttributes())); - } - - public function testCountable() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - 
], - ]); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new HtmlNode($a); - $childbr = new HtmlNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $this->assertEquals(count($parent->getChildren()), count($parent)); - } - - public function testIterator() - { - $div = new Tag('div'); - $div->setAttributes([ - 'class' => [ - 'value' => 'all', - 'doubleQuote' => true, - ], - ]); - $a = new Tag('a'); - $a->setAttributes([ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => false, - ], - ]); - $br = new Tag('br'); - $br->selfClosing(); - - $parent = new HtmlNode($div); - $childa = new HtmlNode($a); - $childbr = new HtmlNode($br); - $parent->addChild($childa); - $parent->addChild($childbr); - $childa->addChild(new TextNode('link')); - - $children = 0; - foreach ($parent as $child) { - ++$children; - } - $this->assertEquals(2, $children); - } - - /** - * @expectedException PHPHtmlParser\Exceptions\ParentNotFoundException - */ - public function testAncestorByTagFailure() - { - $a = new Tag('a'); - $node = new HtmlNode($a); - $node->ancestorByTag('div'); - } + ]); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $inner = $parent->innerHtml(); + $this->assertEquals($inner, $parent->innerHtml()); + } + + /** + * @expectedException PHPHtmlParser\Exceptions\UnknownChildTypeException + */ + public function testInnerHtmlUnkownChild() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new MockNode($br); + $parent->addChild($childa); + 
$parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $inner = $parent->innerHtml(); + $this->assertEquals($inner, $parent->innerHtml()); + } + + public function testInnerHtmlMagic() + { + $parent = new HtmlNode('div'); + $parent->tag->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $childa = new HtmlNode('a'); + $childa->getTag()->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $childbr = new HtmlNode('br'); + $childbr->getTag()->selfClosing(); + + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals("link
", $parent->innerHtml); + } + + public function testOuterHtml() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals('', $parent->outerHtml()); + } + + public function testOuterHtmlTwice() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $outer = $parent->outerHtml(); + $this->assertEquals($outer, $parent->outerHtml()); + } + + public function testOuterHtmlEmpty() + { + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $node = new HtmlNode($a); + + $this->assertEquals("", $node->OuterHtml()); + } + + public function testOuterHtmlMagic() + { + $parent = new HtmlNode('div'); + $parent->getTag()->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $childa = new HtmlNode('a'); + $childa->getTag()->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $childbr = new HtmlNode('br'); + $childbr->getTag()->selfClosing(); + + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new 
TextNode('link')); + + $this->assertEquals('', $parent->outerHtml); + } + + public function testOuterHtmlNoValueAttribute() + { + $parent = new HtmlNode('div'); + $parent->setAttribute('class', [ + 'value' => 'all', + 'doubleQuote' => true, + ]); + $childa = new HtmlNode('a'); + $childa->setAttribute('href', [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ]); + $childa->setAttribute('ui-view', null); + $childbr = new HtmlNode('br'); + $childbr->getTag()->selfClosing(); + + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals('', $parent->outerHtml); + } + + public function testText() + { + $a = new Tag('a'); + $node = new HtmlNode($a); + $node->addChild(new TextNode('link')); + + $this->assertEquals('link', $node->text()); + } + + public function testTextTwice() + { + $a = new Tag('a'); + $node = new HtmlNode($a); + $node->addChild(new TextNode('link')); + + $text = $node->text(); + $this->assertEquals($text, $node->text()); + } + + public function testTextNone() + { + $a = new Tag('a'); + $node = new HtmlNode($a); + + $this->assertEmpty($node->text()); + } + + public function testTextMagic() + { + $node = new HtmlNode('a'); + $node->addChild(new TextNode('link')); + + $this->assertEquals('link', $node->text); + } + + public function testTextLookInChildren() + { + $p = new HtmlNode('p'); + $a = new HtmlNode('a'); + $a->addChild(new TextNode('click me')); + $p->addChild(new TextNode('Please ')); + $p->addChild($a); + $p->addChild(new TextNode('!')); + $node = new HtmlNode('div'); + $node->addChild($p); + + $this->assertEquals('Please click me!', $node->text(true)); + } + + public function testTextLookInChildrenAndNoChildren() + { + $p = new HtmlNode('p'); + $a = new HtmlNode('a'); + $a->addChild(new TextNode('click me')); + $p->addChild(new TextNode('Please ')); + $p->addChild($a); + $p->addChild(new TextNode('!')); + + $p->text; + $p->text(true); + + $this->assertEquals('Please 
click me!', $p->text(true)); + } + + public function testGetAttribute() + { + $node = new HtmlNode('a'); + $node->getTag()->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + 'class' => [ + 'value' => 'outerlink rounded', + 'doubleQuote' => true, + ], + ]); + + $this->assertEquals('outerlink rounded', $node->getAttribute('class')); + } + + public function testGetAttributeMagic() + { + $node = new HtmlNode('a'); + $node->getTag()->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + 'class' => [ + 'value' => 'outerlink rounded', + 'doubleQuote' => true, + ], + ]); + + $this->assertEquals('http://google.com', $node->href); + } + + public function testGetAttributes() + { + $node = new HtmlNode('a'); + $node->getTag()->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + 'class' => [ + 'value' => 'outerlink rounded', + 'doubleQuote' => true, + ], + ]); + + $this->assertEquals('outerlink rounded', $node->getAttributes()['class']); + } + + public function testSetAttribute() + { + $node = new HtmlNode('a'); + $node->setAttribute('class', 'foo'); + $this->assertEquals('foo', $node->getAttribute('class')); + } + + public function testRemoveAttribute() + { + $node = new HtmlNode('a'); + $node->setAttribute('class', 'foo'); + $node->removeAttribute('class'); + $this->assertnull($node->getAttribute('class')); + } + + public function testRemoveAllAttributes() + { + $node = new HtmlNode('a'); + $node->setAttribute('class', 'foo'); + $node->setAttribute('href', 'http://google.com'); + $node->removeAllAttributes(); + $this->assertEquals(0, count($node->getAttributes())); + } + + public function testCountable() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + 
], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals(count($parent->getChildren()), count($parent)); + } + + public function testIterator() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $children = 0; + foreach ($parent as $child) { + ++$children; + } + $this->assertEquals(2, $children); + } + + /** + * @expectedException PHPHtmlParser\Exceptions\ParentNotFoundException + */ + public function testAncestorByTagFailure() + { + $a = new Tag('a'); + $node = new HtmlNode($a); + $node->ancestorByTag('div'); + } } diff --git a/tests/Node/ParentTest.php b/tests/Node/ParentTest.php index c5de93e4..9170c947 100644 --- a/tests/Node/ParentTest.php +++ b/tests/Node/ParentTest.php @@ -4,216 +4,216 @@ class NodeParentTest extends PHPUnit_Framework_TestCase { - public function testHasChild() - { - $parent = new Node; - $child = new Node; - $parent->addChild($child); - $this->assertTrue($parent->hasChildren()); - } - - public function testHasChildNoChildren() - { - $node = new Node; - $this->assertFalse($node->hasChildren()); - } - - public function testAddChild() - { - $parent = new Node; - $child = new Node; - $this->assertTrue($parent->addChild($child)); - } - - public function testAddChildTwoParent() - { - $parent = new Node; - $parent2 = new Node; - $child = new Node; - $parent->addChild($child); - 
$parent2->addChild($child); - $this->assertFalse($parent->hasChildren()); - } - - public function testGetChild() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - $this->assertTrue($parent->getChild($child2->id()) instanceof Node); - } - - public function testRemoveChild() - { - $parent = new Node; - $child = new Node; - $parent->addChild($child); - $parent->removeChild($child->id()); - $this->assertFalse($parent->hasChildren()); - } - - public function testRemoveChildNotExists() - { - $parent = new Node; - $parent->removeChild(1); - $this->assertFalse($parent->hasChildren()); - } - - public function testNextChild() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - - $this->assertEquals($child2->id(), $parent->nextChild($child->id())->id()); - } - - public function testNextChildWithRemove() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child3 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - $parent->addChild($child3); - - $parent->removeChild($child2->id()); - $this->assertEquals($child3->id(), $parent->nextChild($child->id())->id()); - } - - public function testPreviousChild() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - - $this->assertEquals($child->id(), $parent->previousChild($child2->id())->id()); - } - - public function testPreviousChildWithRemove() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child3 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - $parent->addChild($child3); - - $parent->removeChild($child2->id()); - $this->assertEquals($child->id(), $parent->previousChild($child3->id())->id()); - } - - public function testFirstChild() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child3 = new 
Node; - $parent->addChild($child); - $parent->addChild($child2); - $parent->addChild($child3); - - $this->assertEquals($child->id(), $parent->firstChild()->id()); - } - - public function testLastChild() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child3 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - $parent->addChild($child3); - - $this->assertEquals($child3->id(), $parent->lastChild()->id()); - } - - public function testReplaceChild() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $child3 = new Node; - $parent->addChild($child); - $parent->addChild($child2); - $parent->replaceChild($child->id(), $child3); - - $this->assertFalse($parent->isChild($child->id())); - } - - /** - * @expectedException PHPHtmlParser\Exceptions\CircularException - */ - public function testSetParentDescendantException() - { - $parent = new Node; - $child = new Node; - $parent->addChild($child); - $parent->setParent($child); - } - - /** - * @expectedException PHPHtmlParser\Exceptions\CircularException - */ - public function testAddChildAncestorException() - { - $parent = new Node; - $child = new Node; - $parent->addChild($child); - $child->addChild($parent); - } - - /** - * @expectedException PHPHtmlParser\Exceptions\CircularException - */ - public function testAddItselfAsChild() - { - $parent = new Node; - $parent->addChild($parent); - } - - - public function testIsAncestorParent() - { - $parent = new Node; - $child = new Node; - $parent->addChild($child); - $this->assertTrue($child->isAncestor($parent->id())); - } - - public function testGetAncestor() - { - $parent = new Node; - $child = new Node; - $parent->addChild($child); - $ancestor = $child->getAncestor($parent->id()); - $this->assertEquals($parent->id(), $ancestor->id()); - } - - public function testGetGreatAncestor() - { - $parent = new Node; - $child = new Node; - $child2 = new Node; - $parent->addChild($child); - $child->addChild($child2); - $ancestor = 
$child2->getAncestor($parent->id()); - $this->assertEquals($parent->id(), $ancestor->id()); - } - - public function testGetAncestorNotFound() - { - $parent = new Node; - $ancestor = $parent->getAncestor(1); - $this->assertNull($ancestor); - } + public function testHasChild() + { + $parent = new Node; + $child = new Node; + $parent->addChild($child); + $this->assertTrue($parent->hasChildren()); + } + + public function testHasChildNoChildren() + { + $node = new Node; + $this->assertFalse($node->hasChildren()); + } + + public function testAddChild() + { + $parent = new Node; + $child = new Node; + $this->assertTrue($parent->addChild($child)); + } + + public function testAddChildTwoParent() + { + $parent = new Node; + $parent2 = new Node; + $child = new Node; + $parent->addChild($child); + $parent2->addChild($child); + $this->assertFalse($parent->hasChildren()); + } + + public function testGetChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + $this->assertTrue($parent->getChild($child2->id()) instanceof Node); + } + + public function testRemoveChild() + { + $parent = new Node; + $child = new Node; + $parent->addChild($child); + $parent->removeChild($child->id()); + $this->assertFalse($parent->hasChildren()); + } + + public function testRemoveChildNotExists() + { + $parent = new Node; + $parent->removeChild(1); + $this->assertFalse($parent->hasChildren()); + } + + public function testNextChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + + $this->assertEquals($child2->id(), $parent->nextChild($child->id())->id()); + } + + public function testNextChildWithRemove() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + $parent->addChild($child3); + + $parent->removeChild($child2->id()); + 
$this->assertEquals($child3->id(), $parent->nextChild($child->id())->id()); + } + + public function testPreviousChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + + $this->assertEquals($child->id(), $parent->previousChild($child2->id())->id()); + } + + public function testPreviousChildWithRemove() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + $parent->addChild($child3); + + $parent->removeChild($child2->id()); + $this->assertEquals($child->id(), $parent->previousChild($child3->id())->id()); + } + + public function testFirstChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + $parent->addChild($child3); + + $this->assertEquals($child->id(), $parent->firstChild()->id()); + } + + public function testLastChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + $parent->addChild($child3); + + $this->assertEquals($child3->id(), $parent->lastChild()->id()); + } + + public function testReplaceChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + $parent->replaceChild($child->id(), $child3); + + $this->assertFalse($parent->isChild($child->id())); + } + + /** + * @expectedException PHPHtmlParser\Exceptions\CircularException + */ + public function testSetParentDescendantException() + { + $parent = new Node; + $child = new Node; + $parent->addChild($child); + $parent->setParent($child); + } + + /** + * @expectedException PHPHtmlParser\Exceptions\CircularException + */ + public function testAddChildAncestorException() + { + $parent = new Node; + $child = new Node; + 
$parent->addChild($child); + $child->addChild($parent); + } + + /** + * @expectedException PHPHtmlParser\Exceptions\CircularException + */ + public function testAddItselfAsChild() + { + $parent = new Node; + $parent->addChild($parent); + } + + + public function testIsAncestorParent() + { + $parent = new Node; + $child = new Node; + $parent->addChild($child); + $this->assertTrue($child->isAncestor($parent->id())); + } + + public function testGetAncestor() + { + $parent = new Node; + $child = new Node; + $parent->addChild($child); + $ancestor = $child->getAncestor($parent->id()); + $this->assertEquals($parent->id(), $ancestor->id()); + } + + public function testGetGreatAncestor() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $parent->addChild($child); + $child->addChild($child2); + $ancestor = $child2->getAncestor($parent->id()); + $this->assertEquals($parent->id(), $ancestor->id()); + } + + public function testGetAncestorNotFound() + { + $parent = new Node; + $ancestor = $parent->getAncestor(1); + $this->assertNull($ancestor); + } } diff --git a/tests/Node/TagTest.php b/tests/Node/TagTest.php index 7987d71f..b3a528db 100644 --- a/tests/Node/TagTest.php +++ b/tests/Node/TagTest.php @@ -4,153 +4,153 @@ class NodeTagTest extends PHPUnit_Framework_TestCase { - public function testSelfClosing() - { - $tag = new Tag('a'); - $tag->selfClosing(); - $this->assertTrue($tag->isSelfClosing()); - } - - public function testSetAttributes() - { - $attr = [ - 'href' => [ - 'value' => 'http://google.com', - 'doublequote' => false, - ], - ]; - - $tag = new Tag('a'); - $tag->setAttributes($attr); - $this->assertEquals('http://google.com', $tag->getAttribute('href')['value']); - } - - public function testRemoveAttribute() - { - $tag = new Tag('a'); - $tag->setAttribute('href', 'http://google.com'); - $tag->removeAttribute('href'); - $this->assertNull($tag->getAttribute('href')['value']); - } - - public function testRemoveAllAttributes() - { - $attr = [ - 'class' 
=> [ - 'value' => 'clear-fix', - 'doubleQuote' => true, - ], - ]; - - $tag = new Tag('a'); - $tag->setAttribute('href', 'http://google.com'); - $tag->setAttribute('class', $attr); - $tag->removeAllAttributes(); - $this->assertEquals(0, count($tag->getAttributes())); - } - - public function testSetAttributeNoArray() - { - $tag = new Tag('a'); - $tag->setAttribute('href', 'http://google.com'); - $this->assertEquals('http://google.com', $tag->getAttribute('href')['value']); - } - - public function testSetAttributesNoDoubleArray() - { - $attr = [ - 'href' => 'http://google.com', - 'class' => 'funtimes', - ]; - - $tag = new Tag('a'); - $tag->setAttributes($attr); - $this->assertEquals('funtimes', $tag->class['value']); - } - - public function testNoise() - { - $tag = new Tag('a'); - $this->assertTrue($tag->noise('noise') instanceof Tag); - } - - public function testGetAttributeMagic() - { - $attr = [ - 'href' => [ - 'value' => 'http://google.com', - 'doublequote' => false, - ], - ]; - - $tag = new Tag('a'); - $tag->setAttributes($attr); - $this->assertEquals('http://google.com', $tag->href['value']); - } - - public function testSetAttributeMagic() - { - $tag = new Tag('a'); - $tag->href = 'http://google.com'; - $this->assertEquals('http://google.com', $tag->href['value']); - } - - public function testMakeOpeningTag() - { - $attr = [ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => true, - ], - ]; - - $tag = new Tag('a'); - $tag->setAttributes($attr); - $this->assertEquals('', $tag->makeOpeningTag()); - } - - public function testMakeOpeningTagEmptyAttr() - { - $attr = [ - 'href' => [ - 'value' => 'http://google.com', - 'doubleQuote' => true, - ], - ]; - - $tag = new Tag('a'); - $tag->setAttributes($attr); - $tag->selected = [ - 'value' => null, - ]; - $this->assertEquals('', $tag->makeOpeningTag()); - } - - public function testMakeOpeningTagSelfClosing() - { - $attr = [ - 'class' => [ - 'value' => 'clear-fix', - 'doubleQuote' => true, - ], - ]; - - $tag 
= new Tag('div'); - $tag->selfClosing() - ->setAttributes($attr); - $this->assertEquals('
', $tag->makeOpeningTag()); - } - - public function testMakeClosingTag() - { - $tag = new Tag('a'); - $this->assertEquals('', $tag->makeClosingTag()); - } - - public function testMakeClosingTagSelfClosing() - { - $tag = new Tag('div'); - $tag->selfClosing(); - $this->assertEmpty($tag->makeClosingTag()); - } + public function testSelfClosing() + { + $tag = new Tag('a'); + $tag->selfClosing(); + $this->assertTrue($tag->isSelfClosing()); + } + + public function testSetAttributes() + { + $attr = [ + 'href' => [ + 'value' => 'http://google.com', + 'doublequote' => false, + ], + ]; + + $tag = new Tag('a'); + $tag->setAttributes($attr); + $this->assertEquals('http://google.com', $tag->getAttribute('href')['value']); + } + + public function testRemoveAttribute() + { + $tag = new Tag('a'); + $tag->setAttribute('href', 'http://google.com'); + $tag->removeAttribute('href'); + $this->assertNull($tag->getAttribute('href')['value']); + } + + public function testRemoveAllAttributes() + { + $attr = [ + 'class' => [ + 'value' => 'clear-fix', + 'doubleQuote' => true, + ], + ]; + + $tag = new Tag('a'); + $tag->setAttribute('href', 'http://google.com'); + $tag->setAttribute('class', $attr); + $tag->removeAllAttributes(); + $this->assertEquals(0, count($tag->getAttributes())); + } + + public function testSetAttributeNoArray() + { + $tag = new Tag('a'); + $tag->setAttribute('href', 'http://google.com'); + $this->assertEquals('http://google.com', $tag->getAttribute('href')['value']); + } + + public function testSetAttributesNoDoubleArray() + { + $attr = [ + 'href' => 'http://google.com', + 'class' => 'funtimes', + ]; + + $tag = new Tag('a'); + $tag->setAttributes($attr); + $this->assertEquals('funtimes', $tag->class['value']); + } + + public function testNoise() + { + $tag = new Tag('a'); + $this->assertTrue($tag->noise('noise') instanceof Tag); + } + + public function testGetAttributeMagic() + { + $attr = [ + 'href' => [ + 'value' => 'http://google.com', + 'doublequote' => false, + ], + 
]; + + $tag = new Tag('a'); + $tag->setAttributes($attr); + $this->assertEquals('http://google.com', $tag->href['value']); + } + + public function testSetAttributeMagic() + { + $tag = new Tag('a'); + $tag->href = 'http://google.com'; + $this->assertEquals('http://google.com', $tag->href['value']); + } + + public function testMakeOpeningTag() + { + $attr = [ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => true, + ], + ]; + + $tag = new Tag('a'); + $tag->setAttributes($attr); + $this->assertEquals('', $tag->makeOpeningTag()); + } + + public function testMakeOpeningTagEmptyAttr() + { + $attr = [ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => true, + ], + ]; + + $tag = new Tag('a'); + $tag->setAttributes($attr); + $tag->selected = [ + 'value' => null, + ]; + $this->assertEquals('', $tag->makeOpeningTag()); + } + + public function testMakeOpeningTagSelfClosing() + { + $attr = [ + 'class' => [ + 'value' => 'clear-fix', + 'doubleQuote' => true, + ], + ]; + + $tag = new Tag('div'); + $tag->selfClosing() + ->setAttributes($attr); + $this->assertEquals('
', $tag->makeOpeningTag()); + } + + public function testMakeClosingTag() + { + $tag = new Tag('a'); + $this->assertEquals('', $tag->makeClosingTag()); + } + + public function testMakeClosingTagSelfClosing() + { + $tag = new Tag('div'); + $tag->selfClosing(); + $this->assertEmpty($tag->makeClosingTag()); + } } diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 587dadf6..022dc842 100644 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -4,29 +4,29 @@ class NodeTextTest extends PHPUnit_Framework_TestCase { - public function testText() - { - $node = new TextNode('foo bar'); - $this->assertEquals('foo bar', $node->text()); - } + public function testText() + { + $node = new TextNode('foo bar'); + $this->assertEquals('foo bar', $node->text()); + } - public function testGetTag() - { - $node = new TextNode('foo bar'); - $this->assertEquals('text', $node->getTag()->name()); - } + public function testGetTag() + { + $node = new TextNode('foo bar'); + $this->assertEquals('text', $node->getTag()->name()); + } - public function testAncestorByTag() - { - $node = new TextNode('foo bar'); - $text = $node->ancestorByTag('text'); - $this->assertEquals($node, $text); - } + public function testAncestorByTag() + { + $node = new TextNode('foo bar'); + $text = $node->ancestorByTag('text'); + $this->assertEquals($node, $text); + } - public function testPreserveEntity() - { - $node = new TextNode('i'); - $text = $node->innerhtml; - $this->assertEquals('i', $text); - } + public function testPreserveEntity() + { + $node = new TextNode('i'); + $text = $node->innerhtml; + $this->assertEquals('i', $text); + } } diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index ebddf677..44539651 100644 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -4,70 +4,70 @@ class CleanupTest extends PHPUnit_Framework_TestCase { - public function testCleanupInputTrue() - { - $dom = new Dom; - $dom->setOptions([ - 'cleanupInput' => true, 
- ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(0, count($dom->find('style'))); - $this->assertEquals(0, count($dom->find('script'))); - } + public function testCleanupInputTrue() + { + $dom = new Dom; + $dom->setOptions([ + 'cleanupInput' => true, + ]); + $dom->loadFromFile('tests/files/horrible.html'); + $this->assertEquals(0, count($dom->find('style'))); + $this->assertEquals(0, count($dom->find('script'))); + } - public function testCleanupInputFalse() - { - $dom = new Dom; - $dom->setOptions([ - 'cleanupInput' => false, - ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(1, count($dom->find('style'))); - $this->assertEquals(1, count($dom->find('script'))); - } + public function testCleanupInputFalse() + { + $dom = new Dom; + $dom->setOptions([ + 'cleanupInput' => false, + ]); + $dom->loadFromFile('tests/files/horrible.html'); + $this->assertEquals(1, count($dom->find('style'))); + $this->assertEquals(1, count($dom->find('script'))); + } - public function testRemoveStylesTrue() - { - $dom = new Dom; - $dom->setOptions([ - 'removeStyles' => true, - ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(0, count($dom->find('style'))); - } + public function testRemoveStylesTrue() + { + $dom = new Dom; + $dom->setOptions([ + 'removeStyles' => true, + ]); + $dom->loadFromFile('tests/files/horrible.html'); + $this->assertEquals(0, count($dom->find('style'))); + } - public function testRemoveStylesFalse() - { - $dom = new Dom; - $dom->setOptions([ - 'removeStyles' => false, - ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(1, count($dom->find('style'))); - $this->assertEquals('text/css', - $dom->find('style')->getAttribute('type')); - } + public function testRemoveStylesFalse() + { + $dom = new Dom; + $dom->setOptions([ + 'removeStyles' => false, + ]); + $dom->loadFromFile('tests/files/horrible.html'); + $this->assertEquals(1, count($dom->find('style'))); + 
$this->assertEquals('text/css', + $dom->find('style')->getAttribute('type')); + } - public function testRemoveScriptsTrue() - { - $dom = new Dom; - $dom->setOptions([ - 'removeScripts' => true, - ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(0, count($dom->find('script'))); - } + public function testRemoveScriptsTrue() + { + $dom = new Dom; + $dom->setOptions([ + 'removeScripts' => true, + ]); + $dom->loadFromFile('tests/files/horrible.html'); + $this->assertEquals(0, count($dom->find('script'))); + } - public function testRemoveScriptsFalse() - { - $dom = new Dom; - $dom->setOptions([ - 'removeScripts' => false, - ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(1, count($dom->find('script'))); - $this->assertEquals('text/JavaScript', - $dom->find('script')->getAttribute('type')); - } + public function testRemoveScriptsFalse() + { + $dom = new Dom; + $dom->setOptions([ + 'removeScripts' => false, + ]); + $dom->loadFromFile('tests/files/horrible.html'); + $this->assertEquals(1, count($dom->find('script'))); + $this->assertEquals('text/JavaScript', + $dom->find('script')->getAttribute('type')); + } } diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php index 53b1af66..7547b7a8 100644 --- a/tests/Options/PreserveLineBreaks.php +++ b/tests/Options/PreserveLineBreaks.php @@ -1,30 +1,30 @@ -setOptions([ - 'preserveLineBreaks' => true, - ]); - $dom->load("
-
"); - - $this->assertEquals("
\n
", (string) $dom); - } - - public function testPreserveLineBreakBeforeClosingTag() - { - $dom = new Dom; - $dom->setOptions([ - 'preserveLineBreaks' => true, - ]); - $dom->load("
"); - - $this->assertEquals("
", (string) $dom); - } -} +setOptions([ + 'preserveLineBreaks' => true, + ]); + $dom->load("
+
"); + + $this->assertEquals("
\n
", (string) $dom); + } + + public function testPreserveLineBreakBeforeClosingTag() + { + $dom = new Dom; + $dom->setOptions([ + 'preserveLineBreaks' => true, + ]); + $dom->load("
"); + + $this->assertEquals("
", (string) $dom); + } +} diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php index ad01f6b8..516a99cb 100644 --- a/tests/Options/StrictTest.php +++ b/tests/Options/StrictTest.php @@ -5,51 +5,51 @@ class StrictTest extends PHPUnit_Framework_TestCase { - public function testConfigStrict() - { - $dom = new Dom; - $dom->setOptions([ - 'strict' => true, - ]); - $dom->load('

Hey you

Ya you!

'); - $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); - } + public function testConfigStrict() + { + $dom = new Dom; + $dom->setOptions([ + 'strict' => true, + ]); + $dom->load('

Hey you

Ya you!

'); + $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); + } - public function testConfigStrictMissingSelfClosing() - { - $dom = new Dom; - $dom->setOptions([ - 'strict' => true, - ]); - try - { - // should throw an exception - $dom->load('

Hey you


Ya you!

'); - // we should not get here - $this->assertTrue(false); - } - catch (StrictException $e) - { - $this->assertEquals("Tag 'br' is not self closing! (character #31)", $e->getMessage()); - } - } + public function testConfigStrictMissingSelfClosing() + { + $dom = new Dom; + $dom->setOptions([ + 'strict' => true, + ]); + try + { + // should throw an exception + $dom->load('

Hey you


Ya you!

'); + // we should not get here + $this->assertTrue(false); + } + catch (StrictException $e) + { + $this->assertEquals("Tag 'br' is not self closing! (character #31)", $e->getMessage()); + } + } - public function testConfigStrictMissingAttribute() - { - $dom = new Dom; - $dom->setOptions([ - 'strict' => true, - ]); - try - { - // should throw an exception - $dom->load('

Hey you

Ya you!

'); - // we should not get here - $this->assertTrue(false); - } - catch (StrictException $e) - { - $this->assertEquals("Tag 'p' has an attribute 'block' with out a value! (character #22)", $e->getMessage()); - } - } + public function testConfigStrictMissingAttribute() + { + $dom = new Dom; + $dom->setOptions([ + 'strict' => true, + ]); + try + { + // should throw an exception + $dom->load('

Hey you

Ya you!

'); + // we should not get here + $this->assertTrue(false); + } + catch (StrictException $e) + { + $this->assertEquals("Tag 'p' has an attribute 'block' with out a value! (character #22)", $e->getMessage()); + } + } } diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php index c72c3df1..cdafcdcc 100644 --- a/tests/Options/WhitespaceTextNodeTest.php +++ b/tests/Options/WhitespaceTextNodeTest.php @@ -4,25 +4,25 @@ class WhitespaceTextNodeTest extends PHPUnit_Framework_TestCase { - public function testConfigGlobalNoWhitespaceTextNode() - { - $dom = new Dom; - $dom->setOptions([ - 'whitespaceTextNode' => false, - ]); - $dom->load('

Hey you

Ya you!

'); - $this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text); - } + public function testConfigGlobalNoWhitespaceTextNode() + { + $dom = new Dom; + $dom->setOptions([ + 'whitespaceTextNode' => false, + ]); + $dom->load('

Hey you

Ya you!

'); + $this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text); + } - public function testConfigLocalOverride() - { - $dom = new Dom; - $dom->setOptions([ - 'whitespaceTextNode' => false, - ]); - $dom->load('

Hey you

Ya you!

', [ - 'whitespaceTextNode' => true, - ]); - $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); - } + public function testConfigLocalOverride() + { + $dom = new Dom; + $dom->setOptions([ + 'whitespaceTextNode' => false, + ]); + $dom->load('

Hey you

Ya you!

', [ + 'whitespaceTextNode' => true, + ]); + $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); + } } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index b5398c16..12a08eec 100644 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -4,40 +4,40 @@ class OptionsTest extends PHPUnit_Framework_TestCase { - public function testDefaultWhitespaceTextNode() - { - $options = new Options; - - $this->assertTrue($options->whitespaceTextNode); - } - - public function testAddingOption() - { - $options = new Options; - $options->setOptions([ - 'test' => true, - ]); - - $this->assertTrue($options->test); - } - - public function testAddingOver() - { - $options = new Options; - $options->setOptions([ - 'test' => false, - ])->setOptions([ - 'test' => true, - 'whitespaceTextNode' => false, - ]); - - $this->assertFalse($options->get('whitespaceTextNode')); - } - - public function testGettingNoOption() - { - $options = new Options; - $this->assertEquals(null, $options->get('doesnotexist')); - } + public function testDefaultWhitespaceTextNode() + { + $options = new Options; + + $this->assertTrue($options->whitespaceTextNode); + } + + public function testAddingOption() + { + $options = new Options; + $options->setOptions([ + 'test' => true, + ]); + + $this->assertTrue($options->test); + } + + public function testAddingOver() + { + $options = new Options; + $options->setOptions([ + 'test' => false, + ])->setOptions([ + 'test' => true, + 'whitespaceTextNode' => false, + ]); + + $this->assertFalse($options->get('whitespaceTextNode')); + } + + public function testGettingNoOption() + { + $options = new Options; + $this->assertEquals(null, $options->get('doesnotexist')); + } } diff --git a/tests/SelectorTest.php b/tests/SelectorTest.php index fe06568a..1b31afa0 100644 --- a/tests/SelectorTest.php +++ b/tests/SelectorTest.php @@ -5,202 +5,202 @@ use PHPHtmlParser\Dom\Tag; class SelectorTest extends PHPUnit_Framework_TestCase { - - public function 
testParseSelectorStringId() - { - $selector = new Selector('#all'); - $selectors = $selector->getSelectors(); - $this->assertEquals('id', $selectors[0][0]['key']); - } - - public function testParseSelectorStringClass() - { - $selector = new Selector('div.post'); - $selectors = $selector->getSelectors(); - $this->assertEquals('class', $selectors[0][0]['key']); - } - - public function testParseSelectorStringAttribute() - { - $selector = new Selector('div[visible=yes]'); - $selectors = $selector->getSelectors(); - $this->assertEquals('yes', $selectors[0][0]['value']); - } - - public function testParseSelectorStringNoKey() - { - $selector = new Selector('div[!visible]'); - $selectors = $selector->getSelectors(); - $this->assertTrue($selectors[0][0]['noKey']); - } - - public function testFind() - { - $root = new HtmlNode('root'); - $parent = new HtmlNode('div'); - $child1 = new HtmlNode('a'); - $child2 = new HtmlNode('p'); - $parent->addChild($child1); - $parent->addChild($child2); - $root->addChild($parent); - - $selector = new Selector('div a'); - $this->assertEquals($child1->id(), $selector->find($root)[0]->id()); - } - - public function testFindId() - { - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('a')); - $child2 = new HtmlNode(new Tag('p')); - $child2->getTag()->setAttributes([ - 'id' => [ - 'value' => 'content', - 'doubleQuote' => true, - ], - ]); - $parent->addChild($child1); - $parent->addChild($child2); - - $selector = new Selector('#content'); - $this->assertEquals($child2->id(), $selector->find($parent)[0]->id()); - } - - public function testFindClass() - { - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('a')); - $child2 = new HtmlNode(new Tag('p')); - $child3 = new HtmlNode('a'); - $child3->getTag()->setAttributes([ - 'class' => [ - 'value' => 'link', - 'doubleQuote' => true, - ], - ]); - $parent->addChild($child1); - $parent->addChild($child2); - $parent->addChild($child3); - - $selector = new 
Selector('.link'); - $this->assertEquals($child3->id(), $selector->find($parent)[0]->id()); - } - - public function testFindClassMultiple() - { - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('a')); - $child2 = new HtmlNode(new Tag('p')); - $child3 = new HtmlNode(new Tag('a')); - $child3->getTag()->setAttributes([ - 'class' => [ - 'value' => 'link outer', - 'doubleQuote' => false, - ], - ]); - $parent->addChild($child1); - $parent->addChild($child2); - $parent->addChild($child3); - - $selector = new Selector('.outer'); - $this->assertEquals($child3->id(), $selector->find($parent)[0]->id()); - } - - public function testFindWild() - { - $root = new HtmlNode(new Tag('root')); - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('a')); - $child2 = new HtmlNode(new Tag('p')); - $child3 = new HtmlNode(new Tag('a')); - $root->addChild($parent); - $parent->addChild($child1); - $parent->addChild($child2); - $child2->addChild($child3); - - $selector = new Selector('div * a'); - $this->assertEquals($child3->id(), $selector->find($root)[0]->id()); - } - - public function testFindMultipleSelectors() - { - $root = new HtmlNode(new Tag('root')); - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('a')); - $child2 = new HtmlNode(new Tag('p')); - $child3 = new HtmlNode(new Tag('a')); - $root->addChild($parent); - $parent->addChild($child1); - $parent->addChild($child2); - $child2->addChild($child3); - - $selector = new Selector('a, p'); - $this->assertEquals(3, count($selector->find($root))); - } - - public function testFindXpathKeySelector() - { - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('a')); - $child2 = new HtmlNode(new Tag('p')); - $child3 = new HtmlNode(new Tag('a')); - $child3->getTag()->setAttributes([ - 'class' => [ - 'value' => 'link outer', - 'doubleQuote' => false, - ], - ]); - $parent->addChild($child1); - $parent->addChild($child2); - $parent->addChild($child3); - - 
$selector = new Selector('div[1]'); - $this->assertEquals($parent->id(), $selector->find($parent)[0]->id()); - } - - public function testFindChildMultipleLevelsDeep() - { - $root = new HtmlNode(new Tag('root')); - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('ul')); - $child2 = new HtmlNode(new Tag('li')); - $root->addChild($parent); - $parent->addChild($child1); - $child1->addChild($child2); - - $selector = new Selector('div li'); - $this->assertEquals(1, count($selector->find($root))); - } - - public function testFindAllChildren() - { - $root = new HtmlNode(new Tag('root')); - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('ul')); - $child2 = new HtmlNode(new Tag('span')); - $child3 = new HtmlNode(new Tag('ul')); - $root->addChild($parent); - $parent->addChild($child1); - $child2->addChild($child3); - $parent->addChild($child2); - - $selector = new Selector('div ul'); - $this->assertEquals(2, count($selector->find($root))); - } - - public function testFindChildUsingChildSelector() - { - $root = new HtmlNode(new Tag('root')); - $parent = new HtmlNode(new Tag('div')); - $child1 = new HtmlNode(new Tag('ul')); - $child2 = new HtmlNode(new Tag('span')); - $child3 = new HtmlNode(new Tag('ul')); - $root->addChild($parent); - $parent->addChild($child1); - $child2->addChild($child3); - $parent->addChild($child2); - - $selector = new Selector('div > ul'); - $this->assertEquals(1, count($selector->find($root))); - } + + public function testParseSelectorStringId() + { + $selector = new Selector('#all'); + $selectors = $selector->getSelectors(); + $this->assertEquals('id', $selectors[0][0]['key']); + } + + public function testParseSelectorStringClass() + { + $selector = new Selector('div.post'); + $selectors = $selector->getSelectors(); + $this->assertEquals('class', $selectors[0][0]['key']); + } + + public function testParseSelectorStringAttribute() + { + $selector = new Selector('div[visible=yes]'); + $selectors = 
$selector->getSelectors(); + $this->assertEquals('yes', $selectors[0][0]['value']); + } + + public function testParseSelectorStringNoKey() + { + $selector = new Selector('div[!visible]'); + $selectors = $selector->getSelectors(); + $this->assertTrue($selectors[0][0]['noKey']); + } + + public function testFind() + { + $root = new HtmlNode('root'); + $parent = new HtmlNode('div'); + $child1 = new HtmlNode('a'); + $child2 = new HtmlNode('p'); + $parent->addChild($child1); + $parent->addChild($child2); + $root->addChild($parent); + + $selector = new Selector('div a'); + $this->assertEquals($child1->id(), $selector->find($root)[0]->id()); + } + + public function testFindId() + { + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('a')); + $child2 = new HtmlNode(new Tag('p')); + $child2->getTag()->setAttributes([ + 'id' => [ + 'value' => 'content', + 'doubleQuote' => true, + ], + ]); + $parent->addChild($child1); + $parent->addChild($child2); + + $selector = new Selector('#content'); + $this->assertEquals($child2->id(), $selector->find($parent)[0]->id()); + } + + public function testFindClass() + { + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('a')); + $child2 = new HtmlNode(new Tag('p')); + $child3 = new HtmlNode('a'); + $child3->getTag()->setAttributes([ + 'class' => [ + 'value' => 'link', + 'doubleQuote' => true, + ], + ]); + $parent->addChild($child1); + $parent->addChild($child2); + $parent->addChild($child3); + + $selector = new Selector('.link'); + $this->assertEquals($child3->id(), $selector->find($parent)[0]->id()); + } + + public function testFindClassMultiple() + { + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('a')); + $child2 = new HtmlNode(new Tag('p')); + $child3 = new HtmlNode(new Tag('a')); + $child3->getTag()->setAttributes([ + 'class' => [ + 'value' => 'link outer', + 'doubleQuote' => false, + ], + ]); + $parent->addChild($child1); + $parent->addChild($child2); + 
$parent->addChild($child3); + + $selector = new Selector('.outer'); + $this->assertEquals($child3->id(), $selector->find($parent)[0]->id()); + } + + public function testFindWild() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('a')); + $child2 = new HtmlNode(new Tag('p')); + $child3 = new HtmlNode(new Tag('a')); + $root->addChild($parent); + $parent->addChild($child1); + $parent->addChild($child2); + $child2->addChild($child3); + + $selector = new Selector('div * a'); + $this->assertEquals($child3->id(), $selector->find($root)[0]->id()); + } + + public function testFindMultipleSelectors() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('a')); + $child2 = new HtmlNode(new Tag('p')); + $child3 = new HtmlNode(new Tag('a')); + $root->addChild($parent); + $parent->addChild($child1); + $parent->addChild($child2); + $child2->addChild($child3); + + $selector = new Selector('a, p'); + $this->assertEquals(3, count($selector->find($root))); + } + + public function testFindXpathKeySelector() + { + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('a')); + $child2 = new HtmlNode(new Tag('p')); + $child3 = new HtmlNode(new Tag('a')); + $child3->getTag()->setAttributes([ + 'class' => [ + 'value' => 'link outer', + 'doubleQuote' => false, + ], + ]); + $parent->addChild($child1); + $parent->addChild($child2); + $parent->addChild($child3); + + $selector = new Selector('div[1]'); + $this->assertEquals($parent->id(), $selector->find($parent)[0]->id()); + } + + public function testFindChildMultipleLevelsDeep() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('ul')); + $child2 = new HtmlNode(new Tag('li')); + $root->addChild($parent); + $parent->addChild($child1); + $child1->addChild($child2); + + $selector = new Selector('div li'); + 
$this->assertEquals(1, count($selector->find($root))); + } + + public function testFindAllChildren() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('ul')); + $child2 = new HtmlNode(new Tag('span')); + $child3 = new HtmlNode(new Tag('ul')); + $root->addChild($parent); + $parent->addChild($child1); + $child2->addChild($child3); + $parent->addChild($child2); + + $selector = new Selector('div ul'); + $this->assertEquals(2, count($selector->find($root))); + } + + public function testFindChildUsingChildSelector() + { + $root = new HtmlNode(new Tag('root')); + $parent = new HtmlNode(new Tag('div')); + $child1 = new HtmlNode(new Tag('ul')); + $child2 = new HtmlNode(new Tag('span')); + $child3 = new HtmlNode(new Tag('ul')); + $root->addChild($parent); + $parent->addChild($child1); + $child2->addChild($child3); + $parent->addChild($child2); + + $selector = new Selector('div > ul'); + $this->assertEquals(1, count($selector->find($root))); + } } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index 737879bd..ad6c218a 100644 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -4,73 +4,73 @@ class StaticDomTest extends PHPUnit_Framework_TestCase { - public function setUp() - { - StaticDom::mount(); - } + public function setUp() + { + StaticDom::mount(); + } - public function tearDown() - { - StaticDom::unload(); - } + public function tearDown() + { + StaticDom::unload(); + } - public function testMountWithDom() - { - $dom = new PHPHtmlParser\Dom; - StaticDom::unload(); - $status = StaticDom::mount('newDom', $dom); - $this->assertTrue($status); - } + public function testMountWithDom() + { + $dom = new PHPHtmlParser\Dom; + StaticDom::unload(); + $status = StaticDom::mount('newDom', $dom); + $this->assertTrue($status); + } - public function testLoad() - { - $dom = Dom::load('

Hey bro, click here
:)

'); - $div = $dom->find('div', 0); - $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); - } + public function testLoad() + { + $dom = Dom::load('

Hey bro, click here
:)

'); + $div = $dom->find('div', 0); + $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); + } - public function testLoadWithFile() - { - $dom = Dom::load('tests/files/small.html'); - $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); - } + public function testLoadWithFile() + { + $dom = Dom::load('tests/files/small.html'); + $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); + } - public function testLoadFromFile() - { - $dom = Dom::loadFromFile('tests/files/small.html'); - $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); - } + public function testLoadFromFile() + { + $dom = Dom::loadFromFile('tests/files/small.html'); + $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); + } - public function testFind() - { - Dom::load('tests/files/horrible.html'); - $this->assertEquals('', Dom::find('table input', 1)->outerHtml); - } + public function testFind() + { + Dom::load('tests/files/horrible.html'); + $this->assertEquals('', Dom::find('table input', 1)->outerHtml); + } - /** - * @expectedException PHPHtmlParser\Exceptions\NotLoadedException - */ - public function testFindNoLoad() - { - Dom::find('.post-user font', 0); - } + /** + * @expectedException PHPHtmlParser\Exceptions\NotLoadedException + */ + public function testFindNoLoad() + { + Dom::find('.post-user font', 0); + } - public function testFindI() - { - Dom::load('tests/files/horrible.html'); - $this->assertEquals('[ Досие бр:12928 ]', Dom::find('i')[0]->innerHtml); - } + public function testFindI() + { + Dom::load('tests/files/horrible.html'); + $this->assertEquals('[ Досие бр:12928 ]', Dom::find('i')[0]->innerHtml); + } - public function testLoadFromUrl() - { - $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); - $curl->shouldReceive('get') - ->once() - ->with('http://google.com') - ->andReturn(file_get_contents('tests/files/small.html')); + public function testLoadFromUrl() + { + $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); + 
$curl->shouldReceive('get') + ->once() + ->with('http://google.com') + ->andReturn(file_get_contents('tests/files/small.html')); - Dom::loadFromUrl('http://google.com', [], $curl); - $this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text); - } + Dom::loadFromUrl('http://google.com', [], $curl); + $this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text); + } } diff --git a/tests/files/horrible.html b/tests/files/horrible.html index c5430be1..f47c8059 100644 --- a/tests/files/horrible.html +++ b/tests/files/horrible.html @@ -1,301 +1,301 @@ - - -МАРнет - - - - - - - - - - -

- - 0-9 A - B C - D E - F G - H I - J K - L M - N O - P Q - R S - T U - V X Y - W Z - -

- - - - - - - - -
- - -
- -
- : - - -
-
-
- - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
: marnet.mk [ :12928 ]
-
-
: 22-05-2014
-
-
-

:

-
-
22-05-2008
-
-

:

-
-
-
-

:

-
-
. .17 ϣ
-
-

:

-
-
4080011519278
-
-

:

-
-
02/3256-561
  
-
-

:

-
-
-
-

e-mail:

-
-
domains@marnet.net.mk
-
-

:

-
-
//
-
-

:

-
-
-
-

e-mail:

-
-
domains@marnet.net.mk
-
-

:

-
-
//
IP
nsg.mio.gov.mk80.77.151.251
kitka.marnet.net.mk194.149.131.2
- - - -
- - - -
+ + +МАРнет + + + + + + + + + + +

+ + 0-9 A + B C + D E + F G + H I + J K + L M + N O + P Q + R S + T U + V X Y + W Z + +

+ + + + + + + + +
+ + +
+ +
+ : + + +
+
+
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
: marnet.mk [ :12928 ]
+
+
: 22-05-2014
+
+
+

:

+
+
22-05-2008
+
+

:

+
+
+
+

:

+
+
. .17 ϣ
+
+

:

+
+
4080011519278
+
+

:

+
+
02/3256-561
  
+
+

:

+
+
+
+

e-mail:

+
+
domains@marnet.net.mk
+
+

:

+
+
//
+
+

:

+
+
+
+

e-mail:

+
+
domains@marnet.net.mk
+
+

:

+
+
//
IP
nsg.mio.gov.mk80.77.151.251
kitka.marnet.net.mk194.149.131.2
+ + + +
+ + + +
From 364c79dfdd9f6c0211c665df006f83f822c7378d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 6 Apr 2016 10:36:49 -0400 Subject: [PATCH 048/200] Added .scrutinizer.yml to repo --- .scrutinizer.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .scrutinizer.yml diff --git a/.scrutinizer.yml b/.scrutinizer.yml new file mode 100644 index 00000000..7c9a4375 --- /dev/null +++ b/.scrutinizer.yml @@ -0,0 +1,35 @@ +filter: + paths: [src/*] + excluded_paths: [tests/*] +checks: + php: + code_rating: true + remove_extra_empty_lines: true + remove_php_closing_tag: true + remove_trailing_whitespace: true + fix_use_statements: + remove_unused: true + preserve_multiple: false + preserve_blanklines: true + order_alphabetically: true + fix_php_opening_tag: true + fix_linefeed: true + fix_line_ending: true + fix_identation_4spaces: true + fix_doc_comments: true +tools: + external_code_coverage: + timeout: 600 + runs: 3 + php_code_coverage: false + php_code_sniffer: + config: + standard: PSR2 + filter: + paths: ['src'] + php_loc: + enabled: true + excluded_dirs: [vendor, test] + php_cpd: + enabled: true + excluded_dirs: [vendor, test] From cfc89d5586b55dba1d8198a1d73626adbe22a7d4 Mon Sep 17 00:00:00 2001 From: Scrutinizer Auto-Fixer Date: Wed, 6 Apr 2016 15:14:16 +0000 Subject: [PATCH 049/200] Scrutinizer Auto-Fixes This commit consists of patches automatically generated for this project on https://scrutinizer-ci.com --- src/PHPHtmlParser/Dom/AbstractNode.php | 2 +- src/PHPHtmlParser/Dom/Collection.php | 2 +- src/PHPHtmlParser/Selector.php | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 86217646..8549a46f 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -1,9 +1,9 @@ Date: Wed, 6 Apr 2016 11:24:21 -0400 Subject: [PATCH 050/200] Version 1.7.0 --- CHANGELOG.md | 17 
+++++++++++++++++ README.md | 2 +- composer.json | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b9e5955..618fb1b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ ### Development +## 1.7.0 + +- Added .scrutinizer.yml to repo +- Reformated code to PSR-1/2 +- Improved the test coverage and some small code changes +- Added removeAttribute and removeAllAttributes tag methods fixes #57 +- Added replaceNode method implements #52 +- Added a delete method. fixes #43 +- Added semicolon after for linebreak preservation. fixes #62 +- Removed code that removed tag fixes #60 +- Added new test related to #63 +- Refactored the nodes into inner and leaf nodes +- Fixed Strings example in README +- Close this header so the markdown will render properly +- Added preserve line break option. Defaults to false. + + ## 1.6.9 - Added Changelog diff --git a/README.md b/README.md index 2e082e45..c160585e 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 1.6.9 - DEV +Version 1.7.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) diff --git a/composer.json b/composer.json index e613e01e..1d554f41 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,7 @@ { "name": "paquettg/php-html-parser", "type": "library", - "version": "1.6.9", + "version": "1.7.0", "description": "An HTML DOM parser. It allows you to manipulate HTML. 
Find tags on an HTML page with selectors just like jQuery.", "keywords": ["html", "dom", "parser"], "homepage": "https://github.com/paquettg/php-html-parser", From 08299c84703278974a444c6d7cd65604f5aebb19 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 6 Apr 2016 12:22:26 -0400 Subject: [PATCH 051/200] Added a .gitattributes file --- .gitattributes | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..3c40333f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,9 @@ +/tests export-ignore +/.scrutinizar.yml export-ignore +/.travis.yml export-ignore +/CHANGELOG.md export-ignore +/CONTRIBUTING.md export-ignore +/LICENSE.md export-ignore +/README.md export-ignore +/phpunit.php export-ignore +/phpunit.xml export-ignore From 5cc34ce06abbfe5966640e08dbbe371fc4eb5b27 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 6 Apr 2016 12:22:52 -0400 Subject: [PATCH 052/200] Updated dependencies to most recent version --- .travis.yml | 2 -- README.md | 2 +- composer.json | 6 +++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9b451b96..c0a4c975 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,6 @@ language: php php: - - 5.4 - - 5.5 - 5.6 - 7.0 - hhvm diff --git a/README.md b/README.md index c160585e..c4a5d7e8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ PHPHtmlParser is a simple, flexible, html parser which allows you to select tags Install ------- -This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 5.4, 5.5, and hhvm 2.3. +This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 5.6, 7.0, and hhvm 2.3. 
Usage ----- diff --git a/composer.json b/composer.json index 1d554f41..c75e0e46 100644 --- a/composer.json +++ b/composer.json @@ -14,12 +14,12 @@ } ], "require": { - "php": ">=5.4", + "php": ">=5.6", "paquettg/string-encode": "~0.1.0" }, "require-dev": { - "phpunit/phpunit": "~4.8.0", - "satooshi/php-coveralls": "~0.6.0", + "phpunit/phpunit": "~5.3.0", + "satooshi/php-coveralls": "~1.0.0", "mockery/mockery": "~0.9.0" }, "autoload": { From 96634727ec973ee0a57c5060139bfd747f079b67 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 6 Apr 2016 12:42:15 -0400 Subject: [PATCH 053/200] Updated travis.yml to support scrutinizer --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index c0a4c975..28422ddb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,3 +15,5 @@ script: after_script: - php vendor/bin/coveralls + - wget https://scrutinizer-ci.com/ocular.phar + - php ocular.phar code-coverage:upload --format=php-clover build/logs/clover.xml From 57101798707405ff1405c6e2f24cbffc3c791082 Mon Sep 17 00:00:00 2001 From: Paris Holley Date: Tue, 3 May 2016 12:11:36 -0700 Subject: [PATCH 054/200] fixes #69 --- src/PHPHtmlParser/Dom/InnerNode.php | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 6f458460..34ac6895 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -235,13 +235,11 @@ public function isChild($id) */ public function replaceChild($childId, AbstractNode $newChild) { - $oldChild = $this->getChild($childId); - $keys = array_keys($this->children); - $index = array_search($childId, $keys, true); - $keys[$index] = $newChild->id(); - $this->children = array_combine($keys, $this->children); - $this->children[$newChild->id()] = $newChild; - unset($oldChild); + $oldChild = $this->children[$childId]; + unset($oldChild['node']); + $oldChild['node'] = $newChild; + 
unset($this->children[$childId]); + $this->children[$newChild->id()] = $oldChild; } /** From 99940d905a6dbddcd2b416a6930e17cafbfe379a Mon Sep 17 00:00:00 2001 From: Paris Holley Date: Sun, 8 May 2016 00:30:52 -0700 Subject: [PATCH 055/200] fix another case --- src/PHPHtmlParser/Dom/InnerNode.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 34ac6895..c9b0a725 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -236,10 +236,20 @@ public function isChild($id) public function replaceChild($childId, AbstractNode $newChild) { $oldChild = $this->children[$childId]; + $newChild->prev = $oldChild['prev']; + $newChild->next = $oldChild['next']; unset($oldChild['node']); $oldChild['node'] = $newChild; unset($this->children[$childId]); $this->children[$newChild->id()] = $oldChild; + + if ($newChild->prev && isset($this->children[$newChild->prev])) { + $this->children[$newChild->prev]['next'] = $newChild->id(); + } + + if ($newChild->next && isset($this->children[$newChild->next])) { + $this->children[$newChild->next]['prev'] = $newChild->id(); + } } /** From 57df1b278eb4e3f16d82fcae7be503a563164bcc Mon Sep 17 00:00:00 2001 From: Paris Holley Date: Wed, 11 May 2016 08:25:40 -0700 Subject: [PATCH 056/200] found use case where my way didnt work, merging origin style with fixes needed to work properly --- src/PHPHtmlParser/Dom/InnerNode.php | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index c9b0a725..0c7fc9b7 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -236,19 +236,26 @@ public function isChild($id) public function replaceChild($childId, AbstractNode $newChild) { $oldChild = $this->children[$childId]; + $newChild->prev = $oldChild['prev']; $newChild->next = $oldChild['next']; - 
unset($oldChild['node']); - $oldChild['node'] = $newChild; - unset($this->children[$childId]); - $this->children[$newChild->id()] = $oldChild; - if ($newChild->prev && isset($this->children[$newChild->prev])) { - $this->children[$newChild->prev]['next'] = $newChild->id(); + $keys = array_keys($this->children); + $index = array_search($childId, $keys, true); + $keys[$index] = $newChild->id(); + $this->children = array_combine($keys, $this->children); + $this->children[$newChild->id()] = array( + 'prev' => $oldChild['prev'], + 'node' => $newChild, + 'next' => $oldChild['next'] + ); + + if ($oldChild['prev'] && isset($this->children[$newChild->prev])) { + $this->children[$oldChild['prev']]['next'] = $newChild->id(); } - if ($newChild->next && isset($this->children[$newChild->next])) { - $this->children[$newChild->next]['prev'] = $newChild->id(); + if ($oldChild['next'] && isset($this->children[$newChild->next])) { + $this->children[$oldChild['next']]['prev'] = $newChild->id(); } } From 4128266e160e3f1ec40ade0f401d5b504760ce57 Mon Sep 17 00:00:00 2001 From: Paris Holley Date: Mon, 16 May 2016 14:43:47 -0700 Subject: [PATCH 057/200] ability to insert before and after --- .gitignore | 3 + src/PHPHtmlParser/Dom/InnerNode.php | 87 ++++++++++++++++++++++++----- tests/Node/ParentTest.php | 72 ++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 14 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..5a2f04bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +composer.lock +vendor +.idea/ diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 6f458460..70c545b2 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -104,7 +104,7 @@ public function countChildren() * @return bool * @throws CircularException */ - public function addChild(AbstractNode $child) + public function addChild(AbstractNode $child, $before = null) { $key = null; 
@@ -118,22 +118,51 @@ public function addChild(AbstractNode $child) throw new CircularException('Can not set itself as a child.'); } + $next = null; + if ($this->hasChildren()) { - if (isset($this->children[$child->id()])) { - // we already have this child - return false; - } - $sibling = $this->lastChild(); - $key = $sibling->id(); - $this->children[$key]['next'] = $child->id(); + if (isset($this->children[$child->id()])) { + // we already have this child + return false; + } + + if ($before) { + if (!isset($this->children[$before])) { + return false; + } + + $key = $this->children[$before]['prev']; + + if($key){ + $this->children[$key]['next'] = $child->id(); + } + + $this->children[$before]['prev'] = $child->id(); + $next = $before; + } else { + $sibling = $this->lastChild(); + $key = $sibling->id(); + + $this->children[$key]['next'] = $child->id(); + } } - // add the child - $this->children[$child->id()] = [ - 'node' => $child, - 'next' => null, - 'prev' => $key, - ]; + $keys = array_keys($this->children); + + $insert = [ + 'node' => $child, + 'next' => $next, + 'prev' => $key, + ]; + + $index = $key ? 
(array_search($key, $keys, true) + 1) : 0; + array_splice($keys, $index, 0, $child->id()); + + $children = array_values($this->children); + array_splice($children, $index, 0, [$insert]); + + // add the child + $this->children = array_combine($keys, $children); // tell child I am the new parent $child->setParent($this); @@ -144,6 +173,36 @@ public function addChild(AbstractNode $child) return true; } + /** + * Insert element before child with provided id + * + * @param AbstractNode $child + * @return bool + * @param int $id + */ + public function insertBefore(AbstractNode $child, $id){ + $this->addChild($child, $id); + } + + /** + * Insert element before after with provided id + * + * @param AbstractNode $child + * @return bool + * @param int $id + */ + public function insertAfter(AbstractNode $child, $id){ + if (!isset($this->children[$id])) { + return false; + } + + if ($this->children[$id]['next']) { + return $this->addChild($child, $this->children[$id]['next']); + } + + return $this->addChild($child); + } + /** * Removes the child by id. 
* diff --git a/tests/Node/ParentTest.php b/tests/Node/ParentTest.php index 9170c947..c80d2566 100644 --- a/tests/Node/ParentTest.php +++ b/tests/Node/ParentTest.php @@ -137,6 +137,78 @@ public function testLastChild() $this->assertEquals($child3->id(), $parent->lastChild()->id()); } + public function testInsertBeforeFirst() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child2); + $parent->addChild($child3); + + $parent->insertBefore($child, $child2->id()); + + $this->assertTrue($parent->isChild($child->id())); + $this->assertEquals($parent->firstChild()->id(), $child->id()); + $this->assertEquals($child->nextSibling()->id(), $child2->id()); + $this->assertEquals($child2->nextSibling()->id(), $child3->id()); + $this->assertEquals($parent->lastChild()->id(), $child3->id()); + } + + public function testInsertBeforeLast() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child3); + + $parent->insertBefore($child2, $child3->id()); + + $this->assertTrue($parent->isChild($child2->id())); + $this->assertEquals($parent->firstChild()->id(), $child->id()); + $this->assertEquals($child->nextSibling()->id(), $child2->id()); + $this->assertEquals($child2->nextSibling()->id(), $child3->id()); + $this->assertEquals($parent->lastChild()->id(), $child3->id()); + } + + public function testInsertAfterFirst() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child3); + + $parent->insertAfter($child2, $child->id()); + + $this->assertTrue($parent->isChild($child2->id())); + $this->assertEquals($parent->firstChild()->id(), $child->id()); + $this->assertEquals($child->nextSibling()->id(), $child2->id()); + $this->assertEquals($child2->nextSibling()->id(), $child3->id()); + $this->assertEquals($parent->lastChild()->id(), $child3->id()); + } + + 
public function testInsertAfterLast() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $child3 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + + $parent->insertAfter($child3, $child2->id()); + + $this->assertTrue($parent->isChild($child2->id())); + $this->assertEquals($parent->firstChild()->id(), $child->id()); + $this->assertEquals($child->nextSibling()->id(), $child2->id()); + $this->assertEquals($child2->nextSibling()->id(), $child3->id()); + $this->assertEquals($parent->lastChild()->id(), $child3->id()); + } + public function testReplaceChild() { $parent = new Node; From 9bc35dbb9fba57e2c997d1ad947d139f1413daf1 Mon Sep 17 00:00:00 2001 From: Paris Holley Date: Thu, 19 May 2016 12:16:47 -0700 Subject: [PATCH 058/200] support html tags with extra whitespace --- src/PHPHtmlParser/Dom.php | 4 ++-- tests/DomTest.php | 9 +++++++++ tests/files/whitespace.html | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 tests/files/whitespace.html diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f3e17ff9..017ec62c 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -516,8 +516,8 @@ protected function parseTag() } if (empty($name)) { - $this->content->fastForward(1); - continue; + $this->content->skipByToken('blank'); + continue; } $this->content->skipByToken('blank'); diff --git a/tests/DomTest.php b/tests/DomTest.php index 9a13ee95..e1c3d548 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -172,6 +172,15 @@ public function testLoadUtf8() $this->assertEquals('Dzień', $dom->find('p', 0)->text); } + public function testLoadFileWhitespace() + { + $dom = new Dom; + $dom->setOptions(['cleanupInput' => false]); + $dom->loadFromFile('tests/files/whitespace.html'); + $this->assertEquals(1, count($dom->find('.class'))); + $this->assertEquals("", (string)$dom); + } + public function testLoadFileBig() { $dom = new Dom; diff --git a/tests/files/whitespace.html 
b/tests/files/whitespace.html new file mode 100644 index 00000000..b2603edc --- /dev/null +++ b/tests/files/whitespace.html @@ -0,0 +1 @@ + \ No newline at end of file From e1a61e45c900e96e8ea0cc6cb2d32ac7ddbf548f Mon Sep 17 00:00:00 2001 From: Cojad Date: Fri, 7 Oct 2016 15:18:31 +0800 Subject: [PATCH 059/200] update with full list html5 self closing tags html5 spec has a list of void elements (aka self closing tags) https://www.w3.org/TR/html5/syntax.html#void-elements --- src/PHPHtmlParser/Dom.php | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f3e17ff9..323000f9 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -79,15 +79,23 @@ class Dom * @var array */ protected $selfClosing = [ - 'img', + 'area', + 'base', + 'basefont', 'br', + 'col', + 'embed', + 'hr', + 'img', 'input', - 'meta', + 'keygen', 'link', - 'hr', - 'base', - 'embed', + 'meta', + 'param', + 'source', 'spacer', + 'track', + 'wbr' ]; /** From 2091d112afbf362de6bdc061e454c8c44ed96127 Mon Sep 17 00:00:00 2001 From: masterlei Date: Sun, 16 Oct 2016 23:33:38 +0400 Subject: [PATCH 060/200] added browser agent to curl options --- src/PHPHtmlParser/Curl.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php index a6fcb95f..cf688cb9 100644 --- a/src/PHPHtmlParser/Curl.php +++ b/src/PHPHtmlParser/Curl.php @@ -28,6 +28,11 @@ public function get($url) curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_VERBOSE, true); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'); + curl_setopt($ch, CURLOPT_URL, $url); $content = curl_exec($ch); if ($content === false) { From 
4004e5bbef611868921039ac8045ae22d9b0f32e Mon Sep 17 00:00:00 2001 From: "roketyyang@tencent.com" Date: Mon, 17 Oct 2016 12:34:13 +0800 Subject: [PATCH 061/200] fix clear bug --- src/PHPHtmlParser/Dom/AbstractNode.php | 3 --- src/PHPHtmlParser/Dom/HtmlNode.php | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 8549a46f..937f27aa 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -152,9 +152,6 @@ public function setParent(InnerNode $parent) // assign child to parent $this->parent->addChild($this); - //clear any cache - $this->clear(); - return $this; } diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 42b4169c..685dd39c 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -186,6 +186,9 @@ protected function clear() $this->innerHtml = null; $this->outerHtml = null; $this->text = null; + if ( ! is_null($this->parent)) { + $this->parent->clear(); + } } /** From d5c093a8c5ef0dedf1442306423fe64feee3fc83 Mon Sep 17 00:00:00 2001 From: LeoTM Date: Mon, 24 Oct 2016 16:04:59 +0100 Subject: [PATCH 062/200] Update README.md Correct cleanupInput option section. Correct spelling. --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c4a5d7e8..af7e72cc 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Version 1.7.0 [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/paquettg/php-html-parser/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/paquettg/php-html-parser/?branch=master) -PHPHtmlParser is a simple, flexible, html parser which allows you to select tags using any css selector, like jQuery. 
The goal is to assiste in the development of tools which require a quick, easy way to scrap html, whether it's valid or not! This project was original supported by [sunra/php-simple-html-dom-parser](https://github.com/sunra/php-simple-html-dom-parser) but the support seems to have stopped so this project is my adaptation of his previous work. +PHPHtmlParser is a simple, flexible, html parser which allows you to select tags using any css selector, like jQuery. The goal is to assist in the development of tools which require a quick, easy way to scrap html, whether it's valid or not! This project was original supported by [sunra/php-simple-html-dom-parser](https://github.com/sunra/php-simple-html-dom-parser) but the support seems to have stopped so this project is my adaptation of his previous work. Install ------- @@ -35,7 +35,7 @@ The above will output "click here". Simple no? There are many ways to get the sa Loading Files ------------------ -You may also seamlessly load a file into the dom instead of a string, which is much more convinient and is how I except most developers will be loading the html. The following example is taken from our test and uses the "big.html" file found there. +You may also seamlessly load a file into the dom instead of a string, which is much more convenient and is how I except most developers will be loading the html. The following example is taken from our test and uses the "big.html" file found there. ```php // Assuming you installed from Composer: @@ -61,9 +61,9 @@ foreach ($contents as $content) } ``` -This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of methods that a node has avaiable. +This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. 
It also shows a few things you can do with a node but it is not an exhaustive list of methods that a node has available. -Alternativly, you can always use the `load()` method to load the file. It will attempt to find the file using `file_exists` and, if succesfull, will call `loadFromFile()` for you. The same applies to a URL and `loadFromUrl()` method. +Alternativly, you can always use the `load()` method to load the file. It will attempt to find the file using `file_exists` and, if successful, will call `loadFromFile()` for you. The same applies to a URL and `loadFromUrl()` method. Loading Url ---------------- @@ -102,7 +102,7 @@ As long as the Connector object implements the `PHPHtmlParser\CurlInterface` int Loading Strings --------------- -Loading a string directly, with out the checks in `load()` is also easely done. +Loading a string directly, with out the checks in `load()` is also easily done. ```php // Assuming you installed from Composer: @@ -142,7 +142,7 @@ At the moment we support 7 options. **Strict** -Strict, by default false, will throw a `StrickException` if it find that the html is not strict complient (all tags must have a clossing tag, no attribute with out a value, etc.). +Strict, by default false, will throw a `StrickException` if it find that the html is not strictly compliant (all tags must have a closing tag, no attribute with out a value, etc.). **whitespaceTextNode** @@ -150,11 +150,11 @@ The whitespaceTextNode, by default true, option tells the parser to save textnod **enforceEncoding** -The enforceEncoding, by default null, option will enforce an charater set to be used for reading the content and returning the content in that encoding. Setting it to null will trigger an attempt to figure out the encoding from within the content of the string given instead. +The enforceEncoding, by default null, option will enforce an character set to be used for reading the content and returning the content in that encoding. 
Setting it to null will trigger an attempt to figure out the encoding from within the content of the string given instead. **cleanupInput** -Set this to `true` to skip the entire clean up phase of the parser. If this is set to true the next 3 options will be ignored. Defaults to `false`. +Set this to `false` to skip the entire clean up phase of the parser. If this is set to true the next 3 options will be ignored. Defaults to `true`. **removeScripts** From 4cb1619542168aefb44f56abe667d81bd57e8af9 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Sat, 29 Oct 2016 23:26:00 +0300 Subject: [PATCH 063/200] Add additional features: findById, isTextNode, hasNextSibling, hasNextChild, countChildren, getChildren, hasChildren --- src/PHPHtmlParser/Dom.php | 54 +++++++++++++++++++++++++- src/PHPHtmlParser/Dom/AbstractNode.php | 46 ++++++++++++++++++++-- src/PHPHtmlParser/Dom/InnerNode.php | 12 ++++++ src/PHPHtmlParser/Dom/TextNode.php | 6 +++ src/PHPHtmlParser/Finder.php | 54 ++++++++++++++++++++++++++ 5 files changed, 168 insertions(+), 4 deletions(-) create mode 100644 src/PHPHtmlParser/Finder.php diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f3e17ff9..f2253e74 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -219,6 +219,20 @@ public function find($selector, $nth = null) return $this->root->find($selector, $nth); } + /** + * Find element by Id on the root node + * + * @param int $id Element Id + * @return mixed + * + */ + public function findById($id) + { + $this->isLoaded(); + + return $this->root->findById($id); + } + /** * Adds the tag (or tags in an array) to the list of tags that will always * be self closing. 
@@ -291,6 +305,42 @@ public function lastChild() return $this->root->lastChild(); } + /** + * Simple wrapper function that returns count of child elements + * + * @return int + */ + public function countChildren() + { + $this->isLoaded(); + + return $this->root->countChildren(); + } + + /** + * Get array of children + * + * @return array + */ + public function getChildren() + { + $this->isLoaded(); + + return $this->root->getChildren(); + } + + /** + * Check if node have children nodes + * + * @return bool + */ + public function hasChildren() + { + $this->isLoaded(); + + return $this->root->hasChildren(); + } + /** * Simple wrapper function that returns an element by the * id. @@ -391,7 +441,9 @@ protected function clean($str) } // strip out server side scripts - $str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str); + if ($this->options->get('serverSideScriptis') == true){ + $str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str); + } // strip smarty scripts $str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str); diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 8549a46f..6bbb3f2a 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -5,6 +5,7 @@ use PHPHtmlParser\Exceptions\ParentNotFoundException; use PHPHtmlParser\Selector; use stringEncode\Encode; +use PHPHtmlParser\Finder; /** * Dom node object. @@ -17,7 +18,7 @@ */ abstract class AbstractNode { - + private static $count = 0; /** * Contains the tag name/type * @@ -54,11 +55,12 @@ abstract class AbstractNode protected $encode; /** - * Creates a unique spl hash for this node. + * Creates a unique id for this node. */ public function __construct() { - $this->id = spl_object_hash($this); + $this->id = self::$count; + self::$count++; } /** @@ -110,6 +112,11 @@ public function __toString() return $this->outerHtml(); } + public function resetCount() + { + self::$count = 0; + } + /** * Returns the id of this object. 
*/ @@ -220,6 +227,15 @@ public function getAncestor($id) return null; } + public function hasNextSibling() + { + if (is_null($this->parent) || (!$this->parent->hasChildren())) { + return false; + } + + return $this->parent->hasNextChild($this->id()); + } + /** * Attempts to get the next sibling. * @@ -378,6 +394,20 @@ public function find($selector, $nth = null) return $nodes; } + /** + * Find node by id + * + * @param $id + * @return bool|AbstractNode + */ + public function findById($id) + { + $finder= new Finder($id); + + return $finder->find($this); + } + + /** * Gets the inner html of this node. * @@ -407,4 +437,14 @@ abstract public function text(); * @return void */ abstract protected function clear(); + + /** + * Check is node type textNode + * + * @return boolean + */ + public function isTextNode() { + + return false; + } } diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 6f458460..502ae884 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -175,6 +175,18 @@ public function removeChild($id) return $this; } + /** + * Check if has next Child + * + * @param $id childId + * @return mixed + */ + public function hasNextChild($id) + { + $child= $this->getChild($id); + return $this->children[$child->id()]['next']; + } + /** * Attempts to get the next child. 
* diff --git a/src/PHPHtmlParser/Dom/TextNode.php b/src/PHPHtmlParser/Dom/TextNode.php index 0a3d8773..6a0faa8b 100644 --- a/src/PHPHtmlParser/Dom/TextNode.php +++ b/src/PHPHtmlParser/Dom/TextNode.php @@ -102,4 +102,10 @@ protected function clear() { $this->convertedText = null; } + + public function isTextNode() + { + return true; + } + } diff --git a/src/PHPHtmlParser/Finder.php b/src/PHPHtmlParser/Finder.php new file mode 100644 index 00000000..f08b63ed --- /dev/null +++ b/src/PHPHtmlParser/Finder.php @@ -0,0 +1,54 @@ +id = $id; + } + + /** + * + * Find node in tree + * + * @param AbstractNode $node + * @return bool|AbstractNode + */ + public function find(AbstractNode $node) + { + + if (!$node->id()) { + return $this->find($node->firstChild()); + } + + if ($node->id() === $this->id) { + return $node; + } + + if ($node->hasNextSibling()) { + $nextSibling = $node->nextSibling(); + if ($nextSibling->id() === $this->id) { + return $nextSibling; + } + if ($nextSibling->id() > $this->id) { + return $this->find($node->firstChild()); + } + if ($nextSibling->id() < $this->id) { + return $this->find($nextSibling->firstChild()); + } + } + + return false; + } + +} \ No newline at end of file From 1db5d92876ae00ae6568943d80d14b065b5f95e7 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Sun, 30 Oct 2016 21:12:23 +0200 Subject: [PATCH 064/200] Update finder engine --- src/PHPHtmlParser/Finder.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Finder.php b/src/PHPHtmlParser/Finder.php index f08b63ed..6517c6ff 100644 --- a/src/PHPHtmlParser/Finder.php +++ b/src/PHPHtmlParser/Finder.php @@ -44,8 +44,10 @@ public function find(AbstractNode $node) return $this->find($node->firstChild()); } if ($nextSibling->id() < $this->id) { - return $this->find($nextSibling->firstChild()); + return $this->find($nextSibling); } + } else { + return $this->find($node->firstChild()); } return false; From 39715da7757535a0078644147bbc27d450a5196c Mon Sep 
17 00:00:00 2001 From: Andrii Lytvynenko Date: Mon, 31 Oct 2016 00:22:45 +0200 Subject: [PATCH 065/200] Update logic to reset nodes counter --- src/PHPHtmlParser/Dom.php | 1 + src/PHPHtmlParser/Dom/AbstractNode.php | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f2253e74..eec60f1e 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -120,6 +120,7 @@ public function __get($name) */ public function load($str, $options = []) { + AbstractNode::resetCount(); // check if it's a file if (strpos($str, "\n") === false && is_file($str)) { return $this->loadFromFile($str, $options); diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 6bbb3f2a..33beeda3 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -112,7 +112,10 @@ public function __toString() return $this->outerHtml(); } - public function resetCount() + /** + * Reset node counter + */ + public static function resetCount() { self::$count = 0; } From 2e23256facf4c050b9ba4e3e41605a441eced137 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Mon, 31 Oct 2016 02:30:08 +0200 Subject: [PATCH 066/200] Update finder logic --- src/PHPHtmlParser/Finder.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Finder.php b/src/PHPHtmlParser/Finder.php index 6517c6ff..e70cd0fb 100644 --- a/src/PHPHtmlParser/Finder.php +++ b/src/PHPHtmlParser/Finder.php @@ -31,13 +31,13 @@ public function find(AbstractNode $node) return $this->find($node->firstChild()); } - if ($node->id() === $this->id) { + if ($node->id() == $this->id) { return $node; } if ($node->hasNextSibling()) { $nextSibling = $node->nextSibling(); - if ($nextSibling->id() === $this->id) { + if ($nextSibling->id() == $this->id) { return $nextSibling; } if ($nextSibling->id() > $this->id) { From 802baecf51a852376b468f19c4a41b3cb311e02b Mon Sep 
17 00:00:00 2001 From: Andrii Lytvynenko Date: Mon, 31 Oct 2016 02:34:40 +0200 Subject: [PATCH 067/200] Add text setter to TextNode --- src/PHPHtmlParser/Dom/TextNode.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/PHPHtmlParser/Dom/TextNode.php b/src/PHPHtmlParser/Dom/TextNode.php index 6a0faa8b..549c96b7 100644 --- a/src/PHPHtmlParser/Dom/TextNode.php +++ b/src/PHPHtmlParser/Dom/TextNode.php @@ -108,4 +108,9 @@ public function isTextNode() return true; } + public function setText($text) + { + $this->text = $text; + } + } From ab88ba23bc3a28f7a36cd15c8901e8351e779649 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Mon, 31 Oct 2016 13:51:19 +0200 Subject: [PATCH 068/200] Add getStyle method to tag class --- src/PHPHtmlParser/Dom/Tag.php | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 43659556..57052457 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -225,6 +225,30 @@ public function getAttribute($key) return $this->attr[$key]; } + /** + * Return style array + * + * @return null|array + */ + public function getStyle() + { + $style = trim($this->getAttribute('style')); + + if ($style) { + $style_attr = explode(';', $style); + if (!$style_attr) { + return null; + } + foreach ($style_attr as $attr) { + $attr = explode(':', $attr); + $style_array[$attr[0]] = $attr[1]; + } + return $style_array; + } + + return null; + } + /** * Generates the opening tag for this object. 
* From 87de63123bb2007b304e79c6a43f4787ef5ff5c1 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Mon, 31 Oct 2016 14:31:06 +0200 Subject: [PATCH 069/200] Update getStyle method in tag class --- src/PHPHtmlParser/Dom/Tag.php | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 57052457..da330c22 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -232,21 +232,28 @@ public function getAttribute($key) */ public function getStyle() { - $style = trim($this->getAttribute('style')); + $style = $this->getAttribute('style'); - if ($style) { - $style_attr = explode(';', $style); - if (!$style_attr) { - return null; - } - foreach ($style_attr as $attr) { - $attr = explode(':', $attr); - $style_array[$attr[0]] = $attr[1]; - } - return $style_array; + if ($style === null) { + return null; + } + + $style_attr = trim($style['value']); + $style_attr = substr($style_attr, 0, -1); + $style_attr = explode(';', $style_attr); + + if (!$style_attr) { + return null; + } + + $style_array = []; + + foreach ($style_attr as $attr) { + $attr = explode(':', $attr); + $style_array[$attr[0]] = $attr[1]; } - return null; + return $style_array; } /** From af5926ca8a3abf23d0e84f52a12fd6363b63c024 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Tue, 8 Nov 2016 10:09:22 +0200 Subject: [PATCH 070/200] Add getAttributeArray() method to AbstractNode class --- src/PHPHtmlParser/Dom/AbstractNode.php | 23 +++++++++++++++++++ src/PHPHtmlParser/Dom/Tag.php | 31 -------------------------- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 33beeda3..8c74fbb7 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -450,4 +450,27 @@ public function isTextNode() { return false; } + + /** + * Get attribute values in 
array + * + * @param $attributeValue + * @param $delimiter + * @return array + */ + public function getAttributeArray($attributeValue, $delimiter) + { + $attributeValue = trim($attributeValue); + $attributeValue = substr($attributeValue, 0, -1); + $attributeValue = explode(';', $attributeValue); + + $result = []; + + foreach ($attributeValue as $attr) { + $attr = explode($delimiter, $attr); + $result[$attr[0]] = $attr[1]; + } + + return $result; + } } diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index da330c22..43659556 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -225,37 +225,6 @@ public function getAttribute($key) return $this->attr[$key]; } - /** - * Return style array - * - * @return null|array - */ - public function getStyle() - { - $style = $this->getAttribute('style'); - - if ($style === null) { - return null; - } - - $style_attr = trim($style['value']); - $style_attr = substr($style_attr, 0, -1); - $style_attr = explode(';', $style_attr); - - if (!$style_attr) { - return null; - } - - $style_array = []; - - foreach ($style_attr as $attr) { - $attr = explode(':', $attr); - $style_array[$attr[0]] = $attr[1]; - } - - return $style_array; - } - /** * Generates the opening tag for this object. 
* From b3f326b81db11065974fbbd3075059afd00ad38a Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Thu, 10 Nov 2016 11:53:58 +0200 Subject: [PATCH 071/200] Update getAttributeArray() method in AbstractNode class --- src/PHPHtmlParser/Dom/AbstractNode.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 8c74fbb7..0c2d6ddf 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -460,6 +460,10 @@ public function isTextNode() { */ public function getAttributeArray($attributeValue, $delimiter) { + if ($attributeValue == null) { + return null; + } + $attributeValue = trim($attributeValue); $attributeValue = substr($attributeValue, 0, -1); $attributeValue = explode(';', $attributeValue); From 3d7ccad20277f828645c871fc2848221deb2d63c Mon Sep 17 00:00:00 2001 From: Oleg Andreyev Date: Fri, 9 Dec 2016 13:14:11 +0200 Subject: [PATCH 072/200] adding hasAttribute --- src/PHPHtmlParser/Dom/AbstractNode.php | 12 ++++++++++++ src/PHPHtmlParser/Dom/Tag.php | 11 +++++++++++ src/PHPHtmlParser/Selector.php | 4 +--- tests/SelectorTest.php | 11 +++++++++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 8549a46f..9f1a9fc7 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -293,6 +293,18 @@ public function getAttribute($key) return $attribute; } + /** + * A wrapper method that simply calls the hasAttribute method + * on the tag of this node. + * + * @param string $key + * @return bool + */ + public function hasAttribute($key) + { + return $this->tag->hasAttribute($key); + } + /** * A wrapper method that simply calls the setAttribute method * on the tag of this node. 
diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 43659556..c2e955fb 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -225,6 +225,17 @@ public function getAttribute($key) return $this->attr[$key]; } + /** + * Returns TRUE if node has attribute + * + * @param string $key + * @return bool + */ + public function hasAttribute($key) + { + return isset($this->attr[$key]); + } + /** * Generates the opening tag for this object. * diff --git a/src/PHPHtmlParser/Selector.php b/src/PHPHtmlParser/Selector.php index e0e1ac73..841c7a0c 100644 --- a/src/PHPHtmlParser/Selector.php +++ b/src/PHPHtmlParser/Selector.php @@ -232,9 +232,7 @@ protected function seek(array $nodes, array $rule, array $options) $pass = false; } } else { - if ($rule['key'] != 'plaintext' && - is_null($child->getAttribute($rule['key'])) - ) { + if ($rule['key'] != 'plaintext' && !$child->hasAttribute($rule['key'])) { $pass = false; } } diff --git a/tests/SelectorTest.php b/tests/SelectorTest.php index 1b31afa0..08a278f2 100644 --- a/tests/SelectorTest.php +++ b/tests/SelectorTest.php @@ -203,4 +203,15 @@ public function testFindChildUsingChildSelector() $selector = new Selector('div > ul'); $this->assertEquals(1, count($selector->find($root))); } + + public function testFindNodeByAttributeOnly() + { + $root = new HtmlNode(new Tag('root')); + $child1 = new HtmlNode(new Tag('ul')); + $child1->setAttribute('custom-attr', null); + $root->addChild($child1); + + $selector = new Selector('[custom-attr]'); + $this->assertEquals(1, count($selector->find($root))); + } } From 786b28ebee66f2bbe5a853b3518fb2a88b048854 Mon Sep 17 00:00:00 2001 From: Oleg Andreyev Date: Fri, 9 Dec 2016 13:15:07 +0200 Subject: [PATCH 073/200] fixing `testLoadFileBigTwicePreserveOption` to be system independent (*nix, win) --- tests/DomTest.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/DomTest.php b/tests/DomTest.php index 9a13ee95..177b9373 
100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -192,9 +192,10 @@ public function testLoadFileBigTwicePreserveOption() $dom = new Dom; $dom->loadFromFile('tests/files/big.html', ['preserveLineBreaks' => true]); $post = $dom->find('.post-row', 0); - $this->assertEquals('

Журчанье воды
-Черно-белые тени
-Вновь на фонтане

', trim($post->find('.post-message', 0)->innerHtml)); + $this->assertEquals( + "

Журчанье воды
\nЧерно-белые тени
\nВновь на фонтане

", + trim($post->find('.post-message', 0)->innerHtml) + ); } public function testLoadFromUrl() From c47796c715e16ce13f1eb5e801ea917a14cde7e8 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Wed, 21 Dec 2016 17:25:55 +0200 Subject: [PATCH 074/200] Added setStyleAttributeValue, getStyleAttributeArray methods to Tag class --- src/PHPHtmlParser/Dom/AbstractNode.php | 5 +-- src/PHPHtmlParser/Dom/Tag.php | 45 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 0c2d6ddf..f65fe60c 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -453,12 +453,13 @@ public function isTextNode() { /** * Get attribute values in array - * + * @deprecated + * @todo remove this method * @param $attributeValue * @param $delimiter * @return array */ - public function getAttributeArray($attributeValue, $delimiter) + public function getAttributeArray($attributeValue, $delimiter = ';') { if ($attributeValue == null) { return null; diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 43659556..1e4de1dd 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -153,6 +153,51 @@ public function setAttribute($key, $value) return $this; } + /** + * Set inline style attribute value. + * + * @param $attr_key + * @param $attr_value + */ + public function setStyleAttributeValue($attr_key, $attr_value) + { + + $style_array = $this->getStyleAttributeArray(); + $style_array[$attr_key] = $attr_value; + + $style_string = ''; + foreach ($style_array as $key => $value) { + $style_string .= $key . ':' . $value . 
';'; + } + + $this->setAttribute('style', $style_string); + } + + /** + * Get style attribute in array + * + * @return array|null + */ + public function getStyleAttributeArray() + { + $value = $this->getAttribute('style'); + + if ($value === null) { + return null; + } + + $value = explode(';', substr(trim($value), 0, -1)); + $result = []; + foreach ($value as $attr) { + $attr = explode(':', $attr); + $result[$attr[0]] = $attr[1]; + } + + return $result; + } + + + /** * Removes an attribute from this tag. * From 43086dfdf5b4b8045d3b56b55b312b6762f65a41 Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 28 Dec 2016 12:36:17 -0500 Subject: [PATCH 075/200] Run clear() on attribute changes This change will run the clear() function to empty any caches if an attribute has been changed, added, or removed. --- src/PHPHtmlParser/Dom/AbstractNode.php | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 8549a46f..403dfb9c 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -305,6 +305,9 @@ public function setAttribute($key, $value) { $this->tag->setAttribute($key, $value); + //clear any cache + $this->clear(); + return $this; } @@ -318,6 +321,9 @@ public function setAttribute($key, $value) public function removeAttribute($key) { $this->tag->removeAttribute($key); + + //clear any cache + $this->clear(); } /** @@ -329,8 +335,10 @@ public function removeAttribute($key) public function removeAllAttributes() { $this->tag->removeAllAttributes(); + + //clear any cache + $this->clear(); } - /** * Function to locate a specific ancestor tag in the path to the root. 
* From 5d05cb089912474101e66d33b893657363e66254 Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 28 Dec 2016 12:44:50 -0500 Subject: [PATCH 076/200] Fix InnerNode::replaceChild() to clear cache when run --- src/PHPHtmlParser/Dom/InnerNode.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 6f458460..a5e9329c 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -242,6 +242,9 @@ public function replaceChild($childId, AbstractNode $newChild) $this->children = array_combine($keys, $this->children); $this->children[$newChild->id()] = $newChild; unset($oldChild); + + //clear any cache + $this->clear(); } /** @@ -314,4 +317,4 @@ public function setParent(InnerNode $parent) return parent::setParent($parent); } -} \ No newline at end of file +} From 4030bc05e04825f2b5ddc2648590847077ad7cdd Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 28 Dec 2016 13:06:49 -0500 Subject: [PATCH 077/200] composer libraries should have a .gitignore file All composer based/distributed libraries should include a local gitignore to prevent the vendor folder from being included. I've included what I would call the default minimum; I was considering leaving composer.lock off the list, but as it's not currently synced it should be included here. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..fde868ef --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +composer.phar +composer.lock +/vendor/ From 7bcfa3a206616ad702dd3926570d377d54929e19 Mon Sep 17 00:00:00 2001 From: Daniel Pock Date: Wed, 28 Dec 2016 13:31:07 -0500 Subject: [PATCH 078/200] Add test to check for cache clear issue While the Tag test doesn't test this issue, I think it's a good addition. This particular bug/issue isn't triggered until you ask the library to 'render' the html for you. 
The HtmlTest does infact test this bug and when run on current master it WILL fail; however, it passes with the changes suggested. --- tests/Node/HtmlTest.php | 33 +++++++++++++++++++++++++++++++++ tests/Node/TagTest.php | 24 ++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index e6998305..e61b0aef 100644 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -246,6 +246,39 @@ public function testOuterHtmlNoValueAttribute() $this->assertEquals('', $parent->outerHtml); } + public function testOuterHtmlWithChanges() + { + $div = new Tag('div'); + $div->setAttributes([ + 'class' => [ + 'value' => 'all', + 'doubleQuote' => true, + ], + ]); + $a = new Tag('a'); + $a->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + $br = new Tag('br'); + $br->selfClosing(); + + $parent = new HtmlNode($div); + $childa = new HtmlNode($a); + $childbr = new HtmlNode($br); + $parent->addChild($childa); + $parent->addChild($childbr); + $childa->addChild(new TextNode('link')); + + $this->assertEquals('', $parent->outerHtml()); + + $childa->setAttribute('href', 'https://www.google.com'); + + $this->assertEquals('link', $childa->outerHtml()); + } + public function testText() { $a = new Tag('a'); diff --git a/tests/Node/TagTest.php b/tests/Node/TagTest.php index b3a528db..a10bef4e 100644 --- a/tests/Node/TagTest.php +++ b/tests/Node/TagTest.php @@ -68,6 +68,30 @@ public function testSetAttributesNoDoubleArray() $this->assertEquals('funtimes', $tag->class['value']); } + public function testUpdateAttributes() + { + $tag = new Tag('a'); + $tag->setAttributes([ + 'href' => [ + 'value' => 'http://google.com', + 'doubleQuote' => false, + ], + ]); + + $this->assertEquals(null, $tag->class['value']); + $this->assertEquals('http://google.com', $tag->href['value']); + + + $attr = [ + 'href' => 'https://www.google.com', + 'class' => 'funtimes', + ]; + + 
$tag->setAttributes($attr); + $this->assertEquals('funtimes', $tag->class['value']); + $this->assertEquals('https://www.google.com', $tag->href['value']); + } + public function testNoise() { $tag = new Tag('a'); From f557f95fe75e219748cc1d31b10852d296df5017 Mon Sep 17 00:00:00 2001 From: Dan Pock Date: Wed, 28 Dec 2016 21:17:32 -0500 Subject: [PATCH 079/200] Update to address another clear related issue --- src/PHPHtmlParser/Dom/AbstractNode.php | 10 +++++----- src/PHPHtmlParser/Dom/HtmlNode.php | 3 +++ src/PHPHtmlParser/Dom/MockNode.php | 3 +++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 403dfb9c..76e0ca9a 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -169,8 +169,8 @@ public function delete() if ( ! is_null($this->parent)) { $this->parent->removeChild($this->id); } - - $this->parent = null; + $this->parent->clear(); + $this->clear(); } /** @@ -307,7 +307,7 @@ public function setAttribute($key, $value) //clear any cache $this->clear(); - + return $this; } @@ -321,7 +321,7 @@ public function setAttribute($key, $value) public function removeAttribute($key) { $this->tag->removeAttribute($key); - + //clear any cache $this->clear(); } @@ -335,7 +335,7 @@ public function removeAttribute($key) public function removeAllAttributes() { $this->tag->removeAllAttributes(); - + //clear any cache $this->clear(); } diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 42b4169c..36aabd88 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -186,6 +186,9 @@ protected function clear() $this->innerHtml = null; $this->outerHtml = null; $this->text = null; + if (is_null($this->parent) === false) { + $this->parent->clear(); + } } /** diff --git a/src/PHPHtmlParser/Dom/MockNode.php b/src/PHPHtmlParser/Dom/MockNode.php index 7029a180..ab7ccfe2 100644 --- 
a/src/PHPHtmlParser/Dom/MockNode.php +++ b/src/PHPHtmlParser/Dom/MockNode.php @@ -40,6 +40,9 @@ protected function clear() $this->innerHtml = null; $this->outerHtml = null; $this->text = null; + if (is_null($this->parent) === false) { + $this->parent->clear(); + } } /** From 770b48f78c4b66914359c3a9e0bf7f28a0503b02 Mon Sep 17 00:00:00 2001 From: billythekid Date: Mon, 16 Jan 2017 11:10:59 +0000 Subject: [PATCH 080/200] added optional array of tags to exempt from trailing slashes for self-closing tags --- src/PHPHtmlParser/Dom.php | 61 +++++++++++++++++++++++++++++++++++ src/PHPHtmlParser/Dom/Tag.php | 22 ++++++++++++- tests/DomTest.php | 9 ++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f3e17ff9..7aef2639 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -90,6 +90,13 @@ class Dom 'spacer', ]; + /** + * A list of tags where there should be no /> at the end (html5 style) + * + * @var array + */ + protected $noSlash = []; + /** * Returns the inner html of the root node. * @@ -267,6 +274,53 @@ public function clearSelfClosingTags() return $this; } + + /** + * Adds a tag to the list of self closing tags that should not have a trailing slash + * + * @param $tag + * @return $this + */ + public function addNoSlashTag($tag) + { + if ( ! is_array($tag)) { + $tag = [$tag]; + } + foreach ($tag as $value) { + $this->noSlash[] = $value; + } + + return $this; + } + + /** + * Removes a tag from the list of no-slash tags. + * + * @param $tag + * @return $this + */ + public function removeNoSlashTag($tag) + { + if ( ! is_array($tag)) { + $tag = [$tag]; + } + $this->noSlash = array_diff($this->noSlash, $tag); + + return $this; + } + + /** + * Empties the list of no-slash tags. + * + * @return $this + */ + public function clearNoSlashTags() + { + $this->noSlash = []; + + return $this; + } + /** * Simple wrapper function that returns the first child. 
* @@ -588,6 +642,13 @@ protected function parseTag() // We force self closing on this tag. $node->getTag()->selfClosing(); + + // Should this tag use a trailing slash? + if(in_array($tag, $this->noSlash)) + { + $node->getTag()->noTrailingSlash(); + } + } $this->content->fastForward(1); diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 43659556..2d354da5 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -33,6 +33,13 @@ class Tag */ protected $selfClosing = false; + /** + * If self-closing, will this use a trailing slash. /> + * + * @var bool + */ + protected $trailingSlash = true; + /** * Tag noise */ @@ -99,6 +106,19 @@ public function selfClosing() return $this; } + + /** + * Sets the tag to not use a trailing slash. + * + * @return $this + */ + public function noTrailingSlash() + { + $this->trailingSlash = false; + + return $this; + } + /** * Checks if the tag is self closing. * @@ -247,7 +267,7 @@ public function makeOpeningTag() } } - if ($this->selfClosing) { + if ($this->selfClosing && $this->trailingSlash) { return $return.' />'; } else { return $return.'>'; diff --git a/tests/DomTest.php b/tests/DomTest.php index 9a13ee95..06589864 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -88,6 +88,15 @@ public function testLoadClosingTagOnSelfClosing() $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } + public function testLoadClosingTagOnSelfClosingNoSlash() + { + $dom = new Dom; + $dom->addNoSlashTag("br"); + + $dom->load('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + public function testLoadClosingTagAddSelfClosingTag() { $dom = new Dom; From 21e15e136988918f64eb56fd609ccfd4b27d08ec Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Thu, 19 Jan 2017 16:20:39 +0200 Subject: [PATCH 081/200] Added Unit test cases for new methods --- tests/Node/ParentTest.php | 20 ++++++++++++++++++++ tests/Node/TextTest.php | 21 ++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/Node/ParentTest.php b/tests/Node/ParentTest.php index 9170c947..d9cbf179 100644 --- a/tests/Node/ParentTest.php +++ b/tests/Node/ParentTest.php @@ -72,6 +72,26 @@ public function testNextChild() $this->assertEquals($child2->id(), $parent->nextChild($child->id())->id()); } + public function testHasNextChild() + { + $parent = new Node; + $child = new Node; + $child2 = new Node; + $parent->addChild($child); + $parent->addChild($child2); + + $this->assertEquals($child2->id(), $parent->hasNextChild($child->id())); + } + + public function testHasNextChildNotExists() + { + $parent = new Node; + $child = new Node; + + $this->expectException(\PHPHtmlParser\Exceptions\ChildNotFoundException::class); + $parent->hasNextChild($child->id()); + } + public function testNextChildWithRemove() { $parent = new Node; diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 022dc842..aa04ec59 100644 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -29,4 +29,23 @@ public function testPreserveEntity() $text = $node->innerhtml; $this->assertEquals('i', $text); } -} + + public function testIsTextNode() + { + $node = new TextNode('text'); + $this->assertEquals(true, $node->isTextNode()); + } + + public function testTextInTextNode() + { + $node = new TextNode('foo bar'); + $this->assertEquals('foo bar', $node->outerHtml()); + } + + public function testSetTextToTextNode() + { + $node = new TextNode(''); + $node->setText('foo bar'); + $this->assertEquals('foo bar', $node->innerHtml()); + } +} 
\ No newline at end of file From 41b8ed713183a5ffbbbfbd3b40f015a08bdda7fc Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Thu, 19 Jan 2017 17:54:09 +0200 Subject: [PATCH 082/200] Added Unit test cases for new methods --- src/PHPHtmlParser/Finder.php | 6 ++--- tests/DomTest.php | 48 ++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/src/PHPHtmlParser/Finder.php b/src/PHPHtmlParser/Finder.php index e70cd0fb..1c754c3c 100644 --- a/src/PHPHtmlParser/Finder.php +++ b/src/PHPHtmlParser/Finder.php @@ -19,14 +19,13 @@ public function __construct($id) /** * - * Find node in tree + * Find node in tree by id * * @param AbstractNode $node * @return bool|AbstractNode */ public function find(AbstractNode $node) { - if (!$node->id()) { return $this->find($node->firstChild()); } @@ -46,11 +45,10 @@ public function find(AbstractNode $node) if ($nextSibling->id() < $this->id) { return $this->find($nextSibling); } - } else { + } else if (!$node->isTextNode()) { return $this->find($node->firstChild()); } return false; } - } \ No newline at end of file diff --git a/tests/DomTest.php b/tests/DomTest.php index 9a13ee95..fad261cb 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -319,4 +319,52 @@ public function testDeleteNode() unset($a); $this->assertEquals('

Hey bro,
:)

', (string) $dom); } + + public function testCountChildren() + { + $dom = new Dom; + $dom->load('hello$foo = "bar";'); + $this->assertEquals(2, $dom->countChildren()); + } + + public function testGetChildrenArray() + { + $dom = new Dom; + $dom->load('hello$foo = "bar";'); + $this->assertInternalType('array', $dom->getChildren()); + } + + public function testHasChildren() + { + $dom = new Dom; + $dom->load('hello$foo = "bar";'); + $this->assertTrue($dom->hasChildren()); + } + + public function testFindByIdVar1() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + /** @var Dom\AbstractNode $result */ + $result = $dom->findById(4); + $this->assertEquals(4, $result->id()); + } + + public function testFindByIdVar2() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + /** @var Dom\AbstractNode $result */ + $result = $dom->findById(5); + $this->assertEquals(5, $result->id()); + } + + public function testFindByIdNotFountEleement() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + /** @var Dom\AbstractNode $result */ + $result = $dom->findById(8); + $this->assertFalse($result); + } } From 8535994263d3fc5b11656d0fcb6ae853037cee87 Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Thu, 19 Jan 2017 18:38:33 +0200 Subject: [PATCH 083/200] Added Unit test cases for style attributes --- src/PHPHtmlParser/Dom/Tag.php | 2 +- tests/Node/TagTest.php | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 1e4de1dd..97a4bece 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -180,7 +180,7 @@ public function setStyleAttributeValue($attr_key, $attr_value) */ public function getStyleAttributeArray() { - $value = $this->getAttribute('style'); + $value = $this->getAttribute('style')['value']; if ($value === null) { return null; diff --git a/tests/Node/TagTest.php b/tests/Node/TagTest.php index b3a528db..361bfc98 100644 --- a/tests/Node/TagTest.php +++ b/tests/Node/TagTest.php @@ -153,4 +153,18 @@ public function testMakeClosingTagSelfClosing() $tag->selfClosing(); $this->assertEmpty($tag->makeClosingTag()); } + + public function testSetTagAttribute() + { + $tag = new Tag('div'); + $tag->setStyleAttributeValue('display', 'none'); + $this->assertEquals('display:none;', $tag->getAttribute('style')['value']); + } + + public function testGetStyleAttributesArray() + { + $tag = new Tag('div'); + $tag->setStyleAttributeValue('display', 'none'); + $this->assertInternalType('array', $tag->getStyleAttributeArray()); + } } From 9928233c13a09106d436b15ccdc94e678da10c9d Mon Sep 17 00:00:00 2001 From: Andrii Lytvynenko Date: Thu, 19 Jan 2017 18:56:25 +0200 Subject: [PATCH 084/200] Removed deprecated method getAttributeArray --- src/PHPHtmlParser/Dom/AbstractNode.php | 28 -------------------------- 1 file changed, 28 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 
f65fe60c..33beeda3 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -450,32 +450,4 @@ public function isTextNode() { return false; } - - /** - * Get attribute values in array - * @deprecated - * @todo remove this method - * @param $attributeValue - * @param $delimiter - * @return array - */ - public function getAttributeArray($attributeValue, $delimiter = ';') - { - if ($attributeValue == null) { - return null; - } - - $attributeValue = trim($attributeValue); - $attributeValue = substr($attributeValue, 0, -1); - $attributeValue = explode(';', $attributeValue); - - $result = []; - - foreach ($attributeValue as $attr) { - $attr = explode($delimiter, $attr); - $result[$attr[0]] = $attr[1]; - } - - return $result; - } } From fbd2f2e6666c5b0cab61aab4d07e16ff9f5152b9 Mon Sep 17 00:00:00 2001 From: Florian Arndt Date: Tue, 4 Apr 2017 13:44:32 +0200 Subject: [PATCH 085/200] Inserted return statement. --- src/PHPHtmlParser/Dom/AbstractNode.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 8549a46f..e60376f5 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -83,7 +83,7 @@ public function __get($key) case 'tag': return $this->getTag(); case 'parent': - $this->getParent(); + return $this->getParent(); } return null; From 6847a772a9f7a6d2ee1b79160b35e909e2fe666c Mon Sep 17 00:00:00 2001 From: Florian Arndt Date: Tue, 4 Apr 2017 13:45:49 +0200 Subject: [PATCH 086/200] . --- composer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer.json b/composer.json index c75e0e46..a4a6f6c4 100644 --- a/composer.json +++ b/composer.json @@ -1,5 +1,5 @@ { - "name": "paquettg/php-html-parser", + "name": "arweb/php-html-parser", "type": "library", "version": "1.7.0", "description": "An HTML DOM parser. It allows you to manipulate HTML. 
Find tags on an HTML page with selectors just like jQuery.", From 067980d2c40e49dda5d74b2b7441809bb289dd71 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Jul 2017 20:35:13 +0000 Subject: [PATCH 087/200] Added mbstring to requirements --- composer.json | 1 + 1 file changed, 1 insertion(+) diff --git a/composer.json b/composer.json index c75e0e46..dec36443 100644 --- a/composer.json +++ b/composer.json @@ -15,6 +15,7 @@ ], "require": { "php": ">=5.6", + "ext-mbstring": "*", "paquettg/string-encode": "~0.1.0" }, "require-dev": { From 0a237beb2ed5358b7811c9b0e77659b837a2a94d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Jul 2017 20:35:31 +0000 Subject: [PATCH 088/200] Updated tests --- tests/DomTest.php | 9 - tests/Options/CleanupTest.php | 18 +- tests/StaticDomTest.php | 10 +- tests/files/big.html | 4 +- tests/files/horrible.html | 301 ---------------------------------- 5 files changed, 14 insertions(+), 328 deletions(-) delete mode 100644 tests/files/horrible.html diff --git a/tests/DomTest.php b/tests/DomTest.php index 9a13ee95..387d8772 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -259,15 +259,6 @@ public function testGetElementsByClass() $this->assertEquals('

Hey bro, click here

', $dom->getElementsByClass('all')[0]->innerHtml); } - public function testEnforceEncoding() - { - $dom = new Dom; - $dom->load('tests/files/horrible.html', [ - 'enforceEncoding' => 'UTF-8', - ]); - $this->assertNotEquals('', $dom->find('table input', 1)->outerHtml); - } - public function testScriptCleanerScriptTag() { $dom = new Dom; diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index 44539651..5df17bf1 100644 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -10,7 +10,7 @@ public function testCleanupInputTrue() $dom->setOptions([ 'cleanupInput' => true, ]); - $dom->loadFromFile('tests/files/horrible.html'); + $dom->loadFromFile('tests/files/big.html'); $this->assertEquals(0, count($dom->find('style'))); $this->assertEquals(0, count($dom->find('script'))); } @@ -21,9 +21,9 @@ public function testCleanupInputFalse() $dom->setOptions([ 'cleanupInput' => false, ]); - $dom->loadFromFile('tests/files/horrible.html'); + $dom->loadFromFile('tests/files/big.html'); $this->assertEquals(1, count($dom->find('style'))); - $this->assertEquals(1, count($dom->find('script'))); + $this->assertEquals(22, count($dom->find('script'))); } public function testRemoveStylesTrue() @@ -32,7 +32,7 @@ public function testRemoveStylesTrue() $dom->setOptions([ 'removeStyles' => true, ]); - $dom->loadFromFile('tests/files/horrible.html'); + $dom->loadFromFile('tests/files/big.html'); $this->assertEquals(0, count($dom->find('style'))); } @@ -42,7 +42,7 @@ public function testRemoveStylesFalse() $dom->setOptions([ 'removeStyles' => false, ]); - $dom->loadFromFile('tests/files/horrible.html'); + $dom->loadFromFile('tests/files/big.html'); $this->assertEquals(1, count($dom->find('style'))); $this->assertEquals('text/css', $dom->find('style')->getAttribute('type')); @@ -54,7 +54,7 @@ public function testRemoveScriptsTrue() $dom->setOptions([ 'removeScripts' => true, ]); - $dom->loadFromFile('tests/files/horrible.html'); + 
$dom->loadFromFile('tests/files/big.html'); $this->assertEquals(0, count($dom->find('script'))); } @@ -64,9 +64,9 @@ public function testRemoveScriptsFalse() $dom->setOptions([ 'removeScripts' => false, ]); - $dom->loadFromFile('tests/files/horrible.html'); - $this->assertEquals(1, count($dom->find('script'))); - $this->assertEquals('text/JavaScript', + $dom->loadFromFile('tests/files/big.html'); + $this->assertEquals(22, count($dom->find('script'))); + $this->assertEquals('text/javascript', $dom->find('script')->getAttribute('type')); } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index ad6c218a..5c1f487a 100644 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -41,12 +41,6 @@ public function testLoadFromFile() $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); } - public function testFind() - { - Dom::load('tests/files/horrible.html'); - $this->assertEquals('', Dom::find('table input', 1)->outerHtml); - } - /** * @expectedException PHPHtmlParser\Exceptions\NotLoadedException */ @@ -57,8 +51,8 @@ public function testFindNoLoad() public function testFindI() { - Dom::load('tests/files/horrible.html'); - $this->assertEquals('[ Досие бр:12928 ]', Dom::find('i')[0]->innerHtml); + Dom::load('tests/files/big.html'); + $this->assertEquals('В кустах блестит металл
И искрится ток
Человечеству конец', Dom::find('i')[1]->innerHtml); } public function testLoadFromUrl() diff --git a/tests/files/big.html b/tests/files/big.html index 64f0f57b..6b5e3ee5 100644 --- a/tests/files/big.html +++ b/tests/files/big.html @@ -48,7 +48,9 @@ - + - - - - - - -

- - 0-9 A - B C - D E - F G - H I - J K - L M - N O - P Q - R S - T U - V X Y - W Z - -

- - - - - - - - -
- - -
- -
- : - - -
-
-
- - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
: marnet.mk [ :12928 ]
-
-
: 22-05-2014
-
-
-

:

-
-
22-05-2008
-
-

:

-
-
-
-

:

-
-
. .17 ϣ
-
-

:

-
-
4080011519278
-
-

:

-
-
02/3256-561
  
-
-

:

-
-
-
-

e-mail:

-
-
domains@marnet.net.mk
-
-

:

-
-
//
-
-

:

-
-
-
-

e-mail:

-
-
domains@marnet.net.mk
-
-

:

-
-
//
IP
nsg.mio.gov.mk80.77.151.251
kitka.marnet.net.mk194.149.131.2
- - - -
- - - -
From 050e7ff5a6c7b961dd037ed20810dd8f8db91d48 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Jul 2017 20:38:44 +0000 Subject: [PATCH 089/200] Updated phpunit --- composer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer.json b/composer.json index dec36443..96c16d2f 100644 --- a/composer.json +++ b/composer.json @@ -19,7 +19,7 @@ "paquettg/string-encode": "~0.1.0" }, "require-dev": { - "phpunit/phpunit": "~5.3.0", + "phpunit/phpunit": "~5.7.0", "satooshi/php-coveralls": "~1.0.0", "mockery/mockery": "~0.9.0" }, From c1aaa0d122ebaf677eb1b6cf73d03062b98a2c16 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Jul 2017 23:08:06 +0000 Subject: [PATCH 090/200] Updated travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 28422ddb..20c077a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ language: php php: - 5.6 - 7.0 - - hhvm + ~ 7.1 install: - composer self-update From da5afae3f4905cc8ac20ebf64fb06a25ba5d5ee9 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Jul 2017 23:08:16 +0000 Subject: [PATCH 091/200] $61 Implemented new setText method --- README.md | 12 +++++++++++- src/PHPHtmlParser/Dom/TextNode.php | 18 ++++++++++++++++++ tests/Node/TextTest.php | 23 +++++++++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c4a5d7e8..6dde603a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ PHPHtmlParser is a simple, flexible, html parser which allows you to select tags Install ------- -This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 5.6, 7.0, and hhvm 2.3. +This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 5.6, 7.0, 7.1. 
Usage ----- @@ -217,3 +217,13 @@ $a->delete(); unset($a); echo $dom; // '

Hey bro,
:)

'); ``` + +You can modify the text of `TextNode` objects easily. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. + +```php +$dom = new Dom; +$dom->load('

Hey bro, click here
:)

'); +$a = $dom->find('a')[0]; +$a->firstChild()->setText('biz baz'); +echo $dom; // '

Hey bro, biz baz
:)

' +``` diff --git a/src/PHPHtmlParser/Dom/TextNode.php b/src/PHPHtmlParser/Dom/TextNode.php index 0a3d8773..0a853774 100644 --- a/src/PHPHtmlParser/Dom/TextNode.php +++ b/src/PHPHtmlParser/Dom/TextNode.php @@ -72,6 +72,24 @@ public function text() } } + /** + * Sets the text for this node. + * + * @var string $text + * @return void + */ + public function setText($text) + { + $this->text = $text; + + if ( ! is_null($this->encode)) { + $text = $this->encode->convert($text); + + // remember the conversion + $this->convertedText = $text; + } + } + /** * This node has no html, just return the text. * diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 022dc842..40b6c59f 100644 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -1,6 +1,8 @@ innerhtml; $this->assertEquals('i', $text); } + + public function testSetText() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + $a = $dom->find('a')[0]; + $a->firstChild()->setText('biz baz'); + $this->assertEquals('

Hey bro, biz baz
:)

', (string) $dom); + } + + public function testSetTextEncoded() + { + $encode = new Encode; + $encode->from('UTF-8'); + $encode->to('UTF-8'); + + $node = new TextNode('foo bar'); + $node->propagateEncoding($encode); + $node->setText('biz baz'); + $this->assertEquals('biz baz', $node->text()); + } } From 8c150f52a66154f16c1d5d00386f24b7f832b046 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Jul 2017 23:14:27 +0000 Subject: [PATCH 092/200] Fixed issue in composer json --- composer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer.json b/composer.json index dac82284..96c16d2f 100644 --- a/composer.json +++ b/composer.json @@ -1,5 +1,5 @@ { - "name": "arweb/php-html-parser", + "name": "paquettg/php-html-parser", "type": "library", "version": "1.7.0", "description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.", From c1b94214d06948c489242032ee5bfa9e5dedd5ea Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 24 Jul 2017 00:20:42 +0000 Subject: [PATCH 093/200] Updated gitignore and attributes --- .gitattributes | 1 + .gitignore | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitattributes b/.gitattributes index 3c40333f..afc2bfbc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1,7 @@ /tests export-ignore /.scrutinizar.yml export-ignore /.travis.yml export-ignore +/.gitignore export-ignore /CHANGELOG.md export-ignore /CONTRIBUTING.md export-ignore /LICENSE.md export-ignore diff --git a/.gitignore b/.gitignore index b0d2e593..b871be44 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ composer.phar composer.lock /vendor/ .idea/ +*.swp From 35de083eb8022da461686917da043cb5dd300d52 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 24 Jul 2017 00:20:55 +0000 Subject: [PATCH 094/200] Fixes #73 --- tests/Node/HtmlTest.php | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php 
index e61b0aef..8f433859 100644 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -1,6 +1,7 @@ ancestorByTag('div'); } + + public function testReplaceNode() + { + $dom = new Dom; + $dom->load('

Hey bro, click here
:)

'); + $id = $dom->find('p')[0]->id(); + $newChild = new HtmlNode('h1'); + $dom->find('p')[0]->getParent()->replaceChild($id, $newChild); + $this->assertEquals('

', (string) $dom); + } } From e85e379a07143f60471d289748f7d26513e28c99 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 24 Jul 2017 00:37:01 +0000 Subject: [PATCH 095/200] Added double space option fixes #80 close #80 --- README.md | 5 ++++- src/PHPHtmlParser/Dom.php | 2 +- src/PHPHtmlParser/Dom/TextNode.php | 8 +++++--- src/PHPHtmlParser/Options.php | 1 + tests/DomTest.php | 10 ++++++++++ 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 11a32179..a2e88b3c 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ $dom->load('http://google.com', [ $dom->load('http://gmail.com'); // will not have whitespaceTextNode set to false. ``` -At the moment we support 7 options. +At the moment we support 8 options. **Strict** @@ -168,6 +168,9 @@ Set this to `false` to skip removing of style tags from the document body. This Preserves Line Breaks if set to `true`. If set to `false` line breaks are cleaned up as part of the input clean up process. Defaults to `false`. +**removeDoubleSpace** + +Set this to `false` if you want to preserve whitespace inside of text nodes. It is set to `true` by default.
Static Facade ------------- diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index ac765ca7..22b56ec6 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -565,7 +565,7 @@ protected function parse() trim($str) != '' ) { // we found text we care about - $textNode = new TextNode($str); + $textNode = new TextNode($str, $this->options->removeDoubleSpace); $activeNode->addChild($textNode); } } diff --git a/src/PHPHtmlParser/Dom/TextNode.php b/src/PHPHtmlParser/Dom/TextNode.php index facb8fcd..c4c65c28 100644 --- a/src/PHPHtmlParser/Dom/TextNode.php +++ b/src/PHPHtmlParser/Dom/TextNode.php @@ -35,10 +35,12 @@ class TextNode extends LeafNode * * @param string $text */ - public function __construct($text) + public function __construct($text, $removeDoubleSpace = true) { - // remove double spaces - $text = mb_ereg_replace('\s+', ' ', $text); + if ($removeDoubleSpace) { + // remove double spaces + $text = mb_ereg_replace('\s+', ' ', $text); + } // restore line breaks $text = str_replace(' ', "\n", $text); diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index ac84ac2f..9baf6c09 100644 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -25,6 +25,7 @@ class Options 'removeScripts' => true, 'removeStyles' => true, 'preserveLineBreaks' => false, + 'removeDoubleSpace' => true, ]; /** diff --git a/tests/DomTest.php b/tests/DomTest.php index 5481422b..cceeaf29 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -377,4 +377,14 @@ public function testFindByIdNotFountEleement() $result = $dom->findById(8); $this->assertFalse($result); } + + public function testWhitespaceInText() + { + $dom = new Dom(); + $dom->setOptions(array( + 'removeDoubleSpace' => false, + )); + $dom->load('
    Hello world
'); + $this->assertEquals('
    Hello world
', (string) $dom); + } } From 1fd97411e0dc042c221263c61bd393973efb9c93 Mon Sep 17 00:00:00 2001 From: Upperfoot Date: Thu, 12 Oct 2017 12:43:12 +0100 Subject: [PATCH 096/200] Resolves Problem with PHP 7.2 Throwing Error on count Problem: Currently PHP 7.2 throws an error if count is used on a non-countable object. Resolution: Check for variable Countableness. Change: Check variable is an instanceof Countable. --- src/PHPHtmlParser/Selector.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Selector.php b/src/PHPHtmlParser/Selector.php index e0e1ac73..6910f3d6 100644 --- a/src/PHPHtmlParser/Selector.php +++ b/src/PHPHtmlParser/Selector.php @@ -6,6 +6,7 @@ use PHPHtmlParser\Dom\InnerNode; use PHPHtmlParser\Dom\LeafNode; use PHPHtmlParser\Exceptions\ChildNotFoundException; +use Countable; /** * Class Selector @@ -168,7 +169,9 @@ protected function parseSelectorString($selector) protected function seek(array $nodes, array $rule, array $options) { // XPath index - if (count($rule['tag']) > 0 && + if ($rule['tag'] instanceof Countable && + count($rule['tag']) > 0 && + $rule['key'] instanceof Countable && count($rule['key']) > 0 && is_numeric($rule['key']) ) { From 261e5dd38212e7e3978ab39651fde7c66fecddff Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 8 Jan 2019 21:31:20 -0500 Subject: [PATCH 097/200] Updated dependencies and tests Updated Supported PHP Versions --- .travis.yml | 6 +++--- composer.json | 10 +++++----- phpunit.xml | 1 - tests/CollectionTest.php | 4 +++- tests/ContentTest.php | 4 +++- tests/DomTest.php | 4 +++- tests/Node/ChildrenTest.php | 4 +++- tests/Node/HtmlTest.php | 5 +++-- tests/Node/ParentTest.php | 4 +++- tests/Node/TagTest.php | 4 +++- tests/Node/TextTest.php | 6 ++++-- tests/Options/CleanupTest.php | 4 +++- tests/Options/PreserveLineBreaks.php | 4 +++- tests/Options/StrictTest.php | 4 +++- tests/Options/WhitespaceTextNodeTest.php | 4 +++- tests/OptionsTest.php | 4 +++- tests/SelectorTest.php | 
4 +++- tests/StaticDomTest.php | 4 +++- 18 files changed, 54 insertions(+), 26 deletions(-) diff --git a/.travis.yml b/.travis.yml index 20c077a0..37036761 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,9 @@ language: php php: - - 5.6 - - 7.0 - ~ 7.1 + - 7.1 + - 7.2 + - 7.3 install: - composer self-update diff --git a/composer.json b/composer.json index 96c16d2f..885b470a 100644 --- a/composer.json +++ b/composer.json @@ -14,14 +14,14 @@ } ], "require": { - "php": ">=5.6", + "php": ">=7.1", "ext-mbstring": "*", - "paquettg/string-encode": "~0.1.0" + "paquettg/string-encode": "~1.0.0" }, "require-dev": { - "phpunit/phpunit": "~5.7.0", - "satooshi/php-coveralls": "~1.0.0", - "mockery/mockery": "~0.9.0" + "phpunit/phpunit": "^7.5.1", + "mockery/mockery": "^1.2", + "php-coveralls/php-coveralls": "^2.1" }, "autoload": { "psr-0": { diff --git a/phpunit.xml b/phpunit.xml index 64a402ac..d0aa7db8 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -8,7 +8,6 @@ convertWarningsToExceptions="true" processIsolation="false" stopOnFailure="false" - syntaxCheck="false" > diff --git a/tests/CollectionTest.php b/tests/CollectionTest.php index 6204f954..8e9f1c59 100644 --- a/tests/CollectionTest.php +++ b/tests/CollectionTest.php @@ -1,11 +1,13 @@ setText('biz baz'); $this->assertEquals('biz baz', $node->text()); } -} \ No newline at end of file +} diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index 5df17bf1..5dd3854e 100644 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -1,8 +1,10 @@ Date: Sat, 12 Jan 2019 14:33:30 -0500 Subject: [PATCH 098/200] Updated README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2e88b3c..5826d2c2 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ PHPHtmlParser is a simple, flexible, html parser which allows you to select tags Install ------- -This package can be found on 
[packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 5.6, 7.0, 7.1. +This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, and 7.3. Usage ----- From 236b9a16d7a092ee6425f276590056537d2e703f Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 12 Jan 2019 14:33:50 -0500 Subject: [PATCH 099/200] Fixed issue with selector on top level items --- src/PHPHtmlParser/Selector.php | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/PHPHtmlParser/Selector.php b/src/PHPHtmlParser/Selector.php index 99767768..5fe2179b 100644 --- a/src/PHPHtmlParser/Selector.php +++ b/src/PHPHtmlParser/Selector.php @@ -169,10 +169,8 @@ protected function parseSelectorString($selector) protected function seek(array $nodes, array $rule, array $options) { // XPath index - if ($rule['tag'] instanceof Countable && - count($rule['tag']) > 0 && - $rule['key'] instanceof Countable && - count($rule['key']) > 0 && + if (array_key_exists('tag', $rule) && + array_key_exists('key', $rule) && is_numeric($rule['key']) ) { $count = 0; From 9b32cd1341fec31adfc41d8f116d8da7aab269bd Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 12 Jan 2019 15:07:50 -0500 Subject: [PATCH 100/200] Fixed pattern for PHP 7.3 --- src/PHPHtmlParser/Selector.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Selector.php b/src/PHPHtmlParser/Selector.php index 5fe2179b..e327ad05 100644 --- a/src/PHPHtmlParser/Selector.php +++ b/src/PHPHtmlParser/Selector.php @@ -21,7 +21,7 @@ class Selector * * @var string */ - protected $pattern = "/([\w-:\*>]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + protected $pattern = 
"/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; protected $selectors = []; From d57a7eb1e3811b30d6201d6fc7bfc9db133b415e Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 20 Jan 2019 20:51:53 -0500 Subject: [PATCH 101/200] Updated method signatures --- src/PHPHtmlParser/Content.php | 28 ++++---- src/PHPHtmlParser/Curl.php | 2 +- src/PHPHtmlParser/CurlInterface.php | 2 +- src/PHPHtmlParser/Dom.php | 96 ++++++++++++++------------ src/PHPHtmlParser/Dom/AbstractNode.php | 84 ++++++++++++++-------- src/PHPHtmlParser/Dom/ArrayNode.php | 6 +- src/PHPHtmlParser/Dom/Collection.php | 20 +++--- src/PHPHtmlParser/Dom/HtmlNode.php | 12 ++-- src/PHPHtmlParser/Dom/InnerNode.php | 62 ++++++++++------- src/PHPHtmlParser/Dom/MockNode.php | 10 +-- src/PHPHtmlParser/Dom/Tag.php | 49 +++++++------ src/PHPHtmlParser/Dom/TextNode.php | 15 ++-- src/PHPHtmlParser/Options.php | 7 +- src/PHPHtmlParser/Selector.php | 10 +-- src/PHPHtmlParser/StaticDom.php | 18 ++--- 15 files changed, 238 insertions(+), 183 deletions(-) diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index aa0dfe52..09c74d6a 100644 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -43,9 +43,9 @@ class Content /** * Content constructor. * - * @param $content + * @param string $content */ - public function __construct($content) + public function __construct(string $content = '') { $this->content = $content; $this->size = strlen($content); @@ -57,7 +57,7 @@ public function __construct($content) * * @return int */ - public function getPosition() + public function getPosition(): int { return $this->pos; } @@ -68,7 +68,7 @@ public function getPosition() * @param int $char * @return string */ - public function char($char = null) + public function char(int $char = null): string { $pos = $this->pos; if ( ! 
is_null($char)) { @@ -86,9 +86,10 @@ public function char($char = null) * Moves the current position forward. * * @param int $count - * @return $this + * @return Content + * @chainable */ - public function fastForward($count) + public function fastForward(int $count): Content { $this->pos += $count; @@ -99,9 +100,10 @@ public function fastForward($count) * Moves the current position backward. * * @param int $count - * @return $this + * @return Content + * @chainable */ - public function rewind($count) + public function rewind(int $count): Content { $this->pos -= $count; if ($this->pos < 0) { @@ -119,7 +121,7 @@ public function rewind($count) * @param bool $escape * @return string */ - public function copyUntil($string, $char = false, $escape = false) + public function copyUntil(string $string, bool $char = false, bool $escape = false): string { if ($this->pos >= $this->size) { // nothing left @@ -180,7 +182,7 @@ public function copyUntil($string, $char = false, $escape = false) * @param string $unless * @return string */ - public function copyUntilUnless($string, $unless) + public function copyUntilUnless(string $string, string $unless) { $lastPos = $this->pos; $this->fastForward(1); @@ -205,7 +207,7 @@ public function copyUntilUnless($string, $unless) * @return string * @uses $this->copyUntil() */ - public function copyByToken($token, $char = false, $escape = false) + public function copyByToken(string $token, bool $char = false, bool $escape = false) { $string = $this->$token; @@ -219,7 +221,7 @@ public function copyByToken($token, $char = false, $escape = false) * @param bool $copy * @return $this|string */ - public function skip($string, $copy = false) + public function skip(string $string, bool $copy = false) { $len = strspn($this->content, $string, $this->pos); @@ -243,7 +245,7 @@ public function skip($string, $copy = false) * @return null|string * @uses $this->skip() */ - public function skipByToken($token, $copy = false) + public function skipByToken(string 
$token, bool $copy = false) { $string = $this->$token; diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php index cf688cb9..8ce8fa37 100644 --- a/src/PHPHtmlParser/Curl.php +++ b/src/PHPHtmlParser/Curl.php @@ -18,7 +18,7 @@ class Curl implements CurlInterface * @return string * @throws CurlException */ - public function get($url) + public function get(string $url): string { $ch = curl_init($url); diff --git a/src/PHPHtmlParser/CurlInterface.php b/src/PHPHtmlParser/CurlInterface.php index 6f3f0951..622857ec 100644 --- a/src/PHPHtmlParser/CurlInterface.php +++ b/src/PHPHtmlParser/CurlInterface.php @@ -15,5 +15,5 @@ interface CurlInterface * @param string $url * @return string */ - public function get($url); + public function get($url): string; } diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 22b56ec6..cc88281d 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -110,7 +110,7 @@ class Dom * * @return string */ - public function __toString() + public function __toString(): string { return $this->root->innerHtml(); } @@ -131,9 +131,10 @@ public function __get($name) * * @param string $str * @param array $options - * @return $this + * @return Dom + * @chainable */ - public function load($str, $options = []) + public function load(string $str, array $options = []): Dom { AbstractNode::resetCount(); // check if it's a file @@ -153,9 +154,10 @@ public function load($str, $options = []) * * @param string $file * @param array $options - * @return $this + * @return Dom + * @chainable */ - public function loadFromFile($file, $options = []) + public function loadFromFile(string $file, array $options = []): Dom { return $this->loadStr(file_get_contents($file), $options); } @@ -167,9 +169,10 @@ public function loadFromFile($file, $options = []) * @param string $url * @param array $options * @param CurlInterface $curl - * @return $this + * @return Dom + * @chainable */ - public function loadFromUrl($url, $options = [], 
CurlInterface $curl = null) + public function loadFromUrl(string $url, array $options = [], CurlInterface $curl = null): Dom { if (is_null($curl)) { // use the default curl interface @@ -186,9 +189,10 @@ public function loadFromUrl($url, $options = [], CurlInterface $curl = null) * * @param string $str * @param array $option - * @return $this + * @return Dom + * @chainable */ - public function loadStr($str, $option) + public function loadStr(string $str, array $option): Dom { $this->options = new Options; $this->options->setOptions($this->globalOptions) @@ -212,9 +216,10 @@ public function loadStr($str, $option) * Sets a global options array to be used by all load calls. * * @param array $options - * @return $this + * @return Dom + * @chainable */ - public function setOptions(array $options) + public function setOptions(array $options): Dom { $this->globalOptions = $options; @@ -226,9 +231,9 @@ public function setOptions(array $options) * * @param string $selector * @param int $nth - * @return array + * @return mixed */ - public function find($selector, $nth = null) + public function find(string $selector, int $nth = null) { $this->isLoaded(); @@ -238,11 +243,10 @@ public function find($selector, $nth = null) /** * Find element by Id on the root node * - * @param int $id Element Id + * @param int $id * @return mixed - * */ - public function findById($id) + public function findById(int $id) { $this->isLoaded(); @@ -254,9 +258,10 @@ public function findById($id) * be self closing. * * @param string|array $tag - * @return $this + * @return Dom + * @chainable */ - public function addSelfClosingTag($tag) + public function addSelfClosingTag($tag): Dom { if ( ! is_array($tag)) { $tag = [$tag]; @@ -273,9 +278,10 @@ public function addSelfClosingTag($tag) * always be self closing. * * @param string|array $tag - * @return $this + * @return Dom + * @chainable */ - public function removeSelfClosingTag($tag) + public function removeSelfClosingTag($tag): Dom { if ( ! 
is_array($tag)) { $tag = [$tag]; @@ -288,9 +294,10 @@ public function removeSelfClosingTag($tag) /** * Sets the list of self closing tags to empty. * - * @return $this + * @return Dom + * @chainable */ - public function clearSelfClosingTags() + public function clearSelfClosingTags(): Dom { $this->selfClosing = []; @@ -302,9 +309,10 @@ public function clearSelfClosingTags() * Adds a tag to the list of self closing tags that should not have a trailing slash * * @param $tag - * @return $this + * @return Dom + * @chainable */ - public function addNoSlashTag($tag) + public function addNoSlashTag($tag): Dom { if ( ! is_array($tag)) { $tag = [$tag]; @@ -320,9 +328,10 @@ public function addNoSlashTag($tag) * Removes a tag from the list of no-slash tags. * * @param $tag - * @return $this + * @return Dom + * @chainable */ - public function removeNoSlashTag($tag) + public function removeNoSlashTag($tag): Dom { if ( ! is_array($tag)) { $tag = [$tag]; @@ -335,9 +344,10 @@ public function removeNoSlashTag($tag) /** * Empties the list of no-slash tags. 
* - * @return $this + * @return Dom + * @chainable */ - public function clearNoSlashTags() + public function clearNoSlashTags(): Dom { $this->noSlash = []; @@ -349,7 +359,7 @@ public function clearNoSlashTags() * * @return \PHPHtmlParser\Dom\AbstractNode */ - public function firstChild() + public function firstChild(): \PHPHtmlParser\Dom\AbstractNode { $this->isLoaded(); @@ -361,7 +371,7 @@ public function firstChild() * * @return \PHPHtmlParser\Dom\AbstractNode */ - public function lastChild() + public function lastChild(): \PHPHtmlParser\Dom\AbstractNode { $this->isLoaded(); @@ -373,7 +383,7 @@ public function lastChild() * * @return int */ - public function countChildren() + public function countChildren(): int { $this->isLoaded(); @@ -385,7 +395,7 @@ public function countChildren() * * @return array */ - public function getChildren() + public function getChildren(): array { $this->isLoaded(); @@ -397,7 +407,7 @@ public function getChildren() * * @return bool */ - public function hasChildren() + public function hasChildren(): bool { $this->isLoaded(); @@ -411,7 +421,7 @@ public function hasChildren() * @param string $id * @return \PHPHtmlParser\Dom\AbstractNode */ - public function getElementById($id) + public function getElementById($id): \PHPHtmlParser\Dom\AbstractNode { $this->isLoaded(); @@ -423,9 +433,9 @@ public function getElementById($id) * tag name. * * @param string $name - * @return array + * @return mixed */ - public function getElementsByTag($name) + public function getElementsByTag(string $name) { $this->isLoaded(); @@ -437,9 +447,9 @@ public function getElementsByTag($name) * class name. 
* * @param string $class - * @return array + * @return mixed */ - public function getElementsByClass($class) + public function getElementsByClass(string $class) { $this->isLoaded(); @@ -451,7 +461,7 @@ public function getElementsByClass($class) * * @throws NotLoadedException */ - protected function isLoaded() + protected function isLoaded(): void { if (is_null($this->content)) { throw new NotLoadedException('Content is not loaded!'); @@ -464,7 +474,7 @@ protected function isLoaded() * @param string $str * @return string */ - protected function clean($str) + protected function clean(string $str): string { if ($this->options->get('cleanupInput') != true) { // skip entire cleanup step @@ -517,7 +527,7 @@ protected function clean($str) /** * Attempts to parse the html in content. */ - protected function parse() + protected function parse(): void { // add the root node $this->root = new HtmlNode('root'); @@ -577,7 +587,7 @@ protected function parse() * @return array * @throws StrictException */ - protected function parseTag() + protected function parseTag(): array { $return = [ 'status' => false, @@ -725,7 +735,7 @@ protected function parseTag() * * @return bool */ - protected function detectCharset() + protected function detectCharset(): bool { // set the default $encode = new Encode; diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index c1b94f9b..fbd1c906 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -3,6 +3,7 @@ use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\ParentNotFoundException; +use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Selector; use stringEncode\Encode; use PHPHtmlParser\Finder; @@ -69,7 +70,7 @@ public function __construct() * @param string $key * @return mixed */ - public function __get($key) + public function __get(string $key) { // check attribute first if ( ! 
is_null($this->getAttribute($key))) { @@ -114,6 +115,8 @@ public function __toString() /** * Reset node counter + * + * @return void */ public static function resetCount() { @@ -122,8 +125,10 @@ public static function resetCount() /** * Returns the id of this object. + * + * @return int */ - public function id() + public function id(): int { return $this->id; } @@ -142,10 +147,11 @@ public function getParent() * Sets the parent node. * * @param InnerNode $parent - * @return $this + * @return AbstractNode * @throws CircularException + * @chainable */ - public function setParent(InnerNode $parent) + public function setParent(InnerNode $parent): AbstractNode { // remove from old parent if ( ! is_null($this->parent)) { @@ -199,7 +205,7 @@ public function propagateEncoding(Encode $encode) * @param int $id * @return bool */ - public function isAncestor($id) + public function isAncestor(int $id): Bool { if ( ! is_null($this->getAncestor($id))) { return true; @@ -214,7 +220,7 @@ public function isAncestor($id) * @param int $id * @return null|AbstractNode */ - public function getAncestor($id) + public function getAncestor(int $id) { if ( ! is_null($this->parent)) { if ($this->parent->id() == $id) { @@ -227,13 +233,29 @@ public function getAncestor($id) return null; } - public function hasNextSibling() + /** + * Checks if the current node has a next sibling. 
+ * + * @return bool + */ + public function hasNextSibling(): bool { - if (is_null($this->parent) || (!$this->parent->hasChildren())) { + try + { + $sibling = $this->nextSibling(); + // sibling found, return true; + return true; + } + catch (ParentNotFoundException $e) + { + // no parent, no next sibling + return false; + } + catch (ChildNotFoundException $e) + { + // no sibling found return false; } - - return $this->parent->hasNextChild($this->id()); } /** @@ -242,7 +264,7 @@ public function hasNextSibling() * @return AbstractNode * @throws ParentNotFoundException */ - public function nextSibling() + public function nextSibling(): AbstractNode { if (is_null($this->parent)) { throw new ParentNotFoundException('Parent is not set for this node.'); @@ -257,7 +279,7 @@ public function nextSibling() * @return AbstractNode * @throws ParentNotFoundException */ - public function previousSibling() + public function previousSibling(): AbstractNode { if (is_null($this->parent)) { throw new ParentNotFoundException('Parent is not set for this node.'); @@ -271,7 +293,7 @@ public function previousSibling() * * @return Tag */ - public function getTag() + public function getTag(): Tag { return $this->tag; } @@ -282,7 +304,7 @@ public function getTag() * * @return array */ - public function getAttributes() + public function getAttributes(): array { $attributes = $this->tag->getAttributes(); foreach ($attributes as $name => $info) { @@ -299,7 +321,7 @@ public function getAttributes() * @param string $key * @return mixed */ - public function getAttribute($key) + public function getAttribute(string $key) { $attribute = $this->tag->getAttribute($key); if ( ! is_null($attribute)) { @@ -316,7 +338,7 @@ public function getAttribute($key) * @param string $key * @return bool */ - public function hasAttribute($key) + public function hasAttribute(string $key): bool { return $this->tag->hasAttribute($key); } @@ -326,10 +348,11 @@ public function hasAttribute($key) * on the tag of this node. 
* * @param string $key - * @param string $value - * @return $this + * @param string|null $value + * @return AbstractNode + * @chainable */ - public function setAttribute($key, $value) + public function setAttribute(string $key, $value): AbstractNode { $this->tag->setAttribute($key, $value); @@ -346,7 +369,7 @@ public function setAttribute($key, $value) * @param string $key * @return void */ - public function removeAttribute($key) + public function removeAttribute(string $key): void { $this->tag->removeAttribute($key); @@ -360,7 +383,7 @@ public function removeAttribute($key) * * @return void */ - public function removeAllAttributes() + public function removeAllAttributes(): void { $this->tag->removeAllAttributes(); @@ -374,7 +397,7 @@ public function removeAllAttributes() * @return AbstractNode * @throws ParentNotFoundException */ - public function ancestorByTag($tag) + public function ancestorByTag(string $tag): AbstractNode { // Start by including ourselves in the comparison. $node = $this; @@ -397,7 +420,7 @@ public function ancestorByTag($tag) * @param int $nth * @return array|AbstractNode */ - public function find($selector, $nth = null) + public function find(string $selector, int $nth = null) { $selector = new Selector($selector); $nodes = $selector->find($this); @@ -417,10 +440,10 @@ public function find($selector, $nth = null) /** * Find node by id * - * @param $id + * @param int $id * @return bool|AbstractNode */ - public function findById($id) + public function findById(int $id) { $finder= new Finder($id); @@ -433,7 +456,7 @@ public function findById($id) * * @return string */ - abstract public function innerHtml(); + abstract public function innerHtml(): string; /** * Gets the html of this node, including it's own @@ -441,14 +464,14 @@ abstract public function innerHtml(); * * @return string */ - abstract public function outerHtml(); + abstract public function outerHtml(): string; /** * Gets the text of this node (if there is any text). 
* * @return string */ - abstract public function text(); + abstract public function text(): string; /** * Call this when something in the node tree has changed. Like a child has been added @@ -456,14 +479,15 @@ abstract public function text(); * * @return void */ - abstract protected function clear(); + abstract protected function clear(): void; /** * Check is node type textNode * * @return boolean */ - public function isTextNode() { + public function isTextNode(): bool + { return false; } diff --git a/src/PHPHtmlParser/Dom/ArrayNode.php b/src/PHPHtmlParser/Dom/ArrayNode.php index 5ced08ed..d1549f43 100644 --- a/src/PHPHtmlParser/Dom/ArrayNode.php +++ b/src/PHPHtmlParser/Dom/ArrayNode.php @@ -17,7 +17,7 @@ abstract class ArrayNode extends AbstractNode implements IteratorAggregate, Coun * * @return ArrayIterator */ - public function getIterator() + public function getIterator(): ArrayIterator { return new ArrayIterator($this->getIteratorArray()); } @@ -27,7 +27,7 @@ public function getIterator() * * @return int */ - public function count() + public function count(): int { return count($this->getIteratorArray()); } @@ -37,5 +37,5 @@ public function count() * * @return array */ - abstract protected function getIteratorArray(); + abstract protected function getIteratorArray(): array; } diff --git a/src/PHPHtmlParser/Dom/Collection.php b/src/PHPHtmlParser/Dom/Collection.php index 70532e45..f2733b8d 100644 --- a/src/PHPHtmlParser/Dom/Collection.php +++ b/src/PHPHtmlParser/Dom/Collection.php @@ -31,7 +31,7 @@ class Collection implements IteratorAggregate, ArrayAccess, Countable * @return mixed; * @throws EmptyCollectionException */ - public function __call($method, $arguments) + public function __call(string $method, array $arguments) { $node = reset($this->collection); if ($node instanceof AbstractNode) { @@ -66,7 +66,7 @@ public function __get($key) * @return string * @throws EmptyCollectionException */ - public function __toString() + public function __toString(): 
string { $node = reset($this->collection); if ($node instanceof AbstractNode) { @@ -81,7 +81,7 @@ public function __toString() * * @return int */ - public function count() + public function count(): int { return count($this->collection); } @@ -91,7 +91,7 @@ public function count() * * @return ArrayIterator */ - public function getIterator() + public function getIterator(): ArrayIterator { return new ArrayIterator($this->collection); } @@ -102,7 +102,7 @@ public function getIterator() * @param mixed $offset * @param mixed $value */ - public function offsetSet($offset, $value) + public function offsetSet($offset, $value): void { if (is_null($offset)) { $this->collection[] = $value; @@ -117,7 +117,7 @@ public function offsetSet($offset, $value) * @param mixed $offset * @return bool */ - public function offsetExists($offset) + public function offsetExists($offset): bool { return isset($this->collection[$offset]); } @@ -127,7 +127,7 @@ public function offsetExists($offset) * * @param mixed $offset */ - public function offsetUnset($offset) + public function offsetUnset($offset): void { unset($this->collection[$offset]); } @@ -148,7 +148,7 @@ public function offsetGet($offset) * * @return array */ - public function toArray() + public function toArray(): array { return $this->collection; } @@ -157,9 +157,9 @@ public function toArray() * Similar to jQuery "each" method. Calls the callback with each * Node in this collection. * - * @param callback $callback + * @param callable $callback */ - public function each($callback) + public function each(callable $callback) { foreach ($this->collection as $key => $value) { $callback($value, $key); diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index ceb024ed..87be6c52 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -42,7 +42,7 @@ class HtmlNode extends InnerNode /** * Sets up the tag of this node. 
* - * @param $tag + * @param string|Tag $tag */ public function __construct($tag) { @@ -59,7 +59,7 @@ public function __construct($tag) * @return string * @throws UnknownChildTypeException */ - public function innerHtml() + public function innerHtml(): string { if ( ! $this->hasChildren()) { // no children @@ -104,7 +104,7 @@ public function innerHtml() * * @return string */ - public function outerHtml() + public function outerHtml(): string { // special handling for root if ($this->tag->name() == 'root') { @@ -141,7 +141,7 @@ public function outerHtml() * @param bool $lookInChildren * @return string */ - public function text($lookInChildren = false) + public function text(bool $lookInChildren = false): string { if ($lookInChildren) { if ( ! is_null($this->textWithChildren)) { @@ -181,7 +181,7 @@ public function text($lookInChildren = false) * Call this when something in the node tree has changed. Like a child has been added * or a parent has been changed. */ - protected function clear() + protected function clear(): void { $this->innerHtml = null; $this->outerHtml = null; @@ -197,7 +197,7 @@ protected function clear() * * @return array */ - protected function getIteratorArray() + protected function getIteratorArray(): array { return $this->getChildren(); } diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 1473903e..2c521f70 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -27,7 +27,7 @@ abstract class InnerNode extends ArrayNode * @param Encode $encode * @return void */ - public function propagateEncoding(Encode $encode) + public function propagateEncoding(Encode $encode): void { $this->encode = $encode; $this->tag->setEncoding($encode); @@ -44,7 +44,7 @@ public function propagateEncoding(Encode $encode) * * @return bool */ - public function hasChildren() + public function hasChildren(): bool { return ! 
empty($this->children); } @@ -56,7 +56,7 @@ public function hasChildren() * @return AbstractNode * @throws ChildNotFoundException */ - public function getChild($id) + public function getChild(int $id): AbstractNode { if ( ! isset($this->children[$id])) { throw new ChildNotFoundException("Child '$id' not found in this node."); @@ -70,7 +70,7 @@ public function getChild($id) * * @return array */ - public function getChildren() + public function getChildren(): array { $nodes = []; try { @@ -91,7 +91,7 @@ public function getChildren() * * @return int */ - public function countChildren() + public function countChildren(): int { return count($this->children); } @@ -101,10 +101,11 @@ public function countChildren() * parent. * * @param AbstractNode $child + * @param Int $before * @return bool * @throws CircularException */ - public function addChild(AbstractNode $child, $before = null) + public function addChild(AbstractNode $child, int $before = -1): bool { $key = null; @@ -126,7 +127,7 @@ public function addChild(AbstractNode $child, $before = null) return false; } - if ($before) { + if ($before >= 0) { if (!isset($this->children[$before])) { return false; } @@ -177,21 +178,23 @@ public function addChild(AbstractNode $child, $before = null) * Insert element before child with provided id * * @param AbstractNode $child - * @return bool * @param int $id + * @return bool */ - public function insertBefore(AbstractNode $child, $id){ - $this->addChild($child, $id); + public function insertBefore(AbstractNode $child, int $id): bool + { + return $this->addChild($child, $id); } /** * Insert element before after with provided id * * @param AbstractNode $child - * @return bool * @param int $id + * @return bool */ - public function insertAfter(AbstractNode $child, $id){ + public function insertAfter(AbstractNode $child, int $id): bool + { if (!isset($this->children[$id])) { return false; } @@ -207,9 +210,10 @@ public function insertAfter(AbstractNode $child, $id){ * Removes the 
child by id. * * @param int $id - * @return $this + * @return InnerNode + * @chainable */ - public function removeChild($id) + public function removeChild(int $id): InnerNode { if ( ! isset($this->children[$id])) { return $this; @@ -237,10 +241,10 @@ public function removeChild($id) /** * Check if has next Child * - * @param $id childId + * @param int $id * @return mixed */ - public function hasNextChild($id) + public function hasNextChild(int $id) { $child= $this->getChild($id); return $this->children[$child->id()]['next']; @@ -254,10 +258,13 @@ public function hasNextChild($id) * @uses $this->getChild() * @throws ChildNotFoundException */ - public function nextChild($id) + public function nextChild(int $id): AbstractNode { $child = $this->getChild($id); $next = $this->children[$child->id()]['next']; + if (is_null($next)) { + throw new ChildNotFoundException("Child '$id' next not found in this node."); + } return $this->getChild($next); } @@ -270,10 +277,13 @@ public function nextChild($id) * @uses $this->getChild() * @throws ChildNotFoundException */ - public function previousChild($id) + public function previousChild(int $id): AbstractNode { $child = $this->getchild($id); $next = $this->children[$child->id()]['prev']; + if (is_null($next)) { + throw new ChildNotFoundException("Child '$id' previous not found in this node."); + } return $this->getChild($next); } @@ -285,7 +295,7 @@ public function previousChild($id) * @param int $id * @return bool */ - public function isChild($id) + public function isChild(int $id): bool { foreach ($this->children as $childId => $child) { if ($id == $childId) { @@ -303,8 +313,9 @@ public function isChild($id) * @param int $childId * @param AbstractNode $newChild * @throws ChildNotFoundException + * @return void */ - public function replaceChild($childId, AbstractNode $newChild) + public function replaceChild(int $childId, AbstractNode $newChild): void { $oldChild = $this->children[$childId]; @@ -336,7 +347,7 @@ public function 
replaceChild($childId, AbstractNode $newChild) * @return AbstractNode * @uses $this->getChild() */ - public function firstChild() + public function firstChild(): AbstractNode { reset($this->children); $key = key($this->children); @@ -349,7 +360,7 @@ public function firstChild() * * @return AbstractNode */ - public function lastChild() + public function lastChild(): AbstractNode { end($this->children); $key = key($this->children); @@ -364,7 +375,7 @@ public function lastChild() * @param int $id * @return bool */ - public function isDescendant($id) + public function isDescendant(int $id): bool { if ($this->isChild($id)) { return true; @@ -388,10 +399,11 @@ public function isDescendant($id) * Sets the parent node. * * @param InnerNode $parent - * @return $this + * @return AbstractNode * @throws CircularException + * @chainable */ - public function setParent(InnerNode $parent) + public function setParent(InnerNode $parent): AbstractNode { // check integrity if ($this->isDescendant($parent->id())) { diff --git a/src/PHPHtmlParser/Dom/MockNode.php b/src/PHPHtmlParser/Dom/MockNode.php index ab7ccfe2..5f18eb44 100644 --- a/src/PHPHtmlParser/Dom/MockNode.php +++ b/src/PHPHtmlParser/Dom/MockNode.php @@ -14,28 +14,28 @@ class MockNode extends InnerNode /** * Mock of innner html. */ - public function innerHtml() + public function innerHtml(): string { } /** * Mock of outer html. */ - public function outerHtml() + public function outerHtml(): string { } /** * Mock of text. 
*/ - public function text() + public function text(): string { } /** * Clear content of this node */ - protected function clear() + protected function clear(): void { $this->innerHtml = null; $this->outerHtml = null; @@ -50,7 +50,7 @@ protected function clear() * * @return array */ - protected function getIteratorArray() + protected function getIteratorArray(): array { return $this->getChildren(); } diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index e3d9e109..6d6e1071 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -57,7 +57,7 @@ class Tag * * @param $name */ - public function __construct($name) + public function __construct(string $name) { $this->name = $name; } @@ -89,7 +89,7 @@ public function __set($key, $value) * * @return string */ - public function name() + public function name(): string { return $this->name; } @@ -97,9 +97,10 @@ public function name() /** * Sets the tag to be self closing. * - * @return $this + * @return Tag + * @chainable */ - public function selfClosing() + public function selfClosing(): Tag { $this->selfClosing = true; @@ -110,9 +111,10 @@ public function selfClosing() /** * Sets the tag to not use a trailing slash. * - * @return $this + * @return Tag + * @chainable */ - public function noTrailingSlash() + public function noTrailingSlash(): Tag { $this->trailingSlash = false; @@ -124,7 +126,7 @@ public function noTrailingSlash() * * @return bool */ - public function isSelfClosing() + public function isSelfClosing(): bool { return $this->selfClosing; } @@ -133,8 +135,9 @@ public function isSelfClosing() * Sets the encoding type to be used. 
* * @param Encode $encode + * @return void */ - public function setEncoding(Encode $encode) + public function setEncoding(Encode $encode): void { $this->encode = $encode; } @@ -142,10 +145,11 @@ public function setEncoding(Encode $encode) /** * Sets the noise for this tag (if any) * - * @param $noise - * @return $this + * @param string $noise + * @return Tag + * @chainable */ - public function noise($noise) + public function noise(string $noise): Tag { $this->noise = $noise; @@ -157,9 +161,10 @@ public function noise($noise) * * @param string $key * @param string|array $value - * @return $this + * @return Tag + * @chainable */ - public function setAttribute($key, $value) + public function setAttribute(string $key, $value): Tag { $key = strtolower($key); if ( ! is_array($value)) { @@ -176,10 +181,10 @@ public function setAttribute($key, $value) /** * Set inline style attribute value. * - * @param $attr_key - * @param $attr_value + * @param mixed $attr_key + * @param mixed $attr_value */ - public function setStyleAttributeValue($attr_key, $attr_value) + public function setStyleAttributeValue($attr_key, $attr_value): void { $style_array = $this->getStyleAttributeArray(); @@ -196,14 +201,14 @@ public function setStyleAttributeValue($attr_key, $attr_value) /** * Get style attribute in array * - * @return array|null + * @return array */ - public function getStyleAttributeArray() + public function getStyleAttributeArray(): array { $value = $this->getAttribute('style')['value']; if ($value === null) { - return null; + return []; } $value = explode(';', substr(trim($value), 0, -1)); @@ -221,7 +226,7 @@ public function getStyleAttributeArray() /** * Removes an attribute from this tag. * - * @param $key + * @param mixed $key * @return void */ public function removeAttribute($key) @@ -276,7 +281,7 @@ public function getAttributes() * @param string $key * @return mixed */ - public function getAttribute($key) + public function getAttribute(string $key) { if ( ! 
isset($this->attr[$key])) { return null; @@ -296,7 +301,7 @@ public function getAttribute($key) * @param string $key * @return bool */ - public function hasAttribute($key) + public function hasAttribute(string $key) { return isset($this->attr[$key]); } diff --git a/src/PHPHtmlParser/Dom/TextNode.php b/src/PHPHtmlParser/Dom/TextNode.php index c4c65c28..a389b061 100644 --- a/src/PHPHtmlParser/Dom/TextNode.php +++ b/src/PHPHtmlParser/Dom/TextNode.php @@ -34,8 +34,9 @@ class TextNode extends LeafNode * Sets the text for this node. * * @param string $text + * @param bool $removeDoubleSpace */ - public function __construct($text, $removeDoubleSpace = true) + public function __construct(string $text, $removeDoubleSpace = true) { if ($removeDoubleSpace) { // remove double spaces @@ -55,7 +56,7 @@ public function __construct($text, $removeDoubleSpace = true) * * @return string */ - public function text() + public function text(): string { // convert charset if ( ! is_null($this->encode)) { @@ -80,7 +81,7 @@ public function text() * @var string $text * @return void */ - public function setText($text) + public function setText(string $text): void { $this->text = $text; @@ -98,7 +99,7 @@ public function setText($text) * @return string * @uses $this->text() */ - public function innerHtml() + public function innerHtml(): string { return $this->text(); } @@ -109,7 +110,7 @@ public function innerHtml() * @return string * @uses $this->text() */ - public function outerHtml() + public function outerHtml(): string { return $this->text(); } @@ -118,7 +119,7 @@ public function outerHtml() * Call this when something in the node tree has changed. Like a child has been added * or a parent has been changed. 
*/ - protected function clear() + protected function clear(): void { $this->convertedText = null; } @@ -128,7 +129,7 @@ protected function clear() * * @return bool */ - public function isTextNode() + public function isTextNode(): bool { return true; } diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index 9baf6c09..c1587576 100644 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -59,9 +59,10 @@ public function __get($key) * Sets a new options param to override the current option array. * * @param array $options - * @return $this + * @return Options + * @chainable */ - public function setOptions(array $options) + public function setOptions(array $options): Options { foreach ($options as $key => $option) { $this->options[$key] = $option; @@ -77,7 +78,7 @@ public function setOptions(array $options) * @param string * @return mixed */ - public function get($key) + public function get(string $key) { if (isset($this->options[$key])) { return $this->options[$key]; diff --git a/src/PHPHtmlParser/Selector.php b/src/PHPHtmlParser/Selector.php index e327ad05..608d973f 100644 --- a/src/PHPHtmlParser/Selector.php +++ b/src/PHPHtmlParser/Selector.php @@ -52,7 +52,7 @@ public function getSelectors() * @param AbstractNode $node * @return Collection */ - public function find(AbstractNode $node) + public function find(AbstractNode $node): Collection { $results = new Collection; foreach ($this->selectors as $selector) { @@ -86,7 +86,7 @@ public function find(AbstractNode $node) * * @param string $selector */ - protected function parseSelectorString($selector) + protected function parseSelectorString(string $selector): void { $matches = []; preg_match_all($this->pattern, trim($selector).' 
', $matches, PREG_SET_ORDER); @@ -166,7 +166,7 @@ protected function parseSelectorString($selector) * @return array * @recursive */ - protected function seek(array $nodes, array $rule, array $options) + protected function seek(array $nodes, array $rule, array $options): array { // XPath index if (array_key_exists('tag', $rule) && @@ -316,7 +316,7 @@ protected function seek(array $nodes, array $rule, array $options) * @param string $value * @return bool */ - protected function match($operator, $pattern, $value) + protected function match(string $operator, string $pattern, string $value): bool { $value = strtolower($value); $pattern = strtolower($pattern); @@ -347,7 +347,7 @@ protected function match($operator, $pattern, $value) * @param array $rule * @return array */ - protected function alterNext($rule) + protected function alterNext(array $rule): array { $options = []; if ($rule['tag'] == '>') { diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index 2d41d4d4..c77a86e1 100644 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -22,7 +22,7 @@ final class StaticDom * @throws NotLoadedException * @return mixed */ - public static function __callStatic($method, $arguments) + public static function __callStatic(string $method, array $arguments) { if (self::$dom instanceof Dom) { return call_user_func_array([self::$dom, $method], $arguments); @@ -39,7 +39,7 @@ public static function __callStatic($method, $arguments) * @param Dom $dom * @return bool */ - public static function mount($className = 'Dom', Dom $dom = null) + public static function mount(string $className = 'Dom', Dom $dom = null): bool { if (class_exists($className)) { return false; @@ -57,9 +57,9 @@ class_alias(__CLASS__, $className); * new object. 
* * @param string $str - * @return $this + * @return Dom */ - public static function load($str) + public static function load(string $str): Dom { $dom = new Dom; self::$dom = $dom; @@ -72,9 +72,9 @@ public static function load($str) * new object. * * @param string $file - * @return $this + * @return Dom */ - public static function loadFromFile($file) + public static function loadFromFile(string $file): Dom { $dom = new Dom; self::$dom = $dom; @@ -89,9 +89,9 @@ public static function loadFromFile($file) * @param string $url * @param array $options * @param CurlInterface $curl - * @return $this + * @return Dom */ - public static function loadFromUrl($url, $options = [], CurlInterface $curl = null) + public static function loadFromUrl(string $url, array $options = [], CurlInterface $curl = null): Dom { $dom = new Dom; self::$dom = $dom; @@ -106,7 +106,7 @@ public static function loadFromUrl($url, $options = [], CurlInterface $curl = nu /** * Sets the $dom variable to null. */ - public static function unload() + public static function unload(): void { self::$dom = null; } From 8b759abeea51df22f6a6cc473039758d95aceea4 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 20 Jan 2019 21:00:26 -0500 Subject: [PATCH 102/200] Updated version to 2.0.0 --- README.md | 2 +- composer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5826d2c2..28cccfe3 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 1.7.0 +Version 2.0.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) diff --git a/composer.json b/composer.json index 885b470a..20b4ed76 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,7 @@ { "name": "paquettg/php-html-parser", "type": "library", - "version": 
"1.7.0", + "version": "2.0.0", "description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.", "keywords": ["html", "dom", "parser"], "homepage": "https://github.com/paquettg/php-html-parser", From 7b9ec6d8eb28d1676b46439d8007bd00ef79346d Mon Sep 17 00:00:00 2001 From: Patrick Date: Sat, 2 Feb 2019 18:18:50 +0100 Subject: [PATCH 103/200] Update CurlInterface.php (#162) Fix PHPHtmlParser\CurlInterface::get() compatible issue --- src/PHPHtmlParser/CurlInterface.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/CurlInterface.php b/src/PHPHtmlParser/CurlInterface.php index 622857ec..c05c3f98 100644 --- a/src/PHPHtmlParser/CurlInterface.php +++ b/src/PHPHtmlParser/CurlInterface.php @@ -15,5 +15,5 @@ interface CurlInterface * @param string $url * @return string */ - public function get($url): string; + public function get(string $url): string; } From 5281c08bf27729e48634af11ca3bcddddda66b2b Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 2 Feb 2019 12:25:36 -0500 Subject: [PATCH 104/200] Added test for issue #164 --- tests/DomTest.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/DomTest.php b/tests/DomTest.php index 3cd0df15..7048eb4f 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -389,4 +389,12 @@ public function testWhitespaceInText() $dom->load('
    Hello world
'); $this->assertEquals('
    Hello world
', (string) $dom); } + + public function testGetComplexAttribute() + { + $dom = new Dom; + $dom->load('Next >'); + $href = $dom->find('a', 0)->href; + $this->assertEquals('?search=Fort+William&session_type=face&distance=100&uqs=119846&page=4', $href); + } } From ed035322cbc699f928372691c8c0c544fb9b4710 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 2 Feb 2019 12:47:12 -0500 Subject: [PATCH 105/200] Removed old child in replace child method --- src/PHPHtmlParser/Dom/InnerNode.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 2c521f70..0fb20411 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -332,13 +332,18 @@ public function replaceChild(int $childId, AbstractNode $newChild): void 'next' => $oldChild['next'] ); + // chnge previous child id to new child if ($oldChild['prev'] && isset($this->children[$newChild->prev])) { $this->children[$oldChild['prev']]['next'] = $newChild->id(); } + // change next child id to new child if ($oldChild['next'] && isset($this->children[$newChild->next])) { $this->children[$oldChild['next']]['prev'] = $newChild->id(); } + + // remove old child + unset($this->children[$childId]); } /** From eb40c3e3535670cc34f8dfcfcb99c90a51d872dc Mon Sep 17 00:00:00 2001 From: Yavor Kirov Date: Thu, 22 Mar 2018 18:47:32 +0200 Subject: [PATCH 106/200] Fixed some typos. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 28cccfe3..b5861229 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Version 2.0.0 [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/paquettg/php-html-parser/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/paquettg/php-html-parser/?branch=master) -PHPHtmlParser is a simple, flexible, html parser which allows you to select tags using any css selector, like jQuery. The goal is to assist in the development of tools which require a quick, easy way to scrap html, whether it's valid or not! This project was original supported by [sunra/php-simple-html-dom-parser](https://github.com/sunra/php-simple-html-dom-parser) but the support seems to have stopped so this project is my adaptation of his previous work. +PHPHtmlParser is a simple, flexible, html parser which allows you to select tags using any css selector, like jQuery. The goal is to assist in the development of tools which require a quick, easy way to scrap html, whether it's valid or not! 
Install ------- From 696f448bb68046b2362abb186857614f8f48c88a Mon Sep 17 00:00:00 2001 From: thenotsoft <44147615+thenotsoft@users.noreply.github.com> Date: Sat, 2 Feb 2019 19:52:54 +0200 Subject: [PATCH 107/200] fixed compatible type (#165) From 9b6024376ce98767f33f2d070b417778304d5427 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 2 Feb 2019 12:59:46 -0500 Subject: [PATCH 108/200] Fix #85 --- src/PHPHtmlParser/Dom/HtmlNode.php | 1 + src/PHPHtmlParser/Dom/InnerNode.php | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 87be6c52..81b9a14e 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -186,6 +186,7 @@ protected function clear(): void $this->innerHtml = null; $this->outerHtml = null; $this->text = null; + $this->textWithChildren = null; if (is_null($this->parent) === false) { $this->parent->clear(); diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 0fb20411..f08704ef 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -203,6 +203,9 @@ public function insertAfter(AbstractNode $child, int $id): bool return $this->addChild($child, $this->children[$id]['next']); } + // clear cache + $this->clear(); + return $this->addChild($child); } @@ -344,6 +347,9 @@ public function replaceChild(int $childId, AbstractNode $newChild): void // remove old child unset($this->children[$childId]); + + // clean out cache + $this->clear(); } /** @@ -415,6 +421,9 @@ public function setParent(InnerNode $parent): AbstractNode throw new CircularException('Can not add descendant "'.$parent->id().'" as my parent.'); } + // clear cache + $this->clear(); + return parent::setParent($parent); } } From d0c27a20065597217f755d82a543379f9d389664 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 2 Feb 2019 13:09:10 -0500 Subject: [PATCH 109/200] Update scrytinizer --- 
.scrutinizer.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 7c9a4375..8b3532b9 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -33,3 +33,9 @@ tools: php_cpd: enabled: true excluded_dirs: [vendor, test] +build: + nodes: + analysis: + tests: + override: + - php-scrutinizer-run From 39b1babf265fb73f17c9ed1b11ee8ef890c935f5 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 2 Feb 2019 13:44:46 -0500 Subject: [PATCH 110/200] Cleaned up code base and added new tests --- src/PHPHtmlParser/Dom/AbstractNode.php | 14 +++++++++--- src/PHPHtmlParser/Dom/Collection.php | 4 ++-- src/PHPHtmlParser/Options.php | 7 +++++- tests/CollectionTest.php | 30 ++++++++++++++++++++++++++ tests/Node/TextTest.php | 2 +- 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index fbd1c906..ec7dc24e 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -44,7 +44,7 @@ abstract class AbstractNode /** * The unique id of the class. Given by PHP. * - * @var string + * @var int */ protected $id; @@ -55,6 +55,13 @@ abstract class AbstractNode */ protected $encode; + /** + * An array of all the children. + * + * @var array + */ + protected $children = []; + /** * Creates a unique id for this node. 
*/ @@ -242,7 +249,8 @@ public function hasNextSibling(): bool { try { - $sibling = $this->nextSibling(); + $this->nextSibling(); + // sibling found, return true; return true; } @@ -418,7 +426,7 @@ public function ancestorByTag(string $tag): AbstractNode * * @param string $selector * @param int $nth - * @return array|AbstractNode + * @return mixed */ public function find(string $selector, int $nth = null) { diff --git a/src/PHPHtmlParser/Dom/Collection.php b/src/PHPHtmlParser/Dom/Collection.php index f2733b8d..baf1b657 100644 --- a/src/PHPHtmlParser/Dom/Collection.php +++ b/src/PHPHtmlParser/Dom/Collection.php @@ -28,7 +28,7 @@ class Collection implements IteratorAggregate, ArrayAccess, Countable * * @param string $method * @param array $arguments - * @return mixed; + * @return mixed * @throws EmptyCollectionException */ public function __call(string $method, array $arguments) @@ -72,7 +72,7 @@ public function __toString(): string if ($node instanceof AbstractNode) { return (string)$node; } else { - throw new EmptyCollectionException('The collection does not contain any Nodes.'); + return ''; } } diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index c1587576..3d8d22ce 100644 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -7,7 +7,12 @@ * @package PHPHtmlParser * @property bool whitespaceTextNode * @property bool strict - * @property bool enforceEncoding + * @property string|null enforceEncoding + * @property bool cleanupInput + * @property bool removeScripts + * @property bool removeStyles + * @property bool preserveLineBreaks + * @property bool removeDoubleSpace */ class Options { diff --git a/tests/CollectionTest.php b/tests/CollectionTest.php index 8e9f1c59..7325b656 100644 --- a/tests/CollectionTest.php +++ b/tests/CollectionTest.php @@ -39,6 +39,13 @@ public function testCallNoNodes() $collection->innerHtml(); } + public function testNoNodeString() + { + $collection = new Collection(); + $string = (string) 
$collection; + $this->assertEmpty($string); + } + public function testCallMagic() { $root = new HtmlNode(new Tag('root')); @@ -114,4 +121,27 @@ public function testToArray() $lastA = end($array); $this->assertEquals($child3->id(), $lastA->id()); } + + public function testGetIterator() + { + $collection = new Collection(); + $iterator = $collection->getIterator(); + $this->assertTrue($iterator instanceof \ArrayIterator); + + } + + public function testOffsetSet() + { + $collection = new Collection(); + $collection->offsetSet(7, true); + $this->assertTrue($collection->offsetGet(7)); + } + + public function testOffsetUnset() + { + $collection = new Collection(); + $collection->offsetSet(7, true); + $collection->offsetUnset(7); + $this->assertTrue(is_null($collection->offsetGet(7))); + } } diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 7fad611d..e04b9ee2 100644 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -30,7 +30,7 @@ public function testAncestorByTag() public function testPreserveEntity() { $node = new TextNode('i'); - $text = $node->innerhtml; + $text = $node->outerhtml; $this->assertEquals('i', $text); } From 268bdcb83acc2663ecf6e622f814a6553e61fd5a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 2 Feb 2019 13:50:42 -0500 Subject: [PATCH 111/200] 2.0.1 --- README.md | 2 +- composer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b5861229..64f54e5f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 2.0.0 +Version 2.0.1 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) diff --git a/composer.json b/composer.json index 20b4ed76..bdf30c83 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,7 @@ { "name": 
"paquettg/php-html-parser", "type": "library", - "version": "2.0.0", + "version": "2.0.1", "description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.", "keywords": ["html", "dom", "parser"], "homepage": "https://github.com/paquettg/php-html-parser", From 45cdfb4f26256c03f9f0e1a836d7c7fff237e886 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 3 Feb 2019 22:06:13 -0500 Subject: [PATCH 112/200] Fixed issue with return type in Dom and Selector --- src/PHPHtmlParser/Dom.php | 4 ++-- src/PHPHtmlParser/Selector.php | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index cc88281d..a8c517be 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -419,9 +419,9 @@ public function hasChildren(): bool * id. * * @param string $id - * @return \PHPHtmlParser\Dom\AbstractNode + * @return \PHPHtmlParser\Dom\AbstractNode|null */ - public function getElementById($id): \PHPHtmlParser\Dom\AbstractNode + public function getElementById($id) { $this->isLoaded(); diff --git a/src/PHPHtmlParser/Selector.php b/src/PHPHtmlParser/Selector.php index 608d973f..dbf9207c 100644 --- a/src/PHPHtmlParser/Selector.php +++ b/src/PHPHtmlParser/Selector.php @@ -326,15 +326,15 @@ protected function match(string $operator, string $pattern, string $value): bool case '!=': return $value !== $pattern; case '^=': - return preg_match('/^'.preg_quote($pattern, '/').'/', $value); + return preg_match('/^'.preg_quote($pattern, '/').'/', $value) == 1; case '$=': - return preg_match('/'.preg_quote($pattern, '/').'$/', $value); + return preg_match('/'.preg_quote($pattern, '/').'$/', $value) == 1; case '*=': if ($pattern[0] == '/') { - return preg_match($pattern, $value); + return preg_match($pattern, $value) == 1; } - return preg_match("/".$pattern."/i", $value); + return preg_match("/".$pattern."/i", $value) == 1; } return false; From 
c6f67128bdce30d2784cd73c0cd66afd4ebfd7ce Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 4 Feb 2019 18:47:49 -0500 Subject: [PATCH 113/200] Fix issue #166 --- src/PHPHtmlParser/Dom.php | 2 +- src/PHPHtmlParser/Dom/InnerNode.php | 13 +++++++++++++ tests/DomTest.php | 10 ++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index a8c517be..dbc3741a 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -192,7 +192,7 @@ public function loadFromUrl(string $url, array $options = [], CurlInterface $cur * @return Dom * @chainable */ - public function loadStr(string $str, array $option): Dom + public function loadStr(string $str, array $option = []): Dom { $this->options = new Options; $this->options->setOptions($this->globalOptions) diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index f08704ef..63093338 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -357,9 +357,15 @@ public function replaceChild(int $childId, AbstractNode $newChild): void * * @return AbstractNode * @uses $this->getChild() + * @throws ChildNotFoundException */ public function firstChild(): AbstractNode { + if (count($this->children) == 0) { + // no children + throw new ChildNotFoundException("No children found in node."); + } + reset($this->children); $key = key($this->children); @@ -370,9 +376,16 @@ public function firstChild(): AbstractNode * Attempts to get the last child. 
* * @return AbstractNode + * @uses $this->getChild() + * @throws ChildNotFoundException */ public function lastChild(): AbstractNode { + if (count($this->children) == 0) { + // no children + throw new ChildNotFoundException("No children found in node."); + } + end($this->children); $key = key($this->children); diff --git a/tests/DomTest.php b/tests/DomTest.php index 7048eb4f..bff1a2da 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -397,4 +397,14 @@ public function testGetComplexAttribute() $href = $dom->find('a', 0)->href; $this->assertEquals('?search=Fort+William&session_type=face&distance=100&uqs=119846&page=4', $href); } + + public function testGetChildrenNoChildren() + { + $dom = new Dom(); + $dom->loadStr('
Test
'); + + $imgNode = $dom->root->find('img'); + $children = $imgNode->getChildren(); + $this->assertTrue(count($children) === 0); + } } From 8bf35cd70184014a384cb19dc8ccc10363f0332d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 4 Feb 2019 19:48:54 -0500 Subject: [PATCH 114/200] Refactored the Selector class --- src/PHPHtmlParser/Dom/AbstractNode.php | 5 +- src/PHPHtmlParser/Selector/Parser.php | 97 +++++++++++++++++++ .../Selector/ParserInterface.php | 7 ++ src/PHPHtmlParser/{ => Selector}/Selector.php | 87 +---------------- tests/CollectionTest.php | 13 +-- tests/{ => Selector}/SelectorTest.php | 33 ++++--- 6 files changed, 135 insertions(+), 107 deletions(-) create mode 100644 src/PHPHtmlParser/Selector/Parser.php create mode 100644 src/PHPHtmlParser/Selector/ParserInterface.php rename src/PHPHtmlParser/{ => Selector}/Selector.php (77%) rename tests/{ => Selector}/SelectorTest.php (86%) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index ec7dc24e..425f3803 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -4,7 +4,8 @@ use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\ParentNotFoundException; use PHPHtmlParser\Exceptions\ChildNotFoundException; -use PHPHtmlParser\Selector; +use PHPHtmlParser\Selector\Selector; +use PHPHtmlParser\Selector\Parser as SelectorParser; use stringEncode\Encode; use PHPHtmlParser\Finder; @@ -430,7 +431,7 @@ public function ancestorByTag(string $tag): AbstractNode */ public function find(string $selector, int $nth = null) { - $selector = new Selector($selector); + $selector = new Selector($selector, new SelectorParser()); $nodes = $selector->find($this); if ( ! 
is_null($nth)) { diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php new file mode 100644 index 00000000..433d1b11 --- /dev/null +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -0,0 +1,97 @@ +]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + + /** + * Parses the selector string + * + * @param string $selector + */ + public function parseSelectorString(string $selector): array + { + $selectors = []; + + $matches = []; + preg_match_all($this->pattern, trim($selector).' ', $matches, PREG_SET_ORDER); + + // skip tbody + $result = []; + foreach ($matches as $match) { + // default values + $tag = strtolower(trim($match[1])); + $operator = '='; + $key = null; + $value = null; + $noKey = false; + $alterNext = false; + + // check for elements that alter the behavior of the next element + if ($tag == '>') { + $alterNext = true; + } + + // check for id selector + if ( ! empty($match[2])) { + $key = 'id'; + $value = $match[2]; + } + + // check for class selector + if ( ! empty($match[3])) { + $key = 'class'; + $value = $match[3]; + } + + // and final attribute selector + if ( ! empty($match[4])) { + $key = strtolower($match[4]); + } + if ( ! empty($match[5])) { + $operator = $match[5]; + } + if ( ! 
empty($match[6])) { + $value = $match[6]; + } + + // check for elements that do not have a specified attribute + if (isset($key[0]) && $key[0] == '!') { + $key = substr($key, 1); + $noKey = true; + } + + $result[] = [ + 'tag' => $tag, + 'key' => $key, + 'value' => $value, + 'operator' => $operator, + 'noKey' => $noKey, + 'alterNext' => $alterNext, + ]; + if (trim($match[7]) == ',') { + $selectors[] = $result; + $result = []; + } + } + + // save last results + if (count($result) > 0) { + $selectors[] = $result; + } + + return $selectors; + } +} diff --git a/src/PHPHtmlParser/Selector/ParserInterface.php b/src/PHPHtmlParser/Selector/ParserInterface.php new file mode 100644 index 00000000..99a074cc --- /dev/null +++ b/src/PHPHtmlParser/Selector/ParserInterface.php @@ -0,0 +1,7 @@ +]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; - protected $selectors = []; /** @@ -30,9 +25,9 @@ class Selector * * @param string $selector */ - public function __construct($selector) + public function __construct(string $selector, ParserInterface $parser) { - $this->parseSelectorString($selector); + $this->selectors = $parser->parseSelectorString($selector); } /** @@ -81,80 +76,6 @@ public function find(AbstractNode $node): Collection return $results; } - /** - * Parses the selector string - * - * @param string $selector - */ - protected function parseSelectorString(string $selector): void - { - $matches = []; - preg_match_all($this->pattern, trim($selector).' ', $matches, PREG_SET_ORDER); - - // skip tbody - $result = []; - foreach ($matches as $match) { - // default values - $tag = strtolower(trim($match[1])); - $operator = '='; - $key = null; - $value = null; - $noKey = false; - $alterNext = false; - - // check for elements that alter the behavior of the next element - if ($tag == '>') { - $alterNext = true; - } - - // check for id selector - if ( ! 
empty($match[2])) { - $key = 'id'; - $value = $match[2]; - } - - // check for class selector - if ( ! empty($match[3])) { - $key = 'class'; - $value = $match[3]; - } - - // and final attribute selector - if ( ! empty($match[4])) { - $key = strtolower($match[4]); - } - if ( ! empty($match[5])) { - $operator = $match[5]; - } - if ( ! empty($match[6])) { - $value = $match[6]; - } - - // check for elements that do not have a specified attribute - if (isset($key[0]) && $key[0] == '!') { - $key = substr($key, 1); - $noKey = true; - } - - $result[] = [ - 'tag' => $tag, - 'key' => $key, - 'value' => $value, - 'operator' => $operator, - 'noKey' => $noKey, - 'alterNext' => $alterNext, - ]; - if (trim($match[7]) == ',') { - $this->selectors[] = $result; - $result = []; - } - } - - // save last results - if (count($result) > 0) { - $this->selectors[] = $result; - } - } /** * Attempts to find all children that match the rule diff --git a/tests/CollectionTest.php b/tests/CollectionTest.php index 7325b656..2d702ecc 100644 --- a/tests/CollectionTest.php +++ b/tests/CollectionTest.php @@ -2,7 +2,8 @@ declare(strict_types=1); use PHPUnit\Framework\TestCase; -use PHPHtmlParser\Selector; +use PHPHtmlParser\Selector\Selector; +use PHPHtmlParser\Selector\Parser; use PHPHtmlParser\Dom\HtmlNode; use PHPHtmlParser\Dom\Tag; use PHPHtmlParser\Dom\Collection; @@ -21,7 +22,7 @@ public function testEach() $parent->addChild($child2); $child2->addChild($child3); - $selector = new Selector('a'); + $selector = new Selector('a', new Parser()); $collection = $selector->find($root); $count = 0; $collection->each(function ($node) use (&$count) { @@ -58,7 +59,7 @@ public function testCallMagic() $parent->addChild($child2); $child2->addChild($child3); - $selector = new Selector('div * a'); + $selector = new Selector('div * a', new Parser()); $this->assertEquals($child3->id(), $selector->find($root)->id()); } @@ -74,7 +75,7 @@ public function testGetMagic() $parent->addChild($child2); 
$child2->addChild($child3); - $selector = new Selector('div * a'); + $selector = new Selector('div * a', new Parser()); $this->assertEquals($child3->innerHtml, $selector->find($root)->innerHtml); } @@ -99,7 +100,7 @@ public function testToStringMagic() $parent->addChild($child2); $child2->addChild($child3); - $selector = new Selector('div * a'); + $selector = new Selector('div * a', new Parser()); $this->assertEquals((string)$child3, (string)$selector->find($root)); } @@ -115,7 +116,7 @@ public function testToArray() $parent->addChild($child2); $child2->addChild($child3); - $selector = new Selector('a'); + $selector = new Selector('a', new Parser()); $collection = $selector->find($root); $array = $collection->toArray(); $lastA = end($array); diff --git a/tests/SelectorTest.php b/tests/Selector/SelectorTest.php similarity index 86% rename from tests/SelectorTest.php rename to tests/Selector/SelectorTest.php index 3bf97f57..a3ae9ffc 100644 --- a/tests/SelectorTest.php +++ b/tests/Selector/SelectorTest.php @@ -2,7 +2,8 @@ declare(strict_types=1); use PHPUnit\Framework\TestCase; -use PHPHtmlParser\Selector; +use PHPHtmlParser\Selector\Selector; +use PHPHtmlParser\Selector\Parser; use PHPHtmlParser\Dom\HtmlNode; use PHPHtmlParser\Dom\Tag; @@ -10,28 +11,28 @@ class SelectorTest extends TestCase { public function testParseSelectorStringId() { - $selector = new Selector('#all'); + $selector = new Selector('#all', new Parser()); $selectors = $selector->getSelectors(); $this->assertEquals('id', $selectors[0][0]['key']); } public function testParseSelectorStringClass() { - $selector = new Selector('div.post'); + $selector = new Selector('div.post', new Parser()); $selectors = $selector->getSelectors(); $this->assertEquals('class', $selectors[0][0]['key']); } public function testParseSelectorStringAttribute() { - $selector = new Selector('div[visible=yes]'); + $selector = new Selector('div[visible=yes]', new Parser()); $selectors = $selector->getSelectors(); 
$this->assertEquals('yes', $selectors[0][0]['value']); } public function testParseSelectorStringNoKey() { - $selector = new Selector('div[!visible]'); + $selector = new Selector('div[!visible]', new Parser()); $selectors = $selector->getSelectors(); $this->assertTrue($selectors[0][0]['noKey']); } @@ -46,7 +47,7 @@ public function testFind() $parent->addChild($child2); $root->addChild($parent); - $selector = new Selector('div a'); + $selector = new Selector('div a', new Parser()); $this->assertEquals($child1->id(), $selector->find($root)[0]->id()); } @@ -64,7 +65,7 @@ public function testFindId() $parent->addChild($child1); $parent->addChild($child2); - $selector = new Selector('#content'); + $selector = new Selector('#content', new Parser()); $this->assertEquals($child2->id(), $selector->find($parent)[0]->id()); } @@ -84,7 +85,7 @@ public function testFindClass() $parent->addChild($child2); $parent->addChild($child3); - $selector = new Selector('.link'); + $selector = new Selector('.link', new Parser()); $this->assertEquals($child3->id(), $selector->find($parent)[0]->id()); } @@ -104,7 +105,7 @@ public function testFindClassMultiple() $parent->addChild($child2); $parent->addChild($child3); - $selector = new Selector('.outer'); + $selector = new Selector('.outer', new Parser()); $this->assertEquals($child3->id(), $selector->find($parent)[0]->id()); } @@ -120,7 +121,7 @@ public function testFindWild() $parent->addChild($child2); $child2->addChild($child3); - $selector = new Selector('div * a'); + $selector = new Selector('div * a', new Parser()); $this->assertEquals($child3->id(), $selector->find($root)[0]->id()); } @@ -136,7 +137,7 @@ public function testFindMultipleSelectors() $parent->addChild($child2); $child2->addChild($child3); - $selector = new Selector('a, p'); + $selector = new Selector('a, p', new Parser()); $this->assertEquals(3, count($selector->find($root))); } @@ -156,7 +157,7 @@ public function testFindXpathKeySelector() $parent->addChild($child2); 
$parent->addChild($child3); - $selector = new Selector('div[1]'); + $selector = new Selector('div[1]', new Parser()); $this->assertEquals($parent->id(), $selector->find($parent)[0]->id()); } @@ -170,7 +171,7 @@ public function testFindChildMultipleLevelsDeep() $parent->addChild($child1); $child1->addChild($child2); - $selector = new Selector('div li'); + $selector = new Selector('div li', new Parser()); $this->assertEquals(1, count($selector->find($root))); } @@ -186,7 +187,7 @@ public function testFindAllChildren() $child2->addChild($child3); $parent->addChild($child2); - $selector = new Selector('div ul'); + $selector = new Selector('div ul', new Parser()); $this->assertEquals(2, count($selector->find($root))); } @@ -202,7 +203,7 @@ public function testFindChildUsingChildSelector() $child2->addChild($child3); $parent->addChild($child2); - $selector = new Selector('div > ul'); + $selector = new Selector('div > ul', new Parser()); $this->assertEquals(1, count($selector->find($root))); } @@ -213,7 +214,7 @@ public function testFindNodeByAttributeOnly() $child1->setAttribute('custom-attr', null); $root->addChild($child1); - $selector = new Selector('[custom-attr]'); + $selector = new Selector('[custom-attr]', new Parser()); $this->assertEquals(1, count($selector->find($root))); } } From 9ce023b4c27511ce142ab66e79dfb16b920ba14a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 5 Feb 2019 18:46:20 -0500 Subject: [PATCH 115/200] Refactoring --- src/PHPHtmlParser/Content.php | 4 ++-- src/PHPHtmlParser/Dom/AbstractNode.php | 2 ++ src/PHPHtmlParser/Dom/HtmlNode.php | 2 ++ src/PHPHtmlParser/Dom/InnerNode.php | 2 +- src/PHPHtmlParser/Dom/MockNode.php | 3 +++ src/PHPHtmlParser/Finder.php | 5 +++-- 6 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 09c74d6a..0aa6b1c1 100644 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -219,7 +219,7 @@ public function 
copyByToken(string $token, bool $char = false, bool $escape = fa * * @param string $string * @param bool $copy - * @return $this|string + * @return Content|string */ public function skip(string $string, bool $copy = false) { @@ -242,7 +242,7 @@ public function skip(string $string, bool $copy = false) * * @param string $token * @param bool $copy - * @return null|string + * @return Content|string * @uses $this->skip() */ public function skipByToken(string $token, bool $copy = false) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 425f3803..575acc41 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -15,6 +15,8 @@ * @property string outerhtml * @property string innerhtml * @property string text + * @property int prev + * @property int next * @property \PHPHtmlParser\Dom\Tag tag * @property InnerNode parent */ diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 81b9a14e..79c40795 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -14,6 +14,8 @@ class HtmlNode extends InnerNode /** * Remembers what the innerHtml was if it was scanned previously. 
+ * + * @var string */ protected $innerHtml = null; diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 63093338..136f307a 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -367,7 +367,7 @@ public function firstChild(): AbstractNode } reset($this->children); - $key = key($this->children); + $key = (int) key($this->children); return $this->getChild($key); } diff --git a/src/PHPHtmlParser/Dom/MockNode.php b/src/PHPHtmlParser/Dom/MockNode.php index 5f18eb44..94f08ec0 100644 --- a/src/PHPHtmlParser/Dom/MockNode.php +++ b/src/PHPHtmlParser/Dom/MockNode.php @@ -16,6 +16,7 @@ class MockNode extends InnerNode */ public function innerHtml(): string { + return ''; } /** @@ -23,6 +24,7 @@ public function innerHtml(): string */ public function outerHtml(): string { + return ''; } /** @@ -30,6 +32,7 @@ public function outerHtml(): string */ public function text(): string { + return ''; } /** diff --git a/src/PHPHtmlParser/Finder.php b/src/PHPHtmlParser/Finder.php index 1c754c3c..3b3f1033 100644 --- a/src/PHPHtmlParser/Finder.php +++ b/src/PHPHtmlParser/Finder.php @@ -3,6 +3,7 @@ namespace PHPHtmlParser; use PHPHtmlParser\Dom\AbstractNode; +use PHPHtmlParser\Dom\InnerNode; class Finder { @@ -26,7 +27,7 @@ public function __construct($id) */ public function find(AbstractNode $node) { - if (!$node->id()) { + if (!$node->id() && $node instanceof InnerNode) { return $this->find($node->firstChild()); } @@ -51,4 +52,4 @@ public function find(AbstractNode $node) return false; } -} \ No newline at end of file +} From b44c872729fc402227593e8b7e825a24f4733ab2 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 5 Feb 2019 19:25:39 -0500 Subject: [PATCH 116/200] Fixed issue #104 --- src/PHPHtmlParser/Dom.php | 6 ++++-- tests/DomTest.php | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index dbc3741a..d1694c61 100644 --- 
a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -544,16 +544,18 @@ protected function parse(): void // check if it was a closing tag if ($info['closing']) { - $originalNode = $activeNode; + $foundOpeningTag = true; + $originalNode = $activeNode; while ($activeNode->getTag()->name() != $info['tag']) { $activeNode = $activeNode->getParent(); if (is_null($activeNode)) { // we could not find opening tag $activeNode = $originalNode; + $foundOpeningTag = false; break; } } - if ( ! is_null($activeNode)) { + if ($foundOpeningTag) { $activeNode = $activeNode->getParent(); } continue; diff --git a/tests/DomTest.php b/tests/DomTest.php index bff1a2da..f3762c2c 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -294,6 +294,13 @@ public function testScriptCleanerScriptTag() $this->assertEquals('....', $dom->getElementsByTag('p')[1]->innerHtml); } + public function testClosingSpan() + { + $dom = new Dom; + $dom->load("
sometext
"); + $this->assertEquals('sometext', $dom->getElementsByTag('div')[0]->innerHtml); + } + public function testMultipleDoubleQuotes() { $dom = new Dom; From c97a84c3e2ca6674e2bd957c95f2582fbbe2369a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 9 Feb 2019 20:29:35 -0500 Subject: [PATCH 117/200] Refactoring --- src/PHPHtmlParser/Selector/Selector.php | 160 +++++++++++++++--------- 1 file changed, 100 insertions(+), 60 deletions(-) diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index d7a31607..fb8010e5 100644 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -129,67 +129,18 @@ protected function seek(array $nodes, array $rule, array $options): array // wild card, grab all if ($rule['tag'] == '*' && is_null($rule['key'])) { $return[] = $child; - try { - $child = $node->nextChild($child->id()); - } catch (ChildNotFoundException $e) { - // no more children - $child = null; - } + $child = $this->getNextChild($node, $child); continue; } - $pass = true; - // check tag - if ( ! empty($rule['tag']) && $rule['tag'] != $child->getTag()->name() && - $rule['tag'] != '*' - ) { - // child failed tag check - $pass = false; - } - - // check key + $pass = $this->checkTag($rule, $child); if ($pass && ! is_null($rule['key'])) { - if ($rule['noKey']) { - if ( ! is_null($child->getAttribute($rule['key']))) { - $pass = false; - } - } else { - if ($rule['key'] != 'plaintext' && !$child->hasAttribute($rule['key'])) { - $pass = false; - } - } + $pass = $this->checkKey($rule, $child); } - - // compare values if ($pass && ! is_null($rule['key']) && ! is_null($rule['value']) && $rule['value'] != '*' ) { - if ($rule['key'] == 'plaintext') { - // plaintext search - $nodeValue = $child->text(); - } else { - // normal search - $nodeValue = $child->getAttribute($rule['key']); - } - - $check = $this->match($rule['operator'], $rule['value'], $nodeValue); - - // handle multiple classes - if ( ! 
$check && $rule['key'] == 'class') { - $childClasses = explode(' ', $child->getAttribute('class')); - foreach ($childClasses as $class) { - if ( ! empty($class)) { - $check = $this->match($rule['operator'], $rule['value'], $class); - } - if ($check) { - break; - } - } - } - - if ( ! $check) { - $pass = false; - } + $pass = $this->checkComparison($rule, $child); } if ($pass) { @@ -205,13 +156,7 @@ protected function seek(array $nodes, array $rule, array $options): array } } - try { - // get next child - $child = $node->nextChild($child->id()); - } catch (ChildNotFoundException $e) { - // no more children - $child = null; - } + $child = $this->getNextChild($node, $child); } if (( ! isset($options['checkGrandChildren']) || @@ -295,4 +240,99 @@ protected function flattenOptions(array $optionsArray) return $options; } + + /** + * Returns the next child or null if no more children. + * + * @param AbstractNode $node + * @param AbstractNode $currentChild + * @return AbstractNode|null + */ + protected function getNextChild(AbstractNode $node, AbstractNode $currentChild) + { + try { + // get next child + $child = $node->nextChild($currentChild->id()); + } catch (ChildNotFoundException $e) { + // no more children + $child = null; + } + + return $child; + } + + /** + * Checks tag condition from rules against node. + * + * @param array $rule + * @param AbstractNode $node + * @return bool + */ + protected function checkTag(array $rule, AbstractNode $node): bool + { + if ( ! empty($rule['tag']) && $rule['tag'] != $node->getTag()->name() && + $rule['tag'] != '*' + ) { + return false; + } + + return true; + } + + /** + * Checks key condition from rules against node. + * + * @param array $rule + * @param AbstractNode $node + * @return bool + */ + protected function checkKey(array $rule, AbstractNode $node): bool + { + if ($rule['noKey']) { + if ( ! 
is_null($node->getAttribute($rule['key']))) { + return false; + } + } else { + if ($rule['key'] != 'plaintext' && !$node->hasAttribute($rule['key'])) { + return false; + } + } + + return true; + } + + /** + * Checks comparison condition from rules against node. + * + * @param array $rule + * @param AbstractNode $node + * @return bool + */ + public function checkComparison(array $rule, AbstractNode $node): bool + { + if ($rule['key'] == 'plaintext') { + // plaintext search + $nodeValue = $node->text(); + } else { + // normal search + $nodeValue = $node->getAttribute($rule['key']); + } + + $check = $this->match($rule['operator'], $rule['value'], $nodeValue); + + // handle multiple classes + if ( ! $check && $rule['key'] == 'class') { + $nodeClasses = explode(' ', $node->getAttribute('class')); + foreach ($nodeClasses as $class) { + if ( ! empty($class)) { + $check = $this->match($rule['operator'], $rule['value'], $class); + } + if ($check) { + break; + } + } + } + + return $check; + } } From 77e4a44b0916690b4300fe9abf98fd05bbba48f0 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 9 Feb 2019 20:35:49 -0500 Subject: [PATCH 118/200] Version 2.0.2 --- README.md | 2 +- composer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 64f54e5f..959b753e 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 2.0.1 +Version 2.0.2 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) diff --git a/composer.json b/composer.json index bdf30c83..108b7101 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,7 @@ { "name": "paquettg/php-html-parser", "type": "library", - "version": "2.0.1", + "version": "2.0.2", "description": "An HTML DOM parser. 
It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.", "keywords": ["html", "dom", "parser"], "homepage": "https://github.com/paquettg/php-html-parser", From 60eaead94c69a9abddbcc9f7852ea51168da591a Mon Sep 17 00:00:00 2001 From: Chris Jones Date: Tue, 12 Feb 2019 14:35:57 -0500 Subject: [PATCH 119/200] Fix typo in README I found a typo and a missing line break which was causing weird formatting in the readme. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 959b753e..6a6c0dd1 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,8 @@ Preserves Line Breaks if set to `true`. If set to `false` line breaks are cleane **removeDoubleSpace** -Set this to `false` if you want to preserver whitespace inside of text nodes. It is set to `true` by default. +Set this to `false` if you want to preserve whitespace inside of text nodes. It is set to `true` by default. + Static Facade ------------- From f16a28b0fd9ec6483bcfe70278481bfe21f54c72 Mon Sep 17 00:00:00 2001 From: John Coles Date: Sun, 7 Apr 2019 12:04:44 +0100 Subject: [PATCH 120/200] Changed cURL to non-verbose. Removes issue of log spam. 
https://github.com/paquettg/php-html-parser/issues/176 --- src/PHPHtmlParser/Curl.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php index 8ce8fa37..21b0f151 100644 --- a/src/PHPHtmlParser/Curl.php +++ b/src/PHPHtmlParser/Curl.php @@ -29,7 +29,7 @@ public function get(string $url): string curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($ch, CURLOPT_VERBOSE, true); + curl_setopt($ch, CURLOPT_VERBOSE, false); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'); curl_setopt($ch, CURLOPT_URL, $url); From 77de30fd590392f7a57d4460e8e7d669c8e3c6f7 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 10 Jul 2019 19:25:48 -0400 Subject: [PATCH 121/200] Fixed #179 --- CHANGELOG.md | 113 +------------------------------------- src/PHPHtmlParser/Dom.php | 5 ++ tests/DomTest.php | 16 ++++++ 3 files changed, 23 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 618fb1b5..e409f3de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,114 +1,5 @@ ### Development -## 1.7.0 +## 1.2.0 -- Added .scrutinizer.yml to repo -- Reformated code to PSR-1/2 -- Improved the test coverage and some small code changes -- Added removeAttribute and removeAllAttributes tag methods fixes #57 -- Added replaceNode method implements #52 -- Added a delete method. fixes #43 -- Added semicolon after for linebreak preservation. fixes #62 -- Removed code that removed tag fixes #60 -- Added new test related to #63 -- Refactored the nodes into inner and leaf nodes -- Fixed Strings example in README -- Close this header so the markdown will render properly -- Added preserve line break option. Defaults to false. 
- - -## 1.6.9 - -- Added Changelog -- Fixed issue with spaces befor closing tag Fixes #45 -- Fixed some code quality issues found by scrutinizer -- Added Scrutinizer to README -- Reformated code to comply with PSR-1/2 -- Added preserve line break option. Defaults to false. fixes #40 -- Updated the tests -- Added options: cleanupInput, removeScripts and removeStyles - -## 1.6.8 - -- Added comments and reformated some code -- Added support for non-escaped quotes in attribute value fixes #37 -- Cleaned up the comments and php docs -- Removed version in composer json -- Updated composer version -- Refactoring out isChild method. -- Updated in code documentation -- Updated composer - -$$ 1.6.7 - -- Added tests for the new array access -- Added feature to allow array usage of html node. fixes #26 -- Update HtmlNode.php -- Added test to cover issue #28 -- FIX: File name is longer than the maximum allowed path - -## 1.6.6 - -- Replaced preg_replace with mb_ereg_replace -- Added child selector fixes #24 -- Updated the dev version of phpunit - -## 1.6.5 - -- Fixed bug when no attribute tags are last tag (with out space). 
fixes #16 -- Fixed some documentation inconsistencies fixes #15 -- Made loadStr a public methor Fixes #18 -- Update a problem with the README fixes #11 -- Added setAttribute to the node fixes #7 -- Check if open_basedir is enabled: Dont use CURLOPT_FOLLOWLOCATION - -## 1.6.4 - -- Added tests and updated README -- Updated the tests and moved some files -- Added the option to enforce the encoding -- Fixed a problem with handeling the unknown child exception -- Updated some tests -- Added coverall badge and package - -## 1.6.3 - -- Added initial support for 'strict' parsing option -- Added an optional paramter to enable recursive text -- Added appropriat Options tests -- Changed all exception to specific objects -- Added a whitespaceTextNode option and test -- Added support for an options array - -## 1.6.2 - -- Standardised indentation for easyer reading on github -- Update AbstractNode.php -- Added a test for hhvm in my travis.yml -- Added a LICENSE.md file for MIT -- Added build status to README -- Added travis.yml -- Changed the file name of the abstract node -- fixed code in collection class where instance of arrayIterator is to be rturned -- Updated documentation -- Added a curl interface and a simple curl class. -- Removed the Guzzle dependancy -- Abstracted the Node class as it should have been done in the first place -- Added integrity checks for the cached html -- Added some basic caching of the dom html -- Added a toArray() method to the collection and a test - -## 1.6.1 - -- Moved back to using guzzle so expections are thrown when their was an error with loading a url -- Added tests for the Static Facade Fixed a few issues brought to light from the new tests -- Added a static facade -- Changed encoding to be a local attribute instead of a static attribute -- Solved issue #2 When you attempt to load an html page from a URL using loadFromUrl the encoding is incorrect. -- Added easyer loading of files and urls. 
Still have a problem with encoding while loading from url. -- Added guzzle and loadFromUrl option -- Fixed an issue with no value attributes -- Added magic and each methods to the collection. Plus some tests -- Added a collection object -- Added charset encoding -- fixed a bug with closing tags If a closing tag did not have an opening tag it would cause the scan to end instead of ignoring the closing tag. +- Fixed bug that caused an infinite loop when no content found in tags. diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index d1694c61..fd361c51 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -626,6 +626,11 @@ protected function parseTag(): array } $tag = strtolower($this->content->copyByToken('slash', true)); + if (trim($tag) == '') + { + // no tag found, invalide < found + return $return; + } $node = new HtmlNode($tag); // attributes diff --git a/tests/DomTest.php b/tests/DomTest.php index f3762c2c..28bc3ed5 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -414,4 +414,20 @@ public function testGetChildrenNoChildren() $children = $imgNode->getChildren(); $this->assertTrue(count($children) === 0); } + + public function testInfiniteLoopNotHappening() + { + $dom = new Dom(); + $dom->loadStr(' + + + + + + + <'); + + $metaNodes = $dom->root->find('meta'); + $this->assertEquals(4, count($metaNodes)); + } } From f85eea6d37eb0a002728386ec9b1a0094c308eca Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 10 Jul 2019 19:28:37 -0400 Subject: [PATCH 122/200] Added Smarty flag for #184 --- CHANGELOG.md | 12 +++++++++++- README.md | 7 ++++++- src/PHPHtmlParser/Dom.php | 10 ++++++---- src/PHPHtmlParser/Options.php | 18 ++++++++++-------- tests/Options/CleanupTest.php | 20 ++++++++++++++++++++ 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e409f3de..12aa15ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ ### Development -## 1.2.0 +All notable changes 
to this project will be documented in this file. +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- New `removeSmartyScripts` configuration setting. Defaults to true. + +### Changed +- Started using a changelog. - Fixed bug that caused an infinite loop when no content found in tags. diff --git a/README.md b/README.md index 959b753e..6a83aeda 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,12 @@ Preserves Line Breaks if set to `true`. If set to `false` line breaks are cleane **removeDoubleSpace** -Set this to `false` if you want to preserver whitespace inside of text nodes. It is set to `true` by default. +Set this to `false` if you want to preserve whitespace inside of text nodes. It is set to `true` by default. + +**removeSmartyScripts** + +Set this to `false` if you want to preserve smarty sccript found in the html content. It is set to `true` by default. 
+ Static Facade ------------- diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index fd361c51..77db0350 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -502,24 +502,26 @@ protected function clean(string $str): string $str = mb_eregi_replace("", '', $str); // strip out "; + $dom = new Dom(); + $dom->setOptions(['cleanupInput' => false,]); + $dom->load($html); + $this->assertSame($html, $dom->root->outerHtml()); + } + public function testLoad() { $dom = new Dom; From 0689a0468f47b479e49eff02dc04cf3f757b097f Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 27 Apr 2020 02:42:30 +0000 Subject: [PATCH 160/200] Added support for PSR7 requests --- CHANGELOG.md | 6 ++++ composer.json | 5 ++- src/PHPHtmlParser/Curl.php | 54 ----------------------------- src/PHPHtmlParser/CurlInterface.php | 20 ----------- src/PHPHtmlParser/Dom.php | 30 +++++++++++----- src/PHPHtmlParser/StaticDom.php | 29 +++++++++++----- tests/DomTest.php | 24 ++++++++----- tests/StaticDomTest.php | 19 ++++++---- 8 files changed, 80 insertions(+), 107 deletions(-) delete mode 100755 src/PHPHtmlParser/Curl.php delete mode 100755 src/PHPHtmlParser/CurlInterface.php diff --git a/CHANGELOG.md b/CHANGELOG.md index d4921140..0039aa5e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added support for PSR7 HTTP clients and requests for URL calls. + ### Changed - Fixed issue with \ causing an infite loop. +### Removed +- Removed curl interface and curl implementation. 
+ ## 2.2.0 ### Added diff --git a/composer.json b/composer.json index e924886e..8e643b31 100755 --- a/composer.json +++ b/composer.json @@ -17,7 +17,10 @@ "ext-mbstring": "*", "paquettg/string-encode": "~1.0.0", "ext-zlib": "*", - "ext-curl": "*" + "ext-curl": "*", + "php-http/httplug": "^2.1", + "php-http/guzzle6-adapter": "^2.0", + "guzzlehttp/psr7": "^1.6" }, "require-dev": { "phpunit/phpunit": "^7.5.1", diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php deleted file mode 100755 index b3e33edc..00000000 --- a/src/PHPHtmlParser/Curl.php +++ /dev/null @@ -1,54 +0,0 @@ -get($url, $options); + if (is_null($request)) { + $request = new Request('GET', $url); + } + + $response = $client->sendRequest($request); + $content = $response->getBody()->getContents(); return $this->loadStr($content, $options); } diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index 0114bb70..cb70d1d1 100755 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -1,11 +1,17 @@ loadFromUrl($url, $options, $curl); + return $dom->loadFromUrl($url, $options, $client, $request); } /** diff --git a/tests/DomTest.php b/tests/DomTest.php index 3297923e..7a2cc4ef 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -228,14 +228,22 @@ public function testLoadFileBigTwicePreserveOption() public function testLoadFromUrl() { - $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); - $curl->shouldReceive('get') - ->once() - ->with('http://google.com', []) - ->andReturn(file_get_contents('tests/data/files/small.html')); - - $dom = new Dom; - $dom->loadFromUrl('http://google.com', [], $curl); + $streamMock = Mockery::mock(\Psr\Http\Message\StreamInterface::class); + $streamMock->shouldReceive('getContents') + ->once() + ->andReturn(file_get_contents('tests/data/files/small.html')); + $responseMock = Mockery::mock(\Psr\Http\Message\ResponseInterface::class); + $responseMock->shouldReceive('getBody') + ->once() + 
->andReturn($streamMock); + $clientMock = Mockery::mock(\Psr\Http\Client\ClientInterface::class); + $clientMock->shouldReceive('sendRequest') + ->once() + ->andReturn($responseMock); + + + $dom = new Dom; + $dom->loadFromUrl('http://google.com', [], $clientMock); $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index a6fba2c0..ac9df656 100755 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -58,13 +58,20 @@ public function testFindI() public function testLoadFromUrl() { - $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); - $curl->shouldReceive('get') - ->once() - ->with('http://google.com', []) - ->andReturn(file_get_contents('tests/data/files/small.html')); + $streamMock = Mockery::mock(\Psr\Http\Message\StreamInterface::class); + $streamMock->shouldReceive('getContents') + ->once() + ->andReturn(file_get_contents('tests/data/files/small.html')); + $responseMock = Mockery::mock(\Psr\Http\Message\ResponseInterface::class); + $responseMock->shouldReceive('getBody') + ->once() + ->andReturn($streamMock); + $clientMock = Mockery::mock(\Psr\Http\Client\ClientInterface::class); + $clientMock->shouldReceive('sendRequest') + ->once() + ->andReturn($responseMock); - Dom::loadFromUrl('http://google.com', [], $curl); + Dom::loadFromUrl('http://google.com', [], $clientMock); $this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text); } From 1d4e3792b487387d1328f7a04bda2ae42e318770 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 27 Apr 2020 04:04:03 +0000 Subject: [PATCH 161/200] Added php_cs --- .gitattributes | 1 + .gitignore | 1 + .php_cs.dist | 148 +++++++++ composer.json | 5 +- src/PHPHtmlParser/Content.php | 77 ++--- src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 109 ++++--- src/PHPHtmlParser/Dom.php | 251 +++++++--------- src/PHPHtmlParser/Dom/AbstractNode.php | 189 +++++------- 
src/PHPHtmlParser/Dom/ArrayNode.php | 18 +- src/PHPHtmlParser/Dom/Collection.php | 59 ++-- src/PHPHtmlParser/Dom/HtmlNode.php | 52 ++-- src/PHPHtmlParser/Dom/InnerNode.php | 146 ++++----- src/PHPHtmlParser/Dom/LeafNode.php | 11 +- src/PHPHtmlParser/Dom/Tag.php | 123 ++++---- src/PHPHtmlParser/Dom/TextNode.php | 50 ++-- .../Exceptions/ChildNotFoundException.php | 10 +- .../Exceptions/CircularException.php | 9 +- .../Exceptions/CurlException.php | 9 +- .../Exceptions/EmptyCollectionException.php | 9 +- .../Exceptions/LogicalException.php | 10 +- .../Exceptions/NotLoadedException.php | 9 +- .../Exceptions/ParentNotFoundException.php | 9 +- .../Exceptions/StrictException.php | 9 +- .../Tag/AttributeNotFoundException.php | 25 +- .../Exceptions/UnknownChildTypeException.php | 9 +- src/PHPHtmlParser/Finder.php | 14 +- src/PHPHtmlParser/Options.php | 40 ++- src/PHPHtmlParser/Selector/Parser.php | 59 ++-- .../Selector/ParserInterface.php | 5 +- src/PHPHtmlParser/Selector/Selector.php | 170 +++++------ src/PHPHtmlParser/StaticDom.php | 48 ++- tests/CollectionTest.php | 45 +-- tests/ContentTest.php | 11 +- tests/DomTest.php | 166 +++++------ tests/Node/ChildrenTest.php | 80 ++--- tests/Node/HtmlTest.php | 99 +++--- tests/Node/ParentTest.php | 281 +++++++++--------- tests/Node/TagTest.php | 15 +- tests/Node/TextTest.php | 16 +- tests/Options/CleanupTest.php | 44 +-- tests/Options/PreserveLineBreaks.php | 24 +- tests/Options/StrictTest.php | 34 +-- tests/Options/WhitespaceTextNodeTest.php | 14 +- tests/OptionsTest.php | 21 +- tests/Selector/SelectorTest.php | 50 ++-- tests/StaticDomTest.php | 17 +- tests/data/MockNode.php | 14 +- 47 files changed, 1265 insertions(+), 1350 deletions(-) create mode 100644 .php_cs.dist diff --git a/.gitattributes b/.gitattributes index 93691f38..ebfea7c7 100755 --- a/.gitattributes +++ b/.gitattributes @@ -10,3 +10,4 @@ /phpunit.xml export-ignore /infection.json.dist export-ignore /.phan export-ignore +/.php_cs.dist export-ignore diff --git 
a/.gitignore b/.gitignore index 274cf429..9a550fad 100755 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ composer.lock infection.log /vendor/ .idea/ +.php_cs.cache *.swp diff --git a/.php_cs.dist b/.php_cs.dist new file mode 100644 index 00000000..56af284d --- /dev/null +++ b/.php_cs.dist @@ -0,0 +1,148 @@ +in('src') + ->in('tests') +; + +return PhpCsFixer\Config::create() + ->setRiskyAllowed(true) + ->setRules([ + 'array_indentation' => true, + 'array_syntax' => ['syntax' => 'short'], + 'binary_operator_spaces' => ['align_double_arrow'=>true], + 'blank_line_after_namespace' => true, + 'blank_line_after_opening_tag' => true, + 'blank_line_before_statement' => ['statements'=>['return']], + 'braces' => ['allow_single_line_closure' => false], + 'cast_spaces' => true, + 'class_attributes_separation' => ['elements'=>['method']], + 'class_definition' => ['single_line'=>true], + 'combine_consecutive_issets' => true, + 'concat_space' => ['spacing' => 'one'], + 'declare_equal_normalize' => true, + 'elseif' => true, + 'encoding' => true, + 'full_opening_tag' => true, + 'function_declaration' => true, + 'function_typehint_space' => true, + 'include' => true, + 'increment_style' => true, + 'indentation_type' => true, + 'line_ending' => true, + 'linebreak_after_opening_tag' => true, + 'lowercase_cast' => true, + 'lowercase_keywords' => true, + 'lowercase_static_reference' => true, + 'magic_constant_casing' => true, + 'magic_method_casing' => true, + 'mb_str_functions' => false, + 'method_argument_space' => true, + 'native_function_casing' => true, + 'native_function_invocation' => true, + 'native_function_type_declaration_casing' => true, + 'new_with_braces' => true, + 'no_blank_lines_after_class_opening' => true, + 'no_blank_lines_after_phpdoc' => true, + 'no_break_comment' => true, + 'no_closing_tag' => true, + 'no_empty_comment' => true, + 'no_empty_phpdoc' => true, + 'no_empty_statement' => true, + 'no_extra_blank_lines' => ['tokens'=>[ + 'curly_brace_block', + 'extra', 
+ 'parenthesis_brace_block', + 'square_brace_block', + 'throw', + 'use', + ]], + 'no_leading_import_slash' => true, + 'no_leading_namespace_whitespace' => true, + 'no_mixed_echo_print' => true, + 'no_multiline_whitespace_around_double_arrow' => true, + 'no_null_property_initialization' => true, + 'no_short_bool_cast' => true, + 'no_singleline_whitespace_before_semicolons' => true, + 'no_superfluous_elseif' => true, + 'no_spaces_after_function_name' => true, + 'no_spaces_around_offset' => true, + 'no_spaces_inside_parenthesis' => true, + 'no_superfluous_phpdoc_tags' => ['allow_mixed' => true, 'allow_unused_params' => true], + 'no_trailing_comma_in_list_call' => true, + 'no_trailing_comma_in_singleline_array' => true, + 'no_trailing_whitespace' => true, + 'no_trailing_whitespace_in_comment' => true, + 'no_unneeded_control_parentheses' => true, + 'no_unneeded_curly_braces' => ['namespaces' => true], + 'no_unused_imports' => true, + 'no_useless_else' => true, + 'no_useless_return' => true, + 'no_whitespace_before_comma_in_array' => true, + 'no_whitespace_in_blank_line' => true, + 'normalize_index_brace' => true, + 'not_operator_with_space' => false, + 'object_operator_without_whitespace' => true, + 'ordered_class_elements' => true, + 'ordered_imports' => true, + 'php_unit_fqcn_annotation' => true, + 'phpdoc_align' => ['tags' => [ + 'method', + 'param', + 'property', + 'return', + 'throws', + 'type', + 'var', + ]], + 'phpdoc_add_missing_param_annotation' => true, + 'phpdoc_annotation_without_dot' => true, + 'phpdoc_indent' => true, + 'phpdoc_inline_tag' => true, + 'phpdoc_no_access' => true, + 'phpdoc_no_alias_tag' => true, + 'phpdoc_no_package' => true, + 'phpdoc_no_useless_inheritdoc' => true, + 'phpdoc_order' => true, + 'phpdoc_return_self_reference' => true, + 'phpdoc_scalar' => true, + 'phpdoc_separation' => true, + 'phpdoc_single_line_var_spacing' => true, + 'phpdoc_summary' => true, + 'phpdoc_to_comment' => true, + 'phpdoc_trim' => true, + 
'phpdoc_trim_consecutive_blank_line_separation' => true, + 'phpdoc_types' => true, + 'phpdoc_types_order' => ['null_adjustment' => 'always_last', 'sort_algorithm' => 'none'], + 'phpdoc_var_without_name' => true, + 'return_assignment' => true, + 'return_type_declaration' => true, + 'semicolon_after_instruction' => true, + 'simplified_null_return' => true, + 'short_scalar_cast' => true, + 'single_blank_line_at_eof' => true, + 'single_blank_line_before_namespace' => true, + 'single_class_element_per_statement' => true, + 'single_import_per_statement' => true, + 'single_line_after_imports' => true, + 'single_line_comment_style' => ['comment_types' => ['hash']], + 'single_line_throw' => true, + 'single_quote' => true, + 'single_trait_insert_per_statement' => true, + 'space_after_semicolon' => ['remove_in_empty_for_expressions'=>true], + 'standardize_increment' => true, + 'standardize_not_equals' => true, + 'switch_case_semicolon_to_colon' => true, + 'switch_case_space' => true, + 'ternary_operator_spaces' => true, + 'ternary_to_null_coalescing' => true, + 'trailing_comma_in_multiline_array' => true, + 'trim_array_spaces' => true, + 'unary_operator_spaces' => true, + 'visibility_required' => true, + 'whitespace_after_comma_in_array' => true, + 'yoda_style' => false, + ]) + ->setFinder($finder) + ->setCacheFile(__DIR__.'/.php_cs.cache') +; \ No newline at end of file diff --git a/composer.json b/composer.json index 8e643b31..79258c58 100755 --- a/composer.json +++ b/composer.json @@ -15,9 +15,9 @@ "require": { "php": ">=7.1", "ext-mbstring": "*", - "paquettg/string-encode": "~1.0.0", "ext-zlib": "*", "ext-curl": "*", + "paquettg/string-encode": "~1.0.0", "php-http/httplug": "^2.1", "php-http/guzzle6-adapter": "^2.0", "guzzlehttp/psr7": "^1.6" @@ -27,7 +27,8 @@ "mockery/mockery": "^1.2", "php-coveralls/php-coveralls": "^2.1", "infection/infection": "^0.13.4", - "phan/phan": "^2.4" + "phan/phan": "^2.4", + "friendsofphp/php-cs-fixer": "^2.16" }, "autoload": { "psr-4": { 
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 93b3a73b..37415a91 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -1,20 +1,16 @@ -content = $content; - $this->size = strlen($content); - $this->pos = 0; + $this->size = \strlen($content); + $this->pos = 0; } /** * Returns the current position of the content. - * - * @return int */ public function getPosition(): int { @@ -72,16 +64,15 @@ public function getPosition(): int * Gets the current character we are at. * * @param ?int $char - * @return string */ public function char(?int $char = null): string { $pos = $this->pos; - if ( ! is_null($char)) { + if (!\is_null($char)) { $pos = $char; } - if ( ! isset($this->content[$pos])) { + if (!isset($this->content[$pos])) { return ''; } @@ -91,8 +82,6 @@ public function char(?int $char = null): string /** * Moves the current position forward. * - * @param int $count - * @return Content * @chainable */ public function fastForward(int $count): Content @@ -105,8 +94,6 @@ public function fastForward(int $count): Content /** * Moves the current position backward. * - * @param int $count - * @return Content * @chainable */ public function rewind(int $count): Content @@ -121,11 +108,6 @@ public function rewind(int $count): Content /** * Copy the content until we find the given string. - * - * @param string $string - * @param bool $char - * @param bool $escape - * @return string */ public function copyUntil(string $string, bool $char = false, bool $escape = false): string { @@ -136,9 +118,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal if ($escape) { $position = $this->pos; - $found = false; - while ( ! 
$found) { - $position = strpos($this->content, $string, $position); + $found = false; + while (!$found) { + $position = \strpos($this->content, $string, $position); if ($position === false) { // reached the end break; @@ -153,17 +135,17 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal $found = true; } } elseif ($char) { - $position = strcspn($this->content, $string, $this->pos); + $position = \strcspn($this->content, $string, $this->pos); $position += $this->pos; } else { - $position = strpos($this->content, $string, $this->pos); + $position = \strpos($this->content, $string, $this->pos); } if ($position === false) { // could not find character, just return the remaining of the content - $return = substr($this->content, $this->pos, $this->size - $this->pos); + $return = \substr($this->content, $this->pos, $this->size - $this->pos); if ($return === false) { - throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + throw new LogicalException('Substr returned false with position ' . $this->pos . '.'); } $this->pos = $this->size; @@ -175,9 +157,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal return ''; } - $return = substr($this->content, $this->pos, $position - $this->pos); + $return = \substr($this->content, $this->pos, $position - $this->pos); if ($return === false) { - throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + throw new LogicalException('Substr returned false with position ' . $this->pos . '.'); } // set the new position $this->pos = $position; @@ -189,8 +171,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal * Copies the content until the string is found and return it * unless the 'unless' is found in the substring. 
* - * @param string $string - * @param string $unless * @return string */ public function copyUntilUnless(string $string, string $unless) @@ -199,9 +179,9 @@ public function copyUntilUnless(string $string, string $unless) $this->fastForward(1); $foundString = $this->copyUntil($string, true, true); - $position = strcspn($foundString, $unless); - if ($position == strlen($foundString)) { - return $string.$foundString; + $position = \strcspn($foundString, $unless); + if ($position == \strlen($foundString)) { + return $string . $foundString; } // rewind changes and return nothing $this->pos = $lastPos; @@ -210,12 +190,10 @@ public function copyUntilUnless(string $string, string $unless) } /** - * Copies the content until it reaches the token string., + * Copies the content until it reaches the token string.,. * - * @param string $token - * @param bool $char - * @param bool $escape * @return string + * * @uses $this->copyUntil() */ public function copyByToken(string $token, bool $char = false, bool $escape = false) @@ -228,20 +206,18 @@ public function copyByToken(string $token, bool $char = false, bool $escape = fa /** * Skip a given set of characters. * - * @param string $string - * @param bool $copy * @return Content|string */ public function skip(string $string, bool $copy = false) { - $len = strspn($this->content, $string, $this->pos); + $len = \strspn($this->content, $string, $this->pos); // make it chainable if they don't want a copy $return = $this; if ($copy) { - $return = substr($this->content, $this->pos, $len); + $return = \substr($this->content, $this->pos, $len); if ($return === false) { - throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + throw new LogicalException('Substr returned false with position ' . $this->pos . '.'); } } @@ -254,9 +230,8 @@ public function skip(string $string, bool $copy = false) /** * Skip a given token of pre-defined characters. 
* - * @param string $token - * @param bool $copy * @return Content|string + * * @uses $this->skip() */ public function skipByToken(string $token, bool $copy = false) diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 44b9bb2f..489b843c 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -1,57 +1,52 @@ -value = $values['value']; - $this->doubleQuote = $values['doubleQuote']; - } - - /** - * @return string - */ - public function getValue(): ?string - { - return $this->value; - } - - /** - * @return bool - */ - public function isDoubleQuote(): bool - { - return $this->doubleQuote; - } - - public function htmlspecialcharsDecode(): void - { - $this->value = htmlspecialchars_decode($this->value); - } - - /** - * @param Encode $encode - * @throws Exception - */ - public function encodeValue(Encode $encode) - { - $this->value = $encode->convert($this->value); - } -} +value = $values['value']; + $this->doubleQuote = $values['doubleQuote']; + } + + public function getValue(): ?string + { + return $this->value; + } + + public function isDoubleQuote(): bool + { + return $this->doubleQuote; + } + + public function htmlspecialcharsDecode(): void + { + if (!\is_null($this->value)) { + $this->value = \htmlspecialchars_decode($this->value); + } + } + + /** + * @throws Exception + */ + public function encodeValue(Encode $encode) + { + $this->value = $encode->convert($this->value); + } +} diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 99e5b796..b0b30a31 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -1,11 +1,11 @@ - at the end (html5 style) + * A list of tags where there should be no /> at the end (html5 style). * * @var array */ @@ -122,7 +119,6 @@ class Dom /** * Returns the inner html of the root node. 
* - * @return string * @throws ChildNotFoundException * @throws UnknownChildTypeException */ @@ -135,6 +131,7 @@ public function __toString(): string * A simple wrapper around the root node. * * @param string $name + * * @return mixed */ public function __get($name) @@ -144,9 +141,7 @@ public function __get($name) /** * Attempts to load the dom from any resource, string, file, or URL. - * @param string $str - * @param array $options - * @return Dom + * * @throws ChildNotFoundException * @throws CircularException * @throws CurlException @@ -156,11 +151,11 @@ public function load(string $str, array $options = []): Dom { AbstractNode::resetCount(); // check if it's a file - if (strpos($str, "\n") === false && is_file($str)) { + if (\strpos($str, "\n") === false && \is_file($str)) { return $this->loadFromFile($str, $options); } // check if it's a url - if (preg_match("/^https?:\/\//i", $str)) { + if (\preg_match("/^https?:\/\//i", $str)) { return $this->loadFromUrl($str, $options); } @@ -168,10 +163,8 @@ public function load(string $str, array $options = []): Dom } /** - * Loads the dom from a document file/url - * @param string $file - * @param array $options - * @return Dom + * Loads the dom from a document file/url. + * * @throws ChildNotFoundException * @throws CircularException * @throws StrictException @@ -179,32 +172,31 @@ public function load(string $str, array $options = []): Dom */ public function loadFromFile(string $file, array $options = []): Dom { - $content = file_get_contents($file); + $content = \file_get_contents($file); if ($content === false) { - throw new LogicalException('file_get_contents failed and returned false when trying to read "'.$file.'".'); + throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".'); } + return $this->loadStr($content, $options); } /** * Use a curl interface implementation to attempt to load * the content from a url. 
- * @param string $url - * @param array $options + * * @param ClientInterface $client - * @param RequestInterface|null $request - * @return Dom + * * @throws ChildNotFoundException * @throws CircularException * @throws StrictException * @throws \Psr\Http\Client\ClientExceptionInterface */ - public function loadFromUrl(string $url, array $options = [], ClientInterface $client = null, RequestInterface $request = null): Dom + public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom { - if (is_null($client)) { + if (\is_null($client)) { $client = new Client(); } - if (is_null($request)) { + if (\is_null($request)) { $request = new Request('GET', $url); } @@ -217,25 +209,23 @@ public function loadFromUrl(string $url, array $options = [], ClientInterface $c /** * Parsers the html of the given string. Used for load(), loadFromFile(), * and loadFromUrl(). - * @param string $str - * @param array $option - * @return Dom + * * @throws ChildNotFoundException * @throws CircularException * @throws StrictException */ public function loadStr(string $str, array $option = []): Dom { - $this->options = new Options; + $this->options = new Options(); $this->options->setOptions($this->globalOptions) ->setOptions($option); - $this->rawSize = strlen($str); - $this->raw = $str; + $this->rawSize = \strlen($str); + $this->raw = $str; $html = $this->clean($str); - $this->size = strlen($str); + $this->size = \strlen($str); $this->content = new Content($html); $this->parse(); @@ -247,8 +237,6 @@ public function loadStr(string $str, array $option = []): Dom /** * Sets a global options array to be used by all load calls. * - * @param array $options - * @return Dom * @chainable */ public function setOptions(array $options): Dom @@ -260,18 +248,18 @@ public function setOptions(array $options): Dom /** * Find elements by css selector on the root node. 
- * @param string $selector - * @param int|null $nth - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function find(string $selector, int $nth = null) { $this->isLoaded(); $depthFirstSearch = $this->options->get('depthFirstSearch'); - if (is_bool($depthFirstSearch)) { + if (\is_bool($depthFirstSearch)) { $result = $this->root->find($selector, $nth, $depthFirstSearch); } else { $result = $this->root->find($selector, $nth); @@ -281,12 +269,13 @@ public function find(string $selector, int $nth = null) } /** - * Find element by Id on the root node - * @param int $id - * @return bool|AbstractNode + * Find element by Id on the root node. + * * @throws ChildNotFoundException * @throws NotLoadedException * @throws ParentNotFoundException + * + * @return bool|AbstractNode */ public function findById(int $id) { @@ -300,12 +289,11 @@ public function findById(int $id) * be self closing. * * @param string|array $tag - * @return Dom * @chainable */ public function addSelfClosingTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } foreach ($tag as $value) { @@ -320,15 +308,14 @@ public function addSelfClosingTag($tag): Dom * always be self closing. * * @param string|array $tag - * @return Dom * @chainable */ public function removeSelfClosingTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } - $this->selfClosing = array_diff($this->selfClosing, $tag); + $this->selfClosing = \array_diff($this->selfClosing, $tag); return $this; } @@ -336,7 +323,6 @@ public function removeSelfClosingTag($tag): Dom /** * Sets the list of self closing tags to empty. 
* - * @return Dom * @chainable */ public function clearSelfClosingTags(): Dom @@ -346,17 +332,15 @@ public function clearSelfClosingTags(): Dom return $this; } - /** - * Adds a tag to the list of self closing tags that should not have a trailing slash + * Adds a tag to the list of self closing tags that should not have a trailing slash. * * @param $tag - * @return Dom * @chainable */ public function addNoSlashTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } foreach ($tag as $value) { @@ -370,15 +354,14 @@ public function addNoSlashTag($tag): Dom * Removes a tag from the list of no-slash tags. * * @param $tag - * @return Dom * @chainable */ public function removeNoSlashTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } - $this->noSlash = array_diff($this->noSlash, $tag); + $this->noSlash = \array_diff($this->noSlash, $tag); return $this; } @@ -386,7 +369,6 @@ public function removeNoSlashTag($tag): Dom /** * Empties the list of no-slash tags. * - * @return Dom * @chainable */ public function clearNoSlashTags(): Dom @@ -398,7 +380,7 @@ public function clearNoSlashTags(): Dom /** * Simple wrapper function that returns the first child. - * @return AbstractNode + * * @throws ChildNotFoundException * @throws NotLoadedException */ @@ -411,7 +393,7 @@ public function firstChild(): AbstractNode /** * Simple wrapper function that returns the last child. - * @return AbstractNode + * * @throws ChildNotFoundException * @throws NotLoadedException */ @@ -423,9 +405,8 @@ public function lastChild(): AbstractNode } /** - * Simple wrapper function that returns count of child elements + * Simple wrapper function that returns count of child elements. * - * @return int * @throws NotLoadedException */ public function countChildren(): int @@ -436,9 +417,8 @@ public function countChildren(): int } /** - * Get array of children + * Get array of children. 
* - * @return array * @throws NotLoadedException */ public function getChildren(): array @@ -449,9 +429,8 @@ public function getChildren(): array } /** - * Check if node have children nodes + * Check if node have children nodes. * - * @return bool * @throws NotLoadedException */ public function hasChildren(): bool @@ -464,25 +443,29 @@ public function hasChildren(): bool /** * Simple wrapper function that returns an element by the * id. + * * @param $id - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function getElementById($id) { $this->isLoaded(); - return $this->find('#'.$id, 0); + return $this->find('#' . $id, 0); } /** * Simple wrapper function that returns all elements by * tag name. - * @param string $name - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function getElementsByTag(string $name) { @@ -494,16 +477,17 @@ public function getElementsByTag(string $name) /** * Simple wrapper function that returns all elements by * class name. - * @param string $class - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function getElementsByClass(string $class) { $this->isLoaded(); - return $this->find('.'.$class); + return $this->find('.' . $class); } /** @@ -513,16 +497,13 @@ public function getElementsByClass(string $class) */ protected function isLoaded(): void { - if (is_null($this->content)) { + if (\is_null($this->content)) { throw new NotLoadedException('Content is not loaded!'); } } /** * Cleans the html of any none-html information. - * - * @param string $str - * @return string */ protected function clean(string $str): string { @@ -531,20 +512,20 @@ protected function clean(string $str): string return $str; } - $is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . 
"\x08", 0, "US-ASCII"); + $is_gzip = 0 === \mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, 'US-ASCII'); if ($is_gzip) { - $str = gzdecode($str); + $str = \gzdecode($str); if ($str === false) { throw new LogicalException('gzdecode returned false. Error when trying to decode the string.'); } } // remove white space before closing tags - $str = mb_eregi_replace("'\s+>", "'>", $str); + $str = \mb_eregi_replace("'\s+>", "'>", $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.'); } - $str = mb_eregi_replace('"\s+>', '">', $str); + $str = \mb_eregi_replace('"\s+>', '">', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.'); } @@ -554,36 +535,36 @@ protected function clean(string $str): string if ($this->options->get('preserveLineBreaks')) { $replace = ' '; } - $str = str_replace(["\r\n", "\r", "\n"], $replace, $str); + $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str); if ($str === false) { throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.'); } // strip the doctype - $str = mb_eregi_replace("", '', $str); + $str = \mb_eregi_replace('', '', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.'); } // strip out comments - $str = mb_eregi_replace("", '', $str); + $str = \mb_eregi_replace('', '', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.'); } // strip out cdata - $str = mb_eregi_replace("", '', $str); + $str = \mb_eregi_replace("", '', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. 
Error when attempting to strip out cdata.'); } // strip out "; $dom = new Dom(); - $dom->setOptions(['cleanupInput' => false,]); + $dom->setOptions(['cleanupInput' => false]); $dom->load($html); $this->assertSame($html, $dom->root->outerHtml()); } From edec82b2ac45135ec8fbe4a88140d2ddedf71f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fedor?= Date: Sat, 25 Jan 2020 19:36:43 +0100 Subject: [PATCH 163/200] Throw exception when trying to set unknown option --- .../Exceptions/UnknownOptionException.php | 13 +++++++ src/PHPHtmlParser/Options.php | 6 ++++ tests/OptionsTest.php | 35 +++++++++++++++---- 3 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 src/PHPHtmlParser/Exceptions/UnknownOptionException.php diff --git a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php new file mode 100644 index 00000000..3b139c0b --- /dev/null +++ b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php @@ -0,0 +1,13 @@ + $option) { + if (!isset($this->defaults[$key])) { + throw new UnknownOptionException("Option '$option' is not recognized"); + } $this->options[$key] = $option; } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index 03fe3ee0..90468b8d 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -2,8 +2,10 @@ declare(strict_types=1); -use PHPHtmlParser\Options; +use PHPHtmlParser\Dom; +use PHPHtmlParser\Exceptions\UnknownOptionException; use PHPUnit\Framework\TestCase; +use PHPHtmlParser\Options; class OptionsTest extends TestCase { @@ -14,26 +16,37 @@ public function testDefaultWhitespaceTextNode() $this->assertTrue($options->whitespaceTextNode); } + public function testSettingOption() + { + $options = new Options; + $options->setOptions([ + 'strict' => true, + ]); + + $this->assertTrue($options->strict); + } + public function testAddingOption() { + $this->expectException(UnknownOptionException::class); + $options = new Options(); $options->setOptions([ 
'test' => true, ]); - - $this->assertTrue($options->test); } - public function testAddingOver() + public function testOverwritingOption() { $options = new Options(); $options->setOptions([ - 'test' => false, + 'strict' => false, ])->setOptions([ - 'test' => true, + 'strict' => true, 'whitespaceTextNode' => false, ]); + $this->assertTrue($options->get('strict')); $this->assertFalse($options->get('whitespaceTextNode')); } @@ -42,4 +55,14 @@ public function testGettingNoOption() $options = new Options(); $this->assertEquals(null, $options->get('doesnotexist')); } + + public function testUnknownOptionDom() { + $dom = new Dom; + $dom->setOptions([ + 'unknown_option' => true, + ]); + + $this->expectException(UnknownOptionException::class); + $dom->load('
'); + } } From b86c1d3c5e7a6368cbc756e3eb33826fcac5d12e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fedor?= Date: Sat, 25 Jan 2020 19:57:22 +0100 Subject: [PATCH 164/200] Fix option existence check --- src/PHPHtmlParser/Options.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index 5b2a7b34..b0bb747e 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -80,8 +80,8 @@ public function __get($key) public function setOptions(array $options): Options { foreach ($options as $key => $option) { - if (!isset($this->defaults[$key])) { - throw new UnknownOptionException("Option '$option' is not recognized"); + if (!array_key_exists($key, $this->defaults)) { + throw new UnknownOptionException("Option '$key' is not recognized"); } $this->options[$key] = $option; } From 71c3758da857203423b0071350c59fb5624a504a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 10 May 2020 23:59:09 +0000 Subject: [PATCH 165/200] Updated the way we calculate selector logic --- CHANGELOG.md | 6 +- .../Contracts/Selector/ParserInterface.php | 12 + .../Contracts/Selector/SeekerInterface.php | 17 + .../Contracts/Selector/SelectorInterface.php | 33 ++ .../Selector/ParsedSelectorCollectionDTO.php | 30 ++ .../DTO/Selector/ParsedSelectorDTO.php | 30 ++ src/PHPHtmlParser/DTO/Selector/RuleDTO.php | 96 +++++ src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 6 +- .../Discovery/ParserDiscovery.php | 25 ++ .../Discovery/SeekerDiscovery.php | 25 ++ src/PHPHtmlParser/Dom/AbstractNode.php | 14 +- .../Exceptions/UnknownOptionException.php | 9 +- src/PHPHtmlParser/Options.php | 101 +++-- src/PHPHtmlParser/Selector/Parser.php | 28 +- .../Selector/ParserInterface.php | 10 - src/PHPHtmlParser/Selector/Seeker.php | 321 ++++++++++++++++ src/PHPHtmlParser/Selector/Selector.php | 359 ++---------------- tests/OptionsTest.php | 18 +- tests/Selector/SelectorTest.php | 16 +- 19 files changed, 737 
insertions(+), 419 deletions(-) create mode 100644 src/PHPHtmlParser/Contracts/Selector/ParserInterface.php create mode 100644 src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php create mode 100644 src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php create mode 100644 src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php create mode 100644 src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php create mode 100644 src/PHPHtmlParser/DTO/Selector/RuleDTO.php create mode 100644 src/PHPHtmlParser/Discovery/ParserDiscovery.php create mode 100644 src/PHPHtmlParser/Discovery/SeekerDiscovery.php delete mode 100755 src/PHPHtmlParser/Selector/ParserInterface.php create mode 100644 src/PHPHtmlParser/Selector/Seeker.php diff --git a/CHANGELOG.md b/CHANGELOG.md index f765b63e..770a5d92 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,6 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -### Changed -- Added tag attribute DTO. - ## [Unreleased] ### Added @@ -17,7 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Fixed issue with \ causing an infite loop. - CDATA should not be altered when cleanupInput is false. - +- Added tag attribute DTO. +- Cleaned up the selector logic. ### Removed - Removed curl interface and curl implementation. 
diff --git a/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php b/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php new file mode 100644 index 00000000..3b2477b9 --- /dev/null +++ b/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php @@ -0,0 +1,12 @@ +parsedSelectorDTO[] = $value; + } + } + } + + /** + * @return ParsedSelectorDTO[] + */ + public function getParsedSelectorDTO(): array + { + return $this->parsedSelectorDTO; + } +} diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php new file mode 100644 index 00000000..5424e2a7 --- /dev/null +++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php @@ -0,0 +1,30 @@ +rules[] = $value; + } + } + } + + /** + * @return RuleDTO[] + */ + public function getRules(): array + { + return $this->rules; + } +} diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php new file mode 100644 index 00000000..1c336149 --- /dev/null +++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php @@ -0,0 +1,96 @@ +tag = $values['tag']; + $this->operator = $values['operator']; + $this->key = $values['key']; + $this->value = $values['value']; + $this->noKey = $values['noKey']; + $this->alterNext = $values['alterNext']; + } + + /** + * @return string + */ + public function getTag(): string + { + return $this->tag; + } + + /** + * @return string + */ + public function getOperator(): string + { + return $this->operator; + } + + /** + * @return string|array|null + */ + public function getKey() + { + return $this->key; + } + + /** + * @return string|array|null + */ + public function getValue() + { + return $this->value; + } + + /** + * @return bool + */ + public function isNoKey(): bool + { + return $this->noKey; + } + + /** + * @return bool + */ + public function isAlterNext(): bool + { + return $this->alterNext; + } +} diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 
489b843c..1f15c492 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -7,17 +7,17 @@ use stringEncode\Encode; use stringEncode\Exception; -class AttributeDTO +final class AttributeDTO { /** * @var ?string */ - protected $value; + private $value; /** * @var bool */ - protected $doubleQuote = true; + private $doubleQuote = true; public function __construct(array $values) { diff --git a/src/PHPHtmlParser/Discovery/ParserDiscovery.php b/src/PHPHtmlParser/Discovery/ParserDiscovery.php new file mode 100644 index 00000000..a7d3c60a --- /dev/null +++ b/src/PHPHtmlParser/Discovery/ParserDiscovery.php @@ -0,0 +1,25 @@ +setDepthFirstFind($depthFirst); $nodes = $selector->find($this); diff --git a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php index 3b139c0b..58be8198 100644 --- a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php +++ b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php @@ -1,12 +1,13 @@ -options = $this->defaults; } + /** + * A magic get to call the get() method. + * + * @param string $key + * + * @return mixed + * + * @uses $this->get() + */ + public function __get($key) + { + return $this->get($key); + } + /** * The whitespaceTextNode, by default true, option tells the parser to save textnodes even if the content of the * node is empty (only whitespace). Setting it to false will ignore all whitespace only text node found in the document. - * @param bool $value + * * @return Options */ - public function setWhitespaceTextNode(bool $value): self { + public function setWhitespaceTextNode(bool $value): self + { $this->options['whitespaceTextNode'] = $value; + return $this; } /** * Strict, by default false, will throw a StrictException if it finds that the html is not strictly compliant * (all tags must have a closing tag, no attribute with out a value, etc.). 
- * @param bool $value + * * @return Options */ - public function setStrict(bool $value): self { + public function setStrict(bool $value): self + { $this->options['strict'] = $value; + return $this; } @@ -83,21 +101,25 @@ public function setStrict(bool $value): self { * The enforceEncoding, by default null, option will enforce an character set to be used for reading the content * and returning the content in that encoding. Setting it to null will trigger an attempt to figure out * the encoding from within the content of the string given instead. - * @param string|null $value + * * @return Options */ - public function setEnforceEncoding(?string $value): self { + public function setEnforceEncoding(?string $value): self + { $this->options['enforceEncoding'] = $value; + return $this; } /** * Set this to false to skip the entire clean up phase of the parser. Defaults to true. - * @param bool $value + * * @return Options */ - public function setCleanupInput(bool $value): self { + public function setCleanupInput(bool $value): self + { $this->options['cleanupInput'] = $value; + return $this; } @@ -107,11 +129,12 @@ public function setCleanupInput(bool $value): self { * * NOTE: Ignored if cleanupInit is true. * - * @param bool $value * @return Options */ - public function setRemoveScripts(bool $value): self { + public function setRemoveScripts(bool $value): self + { $this->options['removeScripts'] = $value; + return $this; } @@ -119,11 +142,13 @@ public function setRemoveScripts(bool $value): self { * Set this to false to skip removing of style tags from the document body. This might have adverse effects. Defaults to true. * * NOTE: Ignored if cleanupInit is true. 
- * @param bool $value + * * @return Options */ - public function setRemoveStyles(bool $value): self { + public function setRemoveStyles(bool $value): self + { $this->options['removeStyles'] = $value; + return $this; } @@ -132,31 +157,37 @@ public function setRemoveStyles(bool $value): self { * as part of the input clean up process. Defaults to false. * * NOTE: Ignored if cleanupInit is true. - * @param bool $value + * * @return Options */ - public function setPreserveLineBreaks(bool $value): self { + public function setPreserveLineBreaks(bool $value): self + { $this->options['preserveLineBreaks'] = $value; + return $this; } /** * Set this to false if you want to preserve whitespace inside of text nodes. It is set to true by default. - * @param bool $value + * * @return Options */ - public function setRemoveDoubleSpace(bool $value): self { + public function setRemoveDoubleSpace(bool $value): self + { $this->options['removeDoubleSpace'] = $value; + return $this; } /** * Set this to false if you want to preserve smarty script found in the html content. It is set to true by default. - * @param bool $value + * * @return Options */ - public function setRemoveSmartyScripts(bool $value): self { + public function setRemoveSmartyScripts(bool $value): self + { $this->options['removeSmartyScripts'] = $value; + return $this; } @@ -164,49 +195,40 @@ public function setRemoveSmartyScripts(bool $value): self { * By default this is set to false for legacy support. Setting this to true will change the behavior of find * to order elements by depth first. This will properly preserve the order of elements as they where in the HTML. * - * @param bool $value * @return Options + * * @deprecated This option will be removed in version 3.0.0 with the new behavior being as if it was set to true. 
*/ - public function setDepthFirstSearch(bool $value): self { + public function setDepthFirstSearch(bool $value): self + { $this->options['depthFirstSearch'] = $value; + return $this; } /** * By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode too all attribute values and text nodes. - * @param bool $value + * * @return Options */ - public function setHtmlSpecialCharsDecode(bool $value): self { + public function setHtmlSpecialCharsDecode(bool $value): self + { $this->options['htmlSpecialCharsDecode'] = $value; - return $this; - } - /** - * A magic get to call the get() method. - * - * @param string $key - * - * @return mixed - * - * @uses $this->get() - */ - public function __get($key) - { - return $this->get($key); + return $this; } /** * Sets a new options param to override the current option array. * * @chainable + * * @throws UnknownOptionException */ public function setOptions(array $options): Options { foreach ($options as $key => $option) { - if (!array_key_exists($key, $this->defaults)) { + if (!\array_key_exists($key, $this->defaults)) { throw new UnknownOptionException("Option '$key' is not recognized"); } $this->options[$key] = $option; @@ -229,11 +251,12 @@ public function get(string $key) } /** - * Return current options as array + * Return current options as array. * * @return array */ - public function asArray() { + public function asArray() + { return $this->options; } } diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index 0f987903..a70a7a5e 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -4,8 +4,13 @@ namespace PHPHtmlParser\Selector; +use PHPHtmlParser\Contracts\Selector\ParserInterface; +use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO; +use PHPHtmlParser\DTO\Selector\ParsedSelectorDTO; +use PHPHtmlParser\DTO\Selector\RuleDTO; + /** - * This is the parser for the selector. 
+ * This is the default parser for the selector. */ class Parser implements ParserInterface { @@ -14,20 +19,19 @@ class Parser implements ParserInterface * * @var string */ - protected $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + private $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; /** * Parses the selector string. */ - public function parseSelectorString(string $selector): array + public function parseSelectorString(string $selector): ParsedSelectorCollectionDTO { $selectors = []; - $matches = []; + $rules = []; \preg_match_all($this->pattern, \trim($selector) . ' ', $matches, PREG_SET_ORDER); // skip tbody - $result = []; foreach ($matches as $match) { // default values $tag = \strtolower(\trim($match[1])); @@ -88,25 +92,25 @@ public function parseSelectorString(string $selector): array $noKey = true; } - $result[] = [ + $rules[] = new RuleDTO([ 'tag' => $tag, 'key' => $key, 'value' => $value, 'operator' => $operator, 'noKey' => $noKey, 'alterNext' => $alterNext, - ]; + ]); if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') { - $selectors[] = $result; - $result = []; + $selectors[] = new ParsedSelectorDTO($rules); + $rules = []; } } // save last results - if (\count($result) > 0) { - $selectors[] = $result; + if (\count($rules) > 0) { + $selectors[] = new ParsedSelectorDTO($rules); } - return $selectors; + return new ParsedSelectorCollectionDTO($selectors); } } diff --git a/src/PHPHtmlParser/Selector/ParserInterface.php b/src/PHPHtmlParser/Selector/ParserInterface.php deleted file mode 100755 index 755966ae..00000000 --- a/src/PHPHtmlParser/Selector/ParserInterface.php +++ /dev/null @@ -1,10 +0,0 @@ -getTag() !== null && \is_numeric($rule->getKey())) { + $count = 0; + /** @var AbstractNode $node */ + foreach ($nodes as $node) { + if ($rule->getTag() == '*' + || $rule->getTag() == 
$node->getTag() + ->name() + ) { + ++$count; + if ($count == $rule->getKey()) { + // found the node we wanted + return [$node]; + } + } + } + + return []; + } + + $options = $this->flattenOptions($options); + + $return = []; + /** @var InnerNode $node */ + foreach ($nodes as $node) { + // check if we are a leaf + if ($node instanceof LeafNode || !$node->hasChildren() + ) { + continue; + } + + $children = []; + $child = $node->firstChild(); + while (!\is_null($child)) { + // wild card, grab all + if ($rule->getTag() == '*' && \is_null($rule->getKey())) { + $return[] = $child; + $child = $this->getNextChild($node, $child); + continue; + } + + $pass = $this->checkTag($rule, $child); + if ($pass && $rule->getKey() != null) { + $pass = $this->checkKey($rule, $child); + } + if ($pass && + $rule->getKey() != null && + $rule->getValue() != null && + $rule->getValue() != '*' + ) { + $pass = $this->checkComparison($rule, $child); + } + + if ($pass) { + // it passed all checks + $return[] = $child; + } else { + // this child failed to be matched + if ($child instanceof InnerNode && $child->hasChildren() + ) { + if ($depthFirst) { + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { + // we have a child that failed but are not leaves. + $matches = $this->seek([$child], $rule, $options, $depthFirst); + foreach ($matches as $match) { + $return[] = $match; + } + } + } else { + // we still want to check its children + $children[] = $child; + } + } + } + + $child = $this->getNextChild($node, $child); + } + + if ((!isset($options['checkGrandChildren']) + || $options['checkGrandChildren']) + && \count($children) > 0 + ) { + // we have children that failed but are not leaves. + $matches = $this->seek($children, $rule, $options, $depthFirst); + foreach ($matches as $match) { + $return[] = $match; + } + } + } + + return $return; + } + + /** + * Checks comparison condition from rules against node. 
+ */ + private function checkComparison(RuleDTO $rule, AbstractNode $node): bool + { + if ($rule->getKey() == 'plaintext') { + // plaintext search + $nodeValue = $node->text(); + $result = $this->checkNodeValue($nodeValue, $rule, $node); + } else { + // normal search + if (!\is_array($rule->getKey())) { + $nodeValue = $node->getAttribute($rule->getKey()); + $result = $this->checkNodeValue($nodeValue, $rule, $node); + } else { + $result = true; + foreach ($rule->getKey() as $index => $key) { + $nodeValue = $node->getAttribute($key); + $result = $result && + $this->checkNodeValue($nodeValue, $rule, $node, $index); + } + } + } + + return $result; + } + + /** + * Flattens the option array. + * + * @return array + */ + private function flattenOptions(array $optionsArray) + { + $options = []; + foreach ($optionsArray as $optionArray) { + foreach ($optionArray as $key => $option) { + $options[$key] = $option; + } + } + + return $options; + } + + /** + * Returns the next child or null if no more children. + * + * @return AbstractNode|null + */ + private function getNextChild( + AbstractNode $node, + AbstractNode $currentChild + ) { + try { + $child = null; + if ($node instanceof InnerNode) { + // get next child + $child = $node->nextChild($currentChild->id()); + } + } catch (ChildNotFoundException $e) { + // no more children + unset($e); + $child = null; + } + + return $child; + } + + /** + * Checks tag condition from rules against node. + */ + private function checkTag(RuleDTO $rule, AbstractNode $node): bool + { + if (!empty($rule->getTag()) && $rule->getTag() != $node->getTag()->name() + && $rule->getTag() != '*' + ) { + return false; + } + + return true; + } + + /** + * Checks key condition from rules against node. 
+ */ + private function checkKey(RuleDTO $rule, AbstractNode $node): bool + { + if (!\is_array($rule->getKey())) { + if ($rule->isNoKey()) { + if ($node->getAttribute($rule->getKey()) !== null) { + return false; + } + } else { + if ($rule->getKey() != 'plaintext' + && !$node->hasAttribute($rule->getKey()) + ) { + return false; + } + } + } else { + if ($rule->isNoKey()) { + foreach ($rule->getKey() as $key) { + if (!\is_null($node->getAttribute($key))) { + return false; + } + } + } else { + foreach ($rule->getKey() as $key) { + if ($key != 'plaintext' + && !$node->hasAttribute($key) + ) { + return false; + } + } + } + } + + return true; + } + + private function checkNodeValue( + ?string $nodeValue, + RuleDTO $rule, + AbstractNode $node, + ?int $index = null + ): bool { + $check = false; + if ( + $rule->getValue() != null && + \is_string($rule->getValue()) + ) { + $check = $this->match($rule->getOperator(), $rule->getValue(), $nodeValue); + } + + // handle multiple classes + $key = $rule->getKey(); + if ( + !$check && + $key == 'class' && + \is_array($rule->getValue()) + ) { + $nodeClasses = \explode(' ', $node->getAttribute('class') ?? ''); + foreach ($rule->getValue() as $value) { + foreach ($nodeClasses as $class) { + if ( + !empty($class) && + \is_string($rule->getOperator()) + ) { + $check = $this->match($rule->getOperator(), $value, $class); + } + if ($check) { + break; + } + } + if (!$check) { + break; + } + } + } elseif ( + !$check && + \is_array($key) && + !\is_null($nodeValue) && + \is_string($rule->getOperator()) && + \is_string($rule->getValue()[$index]) + ) { + $check = $this->match($rule->getOperator(), $rule->getValue()[$index], $nodeValue); + } + + return $check; + } + + /** + * Attempts to match the given arguments with the given operator. 
+ */ + private function match( + string $operator, + string $pattern, + string $value + ): bool { + $value = \strtolower($value); + $pattern = \strtolower($pattern); + switch ($operator) { + case '=': + return $value === $pattern; + case '!=': + return $value !== $pattern; + case '^=': + return \preg_match('/^' . \preg_quote($pattern, '/') . '/', + $value) == 1; + case '$=': + return \preg_match('/' . \preg_quote($pattern, '/') . '$/', + $value) == 1; + case '*=': + if ($pattern[0] == '/') { + return \preg_match($pattern, $value) == 1; + } + + return \preg_match('/' . $pattern . '/i', $value) == 1; + } + + return false; + } +} diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index 21d6920d..95c47001 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -4,33 +4,51 @@ namespace PHPHtmlParser\Selector; +use PHPHtmlParser\Contracts\Selector\ParserInterface; +use PHPHtmlParser\Contracts\Selector\SeekerInterface; +use PHPHtmlParser\Contracts\Selector\SelectorInterface; +use PHPHtmlParser\Discovery\ParserDiscovery; +use PHPHtmlParser\Discovery\SeekerDiscovery; use PHPHtmlParser\Dom\AbstractNode; use PHPHtmlParser\Dom\Collection; -use PHPHtmlParser\Dom\InnerNode; -use PHPHtmlParser\Dom\LeafNode; +use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO; +use PHPHtmlParser\DTO\Selector\RuleDTO; use PHPHtmlParser\Exceptions\ChildNotFoundException; /** * Class Selector. */ -class Selector +class Selector implements SelectorInterface { /** - * @var array + * @var ParsedSelectorCollectionDTO */ - protected $selectors = []; + private $ParsedSelectorCollectionDTO; /** * @var bool */ private $depthFirst = false; + /** + * @var SeekerInterface + */ + private $seeker; + /** * Constructs with the selector string. 
*/ - public function __construct(string $selector, ParserInterface $parser) + public function __construct(string $selector, ?ParserInterface $parser = null, ?SeekerInterface $seeker = null) { - $this->selectors = $parser->parseSelectorString($selector); + if ($parser == null) { + $parser = ParserDiscovery::find(); + } + if ($seeker == null) { + $seeker = SeekerDiscovery::find(); + } + + $this->ParsedSelectorCollectionDTO = $parser->parseSelectorString($selector); + $this->seeker = $seeker; } /** @@ -38,9 +56,9 @@ public function __construct(string $selector, ParserInterface $parser) * * @return array */ - public function getSelectors() + public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO { - return $this->selectors; + return $this->ParsedSelectorCollectionDTO; } public function setDepthFirstFind(bool $status): void @@ -57,19 +75,19 @@ public function setDepthFirstFind(bool $status): void public function find(AbstractNode $node): Collection { $results = new Collection(); - foreach ($this->selectors as $selector) { + foreach ($this->ParsedSelectorCollectionDTO->getParsedSelectorDTO() as $selector) { $nodes = [$node]; - if (\count($selector) == 0) { + if (\count($selector->getRules()) == 0) { continue; } $options = []; - foreach ($selector as $rule) { - if ($rule['alterNext']) { + foreach ($selector->getRules() as $rule) { + if ($rule->isAlterNext()) { $options[] = $this->alterNext($rule); continue; } - $nodes = $this->seek($nodes, $rule, $options); + $nodes = $this->seeker->seek($nodes, $rule, $options, $this->depthFirst); // clear the options $options = []; } @@ -83,326 +101,17 @@ public function find(AbstractNode $node): Collection return $results; } - /** - * Checks comparison condition from rules against node. 
- */ - public function checkComparison(array $rule, AbstractNode $node): bool - { - if ($rule['key'] == 'plaintext') { - // plaintext search - $nodeValue = $node->text(); - $result = $this->checkNodeValue($nodeValue, $rule, $node); - } else { - // normal search - if (!\is_array($rule['key'])) { - $nodeValue = $node->getAttribute($rule['key']); - $result = $this->checkNodeValue($nodeValue, $rule, $node); - } else { - $result = true; - foreach ($rule['key'] as $index => $key) { - $nodeValue = $node->getAttribute($key); - $result = $result && - $this->checkNodeValue($nodeValue, $rule, $node, $index); - } - } - } - - return $result; - } - - /** - * Attempts to find all children that match the rule - * given. - * - * @throws ChildNotFoundException - */ - protected function seek(array $nodes, array $rule, array $options): array - { - // XPath index - if (\array_key_exists('tag', $rule) && \array_key_exists('key', $rule) - && \is_numeric($rule['key']) - ) { - $count = 0; - /** @var AbstractNode $node */ - foreach ($nodes as $node) { - if ($rule['tag'] == '*' - || $rule['tag'] == $node->getTag() - ->name() - ) { - ++$count; - if ($count == $rule['key']) { - // found the node we wanted - return [$node]; - } - } - } - - return []; - } - - $options = $this->flattenOptions($options); - - $return = []; - /** @var InnerNode $node */ - foreach ($nodes as $node) { - // check if we are a leaf - if ($node instanceof LeafNode || !$node->hasChildren() - ) { - continue; - } - - $children = []; - $child = $node->firstChild(); - while (!\is_null($child)) { - // wild card, grab all - if ($rule['tag'] == '*' && \is_null($rule['key'])) { - $return[] = $child; - $child = $this->getNextChild($node, $child); - continue; - } - - $pass = $this->checkTag($rule, $child); - if ($pass && !\is_null($rule['key'])) { - $pass = $this->checkKey($rule, $child); - } - if ($pass && !\is_null($rule['key']) && !\is_null($rule['value']) - && $rule['value'] != '*' - ) { - $pass = $this->checkComparison($rule, 
$child); - } - - if ($pass) { - // it passed all checks - $return[] = $child; - } else { - // this child failed to be matched - if ($child instanceof InnerNode && $child->hasChildren() - ) { - if ($this->depthFirst) { - if (!isset($options['checkGrandChildren']) - || $options['checkGrandChildren'] - ) { - // we have a child that failed but are not leaves. - $matches = $this->seek([$child], $rule, - $options); - foreach ($matches as $match) { - $return[] = $match; - } - } - } else { - // we still want to check its children - $children[] = $child; - } - } - } - - $child = $this->getNextChild($node, $child); - } - - if ((!isset($options['checkGrandChildren']) - || $options['checkGrandChildren']) - && \count($children) > 0 - ) { - // we have children that failed but are not leaves. - $matches = $this->seek($children, $rule, $options); - foreach ($matches as $match) { - $return[] = $match; - } - } - } - - return $return; - } - - /** - * Attempts to match the given arguments with the given operator. - */ - protected function match( - string $operator, - string $pattern, - string $value - ): bool { - $value = \strtolower($value); - $pattern = \strtolower($pattern); - switch ($operator) { - case '=': - return $value === $pattern; - case '!=': - return $value !== $pattern; - case '^=': - return \preg_match('/^' . \preg_quote($pattern, '/') . '/', - $value) == 1; - case '$=': - return \preg_match('/' . \preg_quote($pattern, '/') . '$/', - $value) == 1; - case '*=': - if ($pattern[0] == '/') { - return \preg_match($pattern, $value) == 1; - } - - return \preg_match('/' . $pattern . '/i', $value) == 1; - } - - return false; - } - /** * Attempts to figure out what the alteration will be for * the next element. 
*/ - protected function alterNext(array $rule): array + private function alterNext(RuleDTO $rule): array { $options = []; - if ($rule['tag'] == '>') { + if ($rule->getTag() == '>') { $options['checkGrandChildren'] = false; } return $options; } - - /** - * Flattens the option array. - * - * @return array - */ - protected function flattenOptions(array $optionsArray) - { - $options = []; - foreach ($optionsArray as $optionArray) { - foreach ($optionArray as $key => $option) { - $options[$key] = $option; - } - } - - return $options; - } - - /** - * Returns the next child or null if no more children. - * - * @return AbstractNode|null - */ - protected function getNextChild( - AbstractNode $node, - AbstractNode $currentChild - ) { - try { - $child = null; - if ($node instanceof InnerNode) { - // get next child - $child = $node->nextChild($currentChild->id()); - } - } catch (ChildNotFoundException $e) { - // no more children - unset($e); - $child = null; - } - - return $child; - } - - /** - * Checks tag condition from rules against node. - */ - protected function checkTag(array $rule, AbstractNode $node): bool - { - if (!empty($rule['tag']) && $rule['tag'] != $node->getTag()->name() - && $rule['tag'] != '*' - ) { - return false; - } - - return true; - } - - /** - * Checks key condition from rules against node. 
- */ - protected function checkKey(array $rule, AbstractNode $node): bool - { - if (!\is_array($rule['key'])) { - if ($rule['noKey']) { - if (!\is_null($node->getAttribute($rule['key']))) { - return false; - } - } else { - if ($rule['key'] != 'plaintext' - && !$node->hasAttribute($rule['key']) - ) { - return false; - } - } - } else { - if ($rule['noKey']) { - foreach ($rule['key'] as $key) { - if (!\is_null($node->getAttribute($key))) { - return false; - } - } - } else { - foreach ($rule['key'] as $key) { - if ($key != 'plaintext' - && !$node->hasAttribute($key) - ) { - return false; - } - } - } - } - - return true; - } - - private function checkNodeValue( - ?string $nodeValue, - array $rule, - AbstractNode $node, - ?int $index = null - ): bool { - $check = false; - if ( - \array_key_exists('value', $rule) && !\is_array($rule['value']) && - !\is_null($nodeValue) && - \array_key_exists('operator', $rule) && \is_string($rule['operator']) && - \array_key_exists('value', $rule) && \is_string($rule['value']) - ) { - $check = $this->match($rule['operator'], $rule['value'], $nodeValue); - } - - // handle multiple classes - $key = $rule['key']; - if ( - !$check && - $key == 'class' && - \array_key_exists('value', $rule) && \is_array($rule['value']) - ) { - $nodeClasses = \explode(' ', $node->getAttribute('class') ?? 
''); - foreach ($rule['value'] as $value) { - foreach ($nodeClasses as $class) { - if ( - !empty($class) && - \array_key_exists('operator', $rule) && \is_string($rule['operator']) - ) { - $check = $this->match($rule['operator'], $value, $class); - } - if ($check) { - break; - } - } - if (!$check) { - break; - } - } - } elseif ( - !$check && - \is_array($key) && - !\is_null($nodeValue) && - \array_key_exists('operator', $rule) && \is_string($rule['operator']) && - \array_key_exists('value', $rule) && \is_string($rule['value'][$index]) - ) { - $check = $this->match($rule['operator'], $rule['value'][$index], $nodeValue); - } - - return $check; - } } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index ed83b177..91c62591 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -4,8 +4,8 @@ use PHPHtmlParser\Dom; use PHPHtmlParser\Exceptions\UnknownOptionException; -use PHPUnit\Framework\TestCase; use PHPHtmlParser\Options; +use PHPUnit\Framework\TestCase; class OptionsTest extends TestCase { @@ -18,7 +18,7 @@ public function testDefaultWhitespaceTextNode() public function testSettingOption() { - $options = new Options; + $options = new Options(); $options->setOptions([ 'strict' => true, ]); @@ -42,7 +42,7 @@ public function testOverwritingOption() $options->setOptions([ 'strict' => false, ])->setOptions([ - 'strict' => true, + 'strict' => true, 'whitespaceTextNode' => false, ]); @@ -56,7 +56,8 @@ public function testGettingNoOption() $this->assertEquals(null, $options->get('doesnotexist')); } - public function testSetters() { + public function testSetters() + { $options = new Options(); $options->setOptions([ @@ -79,8 +80,8 @@ public function testSetters() { $options->setStrict(true); $this->assertTrue($options->get('strict')); - $options->setEnforceEncoding("utf8"); - $this->assertEquals("utf8", $options->get('enforceEncoding')); + $options->setEnforceEncoding('utf8'); + $this->assertEquals('utf8', $options->get('enforceEncoding')); 
$options->setCleanupInput(true); $this->assertTrue($options->get('cleanupInput')); @@ -142,8 +143,9 @@ public function testSetters() { $this->assertFalse($options->get('htmlSpecialCharsDecode')); } - public function testUnknownOptionDom() { - $dom = new Dom; + public function testUnknownOptionDom() + { + $dom = new Dom(); $dom->setOptions([ 'unknown_option' => true, ]); diff --git a/tests/Selector/SelectorTest.php b/tests/Selector/SelectorTest.php index 261b3cb8..d2a12a59 100755 --- a/tests/Selector/SelectorTest.php +++ b/tests/Selector/SelectorTest.php @@ -13,29 +13,29 @@ class SelectorTest extends TestCase public function testParseSelectorStringId() { $selector = new Selector('#all', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertEquals('id', $selectors[0][0]['key']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + $this->assertEquals('id', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getKey()); } public function testParseSelectorStringClass() { $selector = new Selector('div.post', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertEquals('class', $selectors[0][0]['key']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + $this->assertEquals('class', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getKey()); } public function testParseSelectorStringAttribute() { $selector = new Selector('div[visible=yes]', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertEquals('yes', $selectors[0][0]['value']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + $this->assertEquals('yes', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getValue()); } public function testParseSelectorStringNoKey() { $selector = new Selector('div[!visible]', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertTrue($selectors[0][0]['noKey']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + 
$this->assertTrue($selectors->getParsedSelectorDTO()[0]->getRules()[0]->isNoKey()); } public function testFind() From 04321f991ba37b9b47ee22ae52dc2319c353a6b0 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 11 May 2020 00:52:31 +0000 Subject: [PATCH 166/200] fixes #82 --- CHANGELOG.md | 2 + src/PHPHtmlParser/Dom.php | 58 +- src/PHPHtmlParser/Selector/Seeker.php | 4 +- tests/DomTest.php | 71 + tests/Selector/SeekerTest.php | 25 + tests/data/files/big.html | 2 +- tests/data/files/html5.html | 2957 +++++++++++++++++++++++++ 7 files changed, 3096 insertions(+), 23 deletions(-) create mode 100644 tests/Selector/SeekerTest.php create mode 100644 tests/data/files/html5.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 770a5d92..8daa5304 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added support for PSR7 HTTP clients and requests for URL calls. - Added PHAN support and fixed all issues from PHAN. +- Added support for html5 charset detection. ### Changed - Fixed issue with \ causing an infite loop. - CDATA should not be altered when cleanupInput is false. - Added tag attribute DTO. - Cleaned up the selector logic. +- Fixed issue with greedy regex for charset detection. ### Removed - Removed curl interface and curl implementation. diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 9a980ab5..8c2ebcde 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -39,42 +39,42 @@ class Dom * * @var string */ - protected $defaultCharset = 'UTF-8'; + private $defaultCharset = 'UTF-8'; /** * The raw version of the document string. * * @var string */ - protected $raw; + private $raw; /** * The document string. * * @var Content */ - protected $content; + private $content; /** * The original file size of the document. * * @var int */ - protected $rawSize; + private $rawSize; /** * The size of the document after it is cleaned. 
* * @var int */ - protected $size; + private $size; /** * A global options array to be used by all load calls. * * @var array */ - protected $globalOptions = []; + private $globalOptions = []; /** * A persistent option object to be used for all options in the @@ -82,14 +82,14 @@ class Dom * * @var Options */ - protected $options; + private $options; /** * A list of tags which will always be self closing. * * @var array */ - protected $selfClosing = [ + private $selfClosing = [ 'area', 'base', 'basefont', @@ -114,7 +114,7 @@ class Dom * * @var array */ - protected $noSlash = []; + private $noSlash = []; /** * Returns the inner html of the root node. @@ -173,7 +173,7 @@ public function load(string $str, array $options = []): Dom */ public function loadFromFile(string $file, array $options = []): Dom { - $content = \file_get_contents($file); + $content = @\file_get_contents($file); if ($content === false) { throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".'); } @@ -496,7 +496,7 @@ public function getElementsByClass(string $class) * * @throws NotLoadedException */ - protected function isLoaded(): void + private function isLoaded(): void { if (\is_null($this->content)) { throw new NotLoadedException('Content is not loaded!'); @@ -506,7 +506,7 @@ protected function isLoaded(): void /** * Cleans the html of any none-html information. 
*/ - protected function clean(string $str): string + private function clean(string $str): string { if ($this->options->get('cleanupInput') != true) { // skip entire cleanup step @@ -610,7 +610,7 @@ protected function clean(string $str): string * @throws StrictException * @throws LogicalException */ - protected function parse(): void + private function parse(): void { // add the root node $this->root = new HtmlNode('root'); @@ -679,7 +679,7 @@ protected function parse(): void * * @throws StrictException */ - protected function parseTag(): array + private function parseTag(): array { $return = [ 'status' => false, @@ -823,7 +823,7 @@ protected function parseTag(): array * * @throws ChildNotFoundException */ - protected function detectCharset(): bool + private function detectCharset(): bool { // set the default $encode = new Encode(); @@ -841,11 +841,15 @@ protected function detectCharset(): bool /** @var AbstractNode $meta */ $meta = $this->root->find('meta[http-equiv=Content-Type]', 0); - if (\is_null($meta)) { - // could not find meta tag - $this->root->propagateEncoding($encode); + if ($meta == null) { + if (!$this->detectHTML5Charset($encode)) { + // could not find meta tag + $this->root->propagateEncoding($encode); - return false; + return false; + } + + return true; } $content = $meta->getAttribute('content'); if (\is_null($content)) { @@ -855,7 +859,7 @@ protected function detectCharset(): bool return false; } $matches = []; - if (\preg_match('/charset=(.+)/', $content, $matches)) { + if (\preg_match('/charset=([^;]+)/', $content, $matches)) { $encode->from(\trim($matches[1])); $this->root->propagateEncoding($encode); @@ -867,4 +871,18 @@ protected function detectCharset(): bool return false; } + + private function detectHTML5Charset(Encode $encode): bool + { + /** @var AbstractNode|null $meta */ + $meta = $this->root->find('meta[charset]', 0); + if ($meta == null) { + return false; + } + + $encode->from(\trim($meta->getAttribute('charset'))); + 
$this->root->propagateEncoding($encode); + + return true; + } } diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index e9ed2484..971c40f9 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -314,8 +314,8 @@ private function match( } return \preg_match('/' . $pattern . '/i', $value) == 1; + default: + return false; } - - return false; } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 0a50021e..ea570561 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -196,6 +196,13 @@ public function testLoadFromFileFind() $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } + public function testLoadFromFileNotFound() + { + $dom = new Dom(); + $this->expectException(\PHPHtmlParser\Exceptions\LogicalException::class); + $dom->loadFromFile('tests/data/files/unkowne.html'); + } + public function testLoadUtf8() { $dom = new Dom(); @@ -531,6 +538,60 @@ public function testMultipleSquareSelector() $this->assertEquals(1, \count($items)); } + public function testNotSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[type!=foo]'); + $this->assertEquals(1, \count($items)); + } + + public function testStartSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[name^=f]'); + $this->assertEquals(1, \count($items)); + } + + public function testEndSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz$=g]'); + $this->assertEquals(1, \count($items)); + } + + public function testStarSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz*=*]'); + $this->assertEquals(1, \count($items)); + } + + public function testStarFullRegexSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz*=/\w+/]'); + $this->assertEquals(1, \count($items)); + } + + public function 
testFailedSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz%=g]'); + $this->assertEquals(1, \count($items)); + } + public function testLoadGetAttributeWithBackslash() { $dom = new Dom(); @@ -547,4 +608,14 @@ public function test25ChildrenFound() $children = $dom->find('#red-line-g *'); $this->assertEquals(25, \count($children)); } + + public function testHtml5PageLoad() + { + $dom = new Dom(); + $dom->loadFromFile('tests/data/files/html5.html'); + + /** @var Dom\AbstractNode $meta */ + $div = $dom->find('div.d-inline-block', 0); + $this->assertEquals('max-width: 29px', $div->getAttribute('style')); + } } diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php new file mode 100644 index 00000000..4e2d9e4f --- /dev/null +++ b/tests/Selector/SeekerTest.php @@ -0,0 +1,25 @@ + 'tag', + 'key' => 1, + 'value' => null, + 'operator' => null, + 'noKey' => false, + 'alterNext' => false, + ]); + $seeker = new Seeker(); + $results = $seeker->seek([], $ruleDTO, [], false); + $this->assertCount(0, $results); + } +} diff --git a/tests/data/files/big.html b/tests/data/files/big.html index 6b5e3ee5..a26f5093 100755 --- a/tests/data/files/big.html +++ b/tests/data/files/big.html @@ -2,7 +2,7 @@ - + diff --git a/tests/data/files/html5.html b/tests/data/files/html5.html new file mode 100644 index 00000000..b2b1413d --- /dev/null +++ b/tests/data/files/html5.html @@ -0,0 +1,2957 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Dom.php uses greedy RegEx to match charset · Issue #82 · paquettg/php-html-parser + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content + + + + + + + + + + + + + +
+ +
+ + + + +
+ + + +
+ + + + + + + + + +
+
+
+ + + + + + + + + + + + + +
+ +
+ +
+

+ + + + + / + + php-html-parser + + +

+ + +
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Unwatch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • + + + Fork + + +
  • +
+ +
+ + + + + +
+ + + + + + +
+
+ + + +
+ + + + +
+ + +
+
+
+
+ + + + New issue + + +
+ +

+ + Dom.php uses greedy RegEx to match charset + + #82 +

+
+
+ +
+
+ +
+ + +
+
+
+
+ + + Open + + + +
+
+ thinkingmedia opened this issue + Jul 17, 2016 + · 0 comments + + + + + +
+
+ + +
+
+
+
+
+ + + Open + + + +
+
+

+ Dom.php uses greedy RegEx to match charset + #82 +

+ +
+ thinkingmedia opened this issue + Jul 17, 2016 + · 0 comments + + + + + +
+
+
+
+
+
+
+
+ + + +
+ +
+ Labels +
+ bug +
+
+ +
+ Projects + +
+ +
+ Milestone + +
+
+ + +

Comments

+
+
+ +
+
+ +
+ +
+ + +
+ @thinkingmedia + +
+ + +
+
+ + + +
+
+ + + + + + + + + + + + + + + + + +
+ + + + + +
+

+ Pick your reaction +

+ + + +
+ + + + + + + + + + +
+
+ +
+ + + + +
+ + + + + + Copy link + + + + + + + + + + + + Report content + + + +
+ +
+ + +
+ + + + + + +
+ +

+ + + @thinkingmedia + + + + + thinkingmedia + + + + + + commented + + + Jul 17, 2016 + + + + +

+
+ + +
+ + + + + + + + + +
+

I found an edge case where Dom.php would call detectCharset and extract an invalid charset

+

The example comes from https://duckduckgo.com/

+

They have this meta tag.

+
    <meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
+
+

The problem is that this section of Dom.php uses a greedy regex.

+
        $matches = [];
+        if (preg_match('/charset=(.+)/', $content, $matches)) {
+            $encode->from(trim($matches[1]));
+            $this->root->propagateEncoding($encode);
+
+            return true;
+        }
+
+

So I changed it to this and it works.

+
        if (preg_match('/charset=([^;]+)/', $content, $matches)) {
+
+

I use the ; character as a terminator for the charset identifier.

+

I also noticed that the Dom.php does not support this meta tag.

+
<meta charset="utf-8" />
+
+

This is the new charset identifier for HTML5.

+
+
+ + + + +
+ +
+ +
+ +
+
+ + +
+ + +
+ +
+ +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ + + + + +
+ Select a reply + ctrl . +
+ + + +
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ + +
+ +

+ The content you are editing has changed. Please try again. +

+ + + +
+ + + + + + + + + + + + + + + + +
+
+ +
+
+
+

Nothing to preview

+
+
+ +
+ +
+ + +
+ + +
+
+ + +
+ +
+
+ +
+ + + + +
+ + +
+ + + + +
+ +
+
+ +
+
+ + + + @paquettg + paquettg + + + + + + added + the + + bug + + label + + + Jan 30, 2019 + +
+
+ + +
+
+ +
+
+ + + + @paquettg + paquettg + + + + + + added this to the 3.0.0 milestone + + + Aug 18, 2019 + +
+
+ + +
+
+ +
+
+ + + + @paquettg + paquettg + + + + added this to To do + in 3.0.0 + + + Aug 18, 2019 + +
+
+ + + + +
+ + + + + + + +
+
+ +
+ +
+ + +
+ + + +
+
+
+ @paquettg +
+ +
+ +
+ + + +
+
+ + +
+ +
+ +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ + + + + +
+ Select a reply + ctrl . +
+ + + +
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ + + + +
+ + + + + + + + + +
+
+ + + + + +
+ +
+
+ +
+
+ + +
+
+ +
+
+
+ +
+
+
+
+ Remember, contributions to this repository should follow + its + contributing guidelines. +
+ + +
+ + +
+
+ +
+ +
+ +
+ + + + + + + +
+
+ +
+ + + + Projects + + + +
+ Projects + +
+ + + + +
+
+ + + + + +
+
+ +
+
+ + 3.0.0 +
+
+ +    + +
+ +
+ +
+ +
+
+
+ + +
+ +
+ + + + +
+
+ +
+ + + + Linked pull requests + + + +
+ Link a pull request from this repository + +
+ + + + +
+
+ + + +

Successfully merging a pull request may close this issue.

+ + None yet + +
+
+ + + +
+
+ +
+ 2 participants +
+ +
+
+ + + +
+
+ + + Lock conversation + + +
+
+ +

+ Lock conversation on this issue +

+
+
+
    +
  • Other users can’t add new comments to this issue.
  • +
  • + You and other collaborators + with access + to this repository can still leave comments that others can see. +
  • +
  • You can always unlock this issue again in the future.
  • +
+ +
+
+ +
+
+ +

+ Optionally, choose a reason for locking that others can see. Learn more about when + it’s appropriate to lock conversations. +

+
+
+
+ +
+
+
+ +
+
+ + +
+ + +
+
+
+ + + Transfer issue + + + + +
+ Loading transfer form... +
+
+
+
+
+
+ +
+
+ + + Delete issue + + + +
+ + + + +
+

Are you sure you want to delete this issue?

+
+
    +
  • This cannot be undone
  • +
  • Only administrators can delete issues
  • +
  • Deletion will remove the issue from search and previous references will point to a placeholder
  • +
+
+ + +
+
+
+
+ + + +
+ + +
+
+ + +
+
+ +
+ + +
+
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + + + + + + \ No newline at end of file From 9d8149016d0eb45b6695d860dae7581dbdcc4b98 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 11 May 2020 01:01:36 +0000 Subject: [PATCH 167/200] fix #103 --- CHANGELOG.md | 10 ++++---- src/PHPHtmlParser/Selector/Seeker.php | 34 +++++++++++++-------------- tests/DomTest.php | 14 ++++++++++- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8daa5304..1de2d167 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Added support for PSR7 HTTP clients and requests for URL calls. -- Added PHAN support and fixed all issues from PHAN. -- Added support for html5 charset detection. +- Support for PSR7 HTTP clients and requests for URL calls has been added. +- PHAN support and fixed all issues from PHAN has been added. +- PHP-CS-Fixer added. +- Support for html5 charset detection. +- Added the ability to match both parent and children. ### Changed - Fixed issue with \ causing an infite loop. @@ -20,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed issue with greedy regex for charset detection. ### Removed -- Removed curl interface and curl implementation. +- Curl interface and curl implementation has been removed. ## 2.2.0 diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index 971c40f9..fa101e9e 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -17,6 +17,7 @@ class Seeker implements SeekerInterface * Attempts to find all children that match the rule * given. 
* + * @var InnerNode[] $nodes * @throws ChildNotFoundException */ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array @@ -24,7 +25,6 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir // XPath index if ($rule->getTag() !== null && \is_numeric($rule->getKey())) { $count = 0; - /** @var AbstractNode $node */ foreach ($nodes as $node) { if ($rule->getTag() == '*' || $rule->getTag() == $node->getTag() @@ -44,7 +44,6 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir $options = $this->flattenOptions($options); $return = []; - /** @var InnerNode $node */ foreach ($nodes as $node) { // check if we are a leaf if ($node instanceof LeafNode || !$node->hasChildren() @@ -77,24 +76,23 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir if ($pass) { // it passed all checks $return[] = $child; - } else { - // this child failed to be matched - if ($child instanceof InnerNode && $child->hasChildren() - ) { - if ($depthFirst) { - if (!isset($options['checkGrandChildren']) - || $options['checkGrandChildren'] - ) { - // we have a child that failed but are not leaves. - $matches = $this->seek([$child], $rule, $options, $depthFirst); - foreach ($matches as $match) { - $return[] = $match; - } + } + // this child failed to be matched + if ($child instanceof InnerNode && $child->hasChildren() + ) { + if ($depthFirst) { + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { + // we have a child that failed but are not leaves. 
+ $matches = $this->seek([$child], $rule, $options, $depthFirst); + foreach ($matches as $match) { + $return[] = $match; } - } else { - // we still want to check its children - $children[] = $child; } + } else { + // we still want to check its children + $children[] = $child; } } diff --git a/tests/DomTest.php b/tests/DomTest.php index ea570561..7c29b508 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -223,7 +223,7 @@ public function testLoadFileBig() { $dom = new Dom(); $dom->loadFromFile('tests/data/files/big.html'); - $this->assertEquals(10, \count($dom->find('.content-border'))); + $this->assertEquals(20, \count($dom->find('.content-border'))); } public function testLoadFileBigTwice() @@ -618,4 +618,16 @@ public function testHtml5PageLoad() $div = $dom->find('div.d-inline-block', 0); $this->assertEquals('max-width: 29px', $div->getAttribute('style')); } + + public function testFindAttributeInBothParentAndChild() + { + $dom = new Dom(); + $dom->load(' + +'); + + /** @var Dom\AbstractNode $meta */ + $nodes = $dom->find('[attribute]'); + $this->assertCount(2, $nodes); + } } From 4e13ad24dadd0313ed48448632e1bc317b9c780c Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 12 May 2020 02:49:20 +0000 Subject: [PATCH 168/200] Removed the depricated depthFirstSearch option --- CHANGELOG.md | 1 + .../Contracts/Selector/SeekerInterface.php | 2 +- .../Contracts/Selector/SelectorInterface.php | 2 -- src/PHPHtmlParser/Dom.php | 7 +----- src/PHPHtmlParser/Dom/AbstractNode.php | 3 +-- src/PHPHtmlParser/Options.php | 17 -------------- src/PHPHtmlParser/Selector/Seeker.php | 23 ++++++++----------- src/PHPHtmlParser/Selector/Selector.php | 12 +--------- tests/DomTest.php | 13 ----------- tests/OptionsTest.php | 7 ------ 10 files changed, 14 insertions(+), 73 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1de2d167..3cd22cd0 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0 ### Removed - Curl interface and curl implementation has been removed. +- Removed support for the depth first search option. ## 2.2.0 diff --git a/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php b/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php index cca4eb54..23357795 100644 --- a/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php +++ b/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php @@ -13,5 +13,5 @@ interface SeekerInterface * * @throws ChildNotFoundException */ - public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array; + public function seek(array $nodes, RuleDTO $rule, array $options): array; } diff --git a/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php b/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php index 8eca7d1e..c1aceeb9 100644 --- a/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php +++ b/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php @@ -21,8 +21,6 @@ public function __construct(string $selector, ?ParserInterface $parser = null, ? */ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO; - public function setDepthFirstFind(bool $status): void; - /** * Attempts to find the selectors starting from the given * node object. 
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 8c2ebcde..fa659f67 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -259,12 +259,7 @@ public function find(string $selector, int $nth = null) { $this->isLoaded(); - $depthFirstSearch = $this->options->get('depthFirstSearch'); - if (\is_bool($depthFirstSearch)) { - $result = $this->root->find($selector, $nth, $depthFirstSearch); - } else { - $result = $this->root->find($selector, $nth); - } + $result = $this->root->find($selector, $nth); return $result; } diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 0d096904..596a3ae8 100755 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -439,13 +439,12 @@ public function ancestorByTag(string $tag): AbstractNode * * @return mixed|Collection|null */ - public function find(string $selectorString, ?int $nth = null, bool $depthFirst = false, ?SelectorInterface $selector = null) + public function find(string $selectorString, ?int $nth = null, ?SelectorInterface $selector = null) { if (\is_null($selector)) { $selector = new Selector($selectorString); } - $selector->setDepthFirstFind($depthFirst); $nodes = $selector->find($this); if ($nth !== null) { diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index e90e435f..b7e1cd17 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -18,7 +18,6 @@ * @property bool $preserveLineBreaks * @property bool $removeDoubleSpace * @property bool $removeSmartyScripts - * @property bool $depthFirstSearch * @property bool $htmlSpecialCharsDecode */ class Options @@ -38,7 +37,6 @@ class Options 'preserveLineBreaks' => false, 'removeDoubleSpace' => true, 'removeSmartyScripts' => true, - 'depthFirstSearch' => false, 'htmlSpecialCharsDecode' => false, ]; @@ -191,21 +189,6 @@ public function setRemoveSmartyScripts(bool $value): self return $this; } - /** - * By 
default this is set to false for legacy support. Setting this to true will change the behavior of find - * to order elements by depth first. This will properly preserve the order of elements as they where in the HTML. - * - * @return Options - * - * @deprecated This option will be removed in version 3.0.0 with the new behavior being as if it was set to true. - */ - public function setDepthFirstSearch(bool $value): self - { - $this->options['depthFirstSearch'] = $value; - - return $this; - } - /** * By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode too all attribute values and text nodes. * diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index fa101e9e..93d1bc1c 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -20,7 +20,7 @@ class Seeker implements SeekerInterface * @var InnerNode[] $nodes * @throws ChildNotFoundException */ - public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array + public function seek(array $nodes, RuleDTO $rule, array $options): array { // XPath index if ($rule->getTag() !== null && \is_numeric($rule->getKey())) { @@ -80,19 +80,14 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir // this child failed to be matched if ($child instanceof InnerNode && $child->hasChildren() ) { - if ($depthFirst) { - if (!isset($options['checkGrandChildren']) - || $options['checkGrandChildren'] - ) { - // we have a child that failed but are not leaves. - $matches = $this->seek([$child], $rule, $options, $depthFirst); - foreach ($matches as $match) { - $return[] = $match; - } + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { + // we have a child that failed but are not leaves. 
+ $matches = $this->seek([$child], $rule, $options); + foreach ($matches as $match) { + $return[] = $match; } - } else { - // we still want to check its children - $children[] = $child; } } @@ -104,7 +99,7 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir && \count($children) > 0 ) { // we have children that failed but are not leaves. - $matches = $this->seek($children, $rule, $options, $depthFirst); + $matches = $this->seek($children, $rule, $options); foreach ($matches as $match) { $return[] = $match; } diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index 95c47001..4c45da01 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -25,11 +25,6 @@ class Selector implements SelectorInterface */ private $ParsedSelectorCollectionDTO; - /** - * @var bool - */ - private $depthFirst = false; - /** * @var SeekerInterface */ @@ -61,11 +56,6 @@ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO return $this->ParsedSelectorCollectionDTO; } - public function setDepthFirstFind(bool $status): void - { - $this->depthFirst = $status; - } - /** * Attempts to find the selectors starting from the given * node object. @@ -87,7 +77,7 @@ public function find(AbstractNode $node): Collection $options[] = $this->alterNext($rule); continue; } - $nodes = $this->seeker->seek($nodes, $rule, $options, $this->depthFirst); + $nodes = $this->seeker->seek($nodes, $rule, $options); // clear the options $options = []; } diff --git a/tests/DomTest.php b/tests/DomTest.php index 7c29b508..9922f17f 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -483,19 +483,6 @@ public function testFindOrder() $dom->load($str); $images = $dom->find('img'); - $this->assertEquals('', (string) $images[0]); - } - - public function testFindDepthFirstSearch() - { - $str = '

'; - $dom = new Dom(); - $dom->setOptions([ - 'depthFirstSearch' => true, - ]); - $dom->load($str); - $images = $dom->find('img'); - $this->assertEquals('', (string) $images[0]); } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index 91c62591..899a0622 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -70,7 +70,6 @@ public function testSetters() 'preserveLineBreaks' => false, 'removeDoubleSpace' => false, 'removeSmartyScripts' => false, - 'depthFirstSearch' => false, 'htmlSpecialCharsDecode' => false, ]); @@ -101,9 +100,6 @@ public function testSetters() $options->setRemoveSmartyScripts(true); $this->assertTrue($options->get('removeSmartyScripts')); - $options->setDepthFirstSearch(true); - $this->assertTrue($options->get('depthFirstSearch')); - $options->setHtmlSpecialCharsDecode(true); $this->assertTrue($options->get('htmlSpecialCharsDecode')); @@ -136,9 +132,6 @@ public function testSetters() $options->setRemoveSmartyScripts(false); $this->assertFalse($options->get('removeSmartyScripts')); - $options->setDepthFirstSearch(false); - $this->assertFalse($options->get('depthFirstSearch')); - $options->setHtmlSpecialCharsDecode(false); $this->assertFalse($options->get('htmlSpecialCharsDecode')); } From 924a594e7df145511466939171ca6c1966cd0cc6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 12 May 2020 02:55:03 +0000 Subject: [PATCH 169/200] Fix #187 --- CHANGELOG.md | 1 + src/PHPHtmlParser/Content.php | 2 +- src/PHPHtmlParser/Dom.php | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cd22cd0..5d9b1bab 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added tag attribute DTO. - Cleaned up the selector logic. - Fixed issue with greedy regex for charset detection. +- Fixed bug causing infinite loops in some cases. ### Removed - Curl interface and curl implementation has been removed. 
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 0ae7e0e4..24bca182 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -164,7 +164,7 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal * * @return string */ - public function copyUntilUnless(string $string, string $unless) + public function copyUntilUnless(string $string, string $unless): string { $lastPos = $this->pos; $this->fastForward(1); diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index fa659f67..f772e707 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -749,7 +749,7 @@ private function parseTag(): array do { $moreString = $this->content->copyUntilUnless('"', '=>'); $string .= $moreString; - } while (!empty($moreString)); + } while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size); $attr['value'] = $string; $this->content->fastForward(1); $node->getTag()->setAttribute($name, $string); @@ -760,7 +760,7 @@ private function parseTag(): array do { $moreString = $this->content->copyUntilUnless("'", '=>'); $string .= $moreString; - } while (!empty($moreString)); + } while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size); $attr['value'] = $string; $this->content->fastForward(1); $node->getTag()->setAttribute($name, $string, false); From 0127b9e354e92f9c515653b8b50423e38010762c Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 12 May 2020 03:15:13 +0000 Subject: [PATCH 170/200] fixes #188 --- tests/DomTest.php | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/DomTest.php b/tests/DomTest.php index 9922f17f..47aeb6ae 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -617,4 +617,16 @@ public function testFindAttributeInBothParentAndChild() $nodes = $dom->find('[attribute]'); $this->assertCount(2, $nodes); } + + public function testLessThanCharacterInJavascript() + { + $results = (new Dom())->load('
', + [ + 'cleanupInput' => false, + 'removeScripts' => false + ])->find('body'); + $this->assertCount(1, $results); + } } From 4bb7098f3a46582dd9c5cd289fae6f0e835f2916 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 13 May 2020 14:36:50 +0000 Subject: [PATCH 171/200] Fixes #203 --- CHANGELOG.md | 1 + src/PHPHtmlParser/Dom.php | 17 ---------- src/PHPHtmlParser/Dom/AbstractNode.php | 10 ------ tests/DomTest.php | 44 ++++++++++---------------- 4 files changed, 18 insertions(+), 54 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d9b1bab..bd2dffcb 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed - Curl interface and curl implementation has been removed. - Removed support for the depth first search option. +- findById() method removed from Dom object. ## 2.2.0 diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f772e707..ba9ea4ae 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -150,7 +150,6 @@ public function __get($name) */ public function load(string $str, array $options = []): Dom { - AbstractNode::resetCount(); // check if it's a file if (\strpos($str, "\n") === false && \is_file($str)) { return $this->loadFromFile($str, $options); @@ -264,22 +263,6 @@ public function find(string $selector, int $nth = null) return $result; } - /** - * Find element by Id on the root node. - * - * @throws ChildNotFoundException - * @throws NotLoadedException - * @throws ParentNotFoundException - * - * @return bool|AbstractNode - */ - public function findById(int $id) - { - $this->isLoaded(); - - return $this->root->findById($id); - } - /** * Adds the tag (or tags in an array) to the list of tags that will always * be self closing. 
diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 596a3ae8..3d67ab5c 100755 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -140,16 +140,6 @@ public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void $this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode; } - /** - * Reset node counter. - * - * @return void - */ - public static function resetCount() - { - self::$count = 0; - } - /** * Returns the id of this object. */ diff --git a/tests/DomTest.php b/tests/DomTest.php index 47aeb6ae..8e800487 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -394,33 +394,6 @@ public function testHasChildren() $this->assertTrue($dom->hasChildren()); } - public function testFindByIdVar1() - { - $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); - /** @var Dom\AbstractNode $result */ - $result = $dom->findById(4); - $this->assertEquals(4, $result->id()); - } - - public function testFindByIdVar2() - { - $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); - /** @var Dom\AbstractNode $result */ - $result = $dom->findById(5); - $this->assertEquals(5, $result->id()); - } - - public function testFindByIdNotFountEleement() - { - $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); - /** @var Dom\AbstractNode $result */ - $result = $dom->findById(8); - $this->assertFalse($result); - } - public function testWhitespaceInText() { $dom = new Dom(); @@ -629,4 +602,21 @@ public function testLessThanCharacterInJavascript() ])->find('body'); $this->assertCount(1, $results); } + + public function testUniqueIdForAllObjects() + { + // Create a dom which will be used as a parent/container for a paragraph + $dom1 = new \PHPHtmlParser\Dom; + $dom1->load('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) + $div = $dom1->firstChild(); + + // Create a paragraph outside of the first dom + $dom2 = new \PHPHtmlParser\Dom; + $dom2->load('

Our new paragraph.

'); // Resets the counter + $paragraph = $dom2->firstChild(); + + $div->addChild($paragraph); + + $this->assertEquals('A container div

Our new paragraph.

', $div->innerhtml); + } } From 4e3158c561878076a82b32143e73537a1c391fa6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 14 Jul 2020 20:24:14 +0000 Subject: [PATCH 172/200] Added test to cover #189 --- tests/DomTest.php | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/DomTest.php b/tests/DomTest.php index 8e800487..9c3781c4 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -619,4 +619,23 @@ public function testUniqueIdForAllObjects() $this->assertEquals('A container div

Our new paragraph.

', $div->innerhtml); } + + public function testFindDescendantsOfMatch() + { + $dom = new Dom(); + $dom->load('

+ + test + testing + This is a test + italic + password123 + + another +

'); + + /** @var Dom\AbstractNode $meta */ + $nodes = $dom->find('b'); + $this->assertCount(5, $nodes); + } } From 8fccd89f73faa2fb7dd523605d927226b78d20a3 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 14 Jul 2020 20:28:33 +0000 Subject: [PATCH 173/200] Added coverage for #174 --- tests/DomTest.php | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/DomTest.php b/tests/DomTest.php index 9c3781c4..d68b0fc2 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -634,8 +634,19 @@ public function testFindDescendantsOfMatch() another

'); - /** @var Dom\AbstractNode $meta */ $nodes = $dom->find('b'); $this->assertCount(5, $nodes); } + + public function testCompatibleWithWordPressShortcode() + { + $dom = new Dom(); + $dom->load('

+[wprs_alert type="success" content="this is a short code" /] +

'); + + $node = $dom->find('p', 0); + $this->assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml); + + } } From 1a1c3eb2d20069ca811ffcdce7be8a7d4f0effe6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 14 Jul 2020 20:41:19 +0000 Subject: [PATCH 174/200] Fixed #155, removed load method call --- CHANGELOG.md | 3 +- README.md | 8 +- src/PHPHtmlParser/Dom.php | 24 ----- src/PHPHtmlParser/StaticDom.php | 25 ++--- tests/DomTest.php | 116 +++++++++++------------ tests/Node/HtmlTest.php | 4 +- tests/Node/TextTest.php | 2 +- tests/Options/CleanupTest.php | 4 +- tests/Options/PreserveLineBreaks.php | 4 +- tests/Options/StrictTest.php | 8 +- tests/Options/WhitespaceTextNodeTest.php | 4 +- tests/OptionsTest.php | 2 +- tests/StaticDomTest.php | 10 +- 13 files changed, 91 insertions(+), 123 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd2dffcb..9862beac 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed - Curl interface and curl implementation has been removed. - Removed support for the depth first search option. -- findById() method removed from Dom object. +- `findById()` method removed from Dom object. +- Removed `load()` method in Dom object. ## 2.2.0 diff --git a/README.md b/README.md index 54c28d04..cbd64800 100755 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ require "vendor/autoload.php"; use PHPHtmlParser\Dom; $dom = new Dom; -$dom->load('

Hey bro, click here
:)

'); +$dom->loadStr('

Hey bro, click here
:)

'); $a = $dom->find('a')[0]; echo $a->text; // "click here" ``` @@ -86,7 +86,7 @@ $dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // or -$dom->load('http://google.com'); +$dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // same result as the first example ``` @@ -137,11 +137,11 @@ $dom->setOptions([ 'strict' => true, // Set a global option to enable strict html parsing. ]); -$dom->load('http://google.com', [ +$dom->loadFromUrl('http://google.com', [ 'whitespaceTextNode' => false, // Only applies to this load. ]); -$dom->load('http://gmail.com'); // will not have whitespaceTextNode set to false. +$dom->loadFromUrl('http://gmail.com'); // will not have whitespaceTextNode set to false. ``` At the moment we support 8 options. diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index ba9ea4ae..1cba5050 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -15,7 +15,6 @@ use PHPHtmlParser\Exceptions\CurlException; use PHPHtmlParser\Exceptions\LogicalException; use PHPHtmlParser\Exceptions\NotLoadedException; -use PHPHtmlParser\Exceptions\ParentNotFoundException; use PHPHtmlParser\Exceptions\StrictException; use PHPHtmlParser\Exceptions\UnknownChildTypeException; use Psr\Http\Client\ClientInterface; @@ -139,29 +138,6 @@ public function __get($name) return $this->root->$name; } - /** - * Attempts to load the dom from any resource, string, file, or URL. 
- * - * @throws ChildNotFoundException - * @throws CircularException - * @throws CurlException - * @throws StrictException - * @throws LogicalException - */ - public function load(string $str, array $options = []): Dom - { - // check if it's a file - if (\strpos($str, "\n") === false && \is_file($str)) { - return $this->loadFromFile($str, $options); - } - // check if it's a url - if (\preg_match("/^https?:\/\//i", $str)) { - return $this->loadFromUrl($str, $options); - } - - return $this->loadStr($str, $options); - } - /** * Loads the dom from a document file/url. * diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index b4c3ef22..411ca3de 100755 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -56,23 +56,6 @@ public static function mount(string $className = 'Dom', ?Dom $dom = null): bool return true; } - /** - * Creates a new dom object and calls load() on the - * new object. - * - * @throws ChildNotFoundException - * @throws CircularException - * @throws CurlException - * @throws StrictException - */ - public static function load(string $str): Dom - { - $dom = new Dom(); - self::$dom = $dom; - - return $dom->load($str); - } - /** * Creates a new dom object and calls loadFromFile() on the * new object. @@ -114,6 +97,14 @@ public static function loadFromUrl(string $url, array $options = [], ClientInter return $dom->loadFromUrl($url, $options, $client, $request); } + public static function loadStr(string $str, array $options = []): Dom + { + $dom = new Dom(); + self::$dom = $dom; + + return $dom->loadStr($str, $options); + } + /** * Sets the $dom variable to null. 
*/ diff --git a/tests/DomTest.php b/tests/DomTest.php index d68b0fc2..2a904cc5 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -20,14 +20,14 @@ public function testParsingCData() $html = ""; $dom = new Dom(); $dom->setOptions(['cleanupInput' => false]); - $dom->load($html); + $dom->loadStr($html); $this->assertSame($html, $dom->root->outerHtml()); } - public function testLoad() + public function testloadStr() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $div = $dom->find('div', 0); $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); } @@ -44,7 +44,7 @@ public function testNotLoaded() public function testIncorrectAccess() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $div = $dom->find('div', 0); $this->assertEquals(null, $div->foo); } @@ -52,7 +52,7 @@ public function testIncorrectAccess() public function testLoadSelfclosingAttr() { $dom = new Dom(); - $dom->load("

baz
"); + $dom->loadStr("

baz
"); $br = $dom->find('br', 0); $this->assertEquals('
', $br->outerHtml); } @@ -60,7 +60,7 @@ public function testLoadSelfclosingAttr() public function testLoadSelfclosingAttrToString() { $dom = new Dom(); - $dom->load("

baz
"); + $dom->loadStr("

baz
"); $br = $dom->find('br', 0); $this->assertEquals('
', (string) $br); } @@ -68,7 +68,7 @@ public function testLoadSelfclosingAttrToString() public function testLoadEscapeQuotes() { $dom = new Dom(); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $div = $dom->find('div', 0); $this->assertEquals('

Hey bro, click here

', $div->outerHtml); } @@ -76,14 +76,14 @@ public function testLoadEscapeQuotes() public function testLoadNoOpeningTag() { $dom = new Dom(); - $dom->load('
PR Manager
content
'); + $dom->loadStr('
PR Manager
content
'); $this->assertEquals('content', $dom->find('.content', 0)->text); } public function testLoadNoClosingTag() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $root = $dom->find('div', 0)->getParent(); $this->assertEquals('

Hey bro, click here


', $root->outerHtml); } @@ -91,7 +91,7 @@ public function testLoadNoClosingTag() public function testLoadAttributeOnSelfClosing() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $br = $dom->find('br', 0); $this->assertEquals('both', $br->getAttribute('class')); } @@ -99,7 +99,7 @@ public function testLoadAttributeOnSelfClosing() public function testLoadClosingTagOnSelfClosing() { $dom = new Dom(); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -108,7 +108,7 @@ public function testLoadClosingTagOnSelfClosingNoSlash() $dom = new Dom(); $dom->addNoSlashTag('br'); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -116,7 +116,7 @@ public function testLoadClosingTagAddSelfClosingTag() { $dom = new Dom(); $dom->addSelfClosingTag('mytag'); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -127,7 +127,7 @@ public function testLoadClosingTagAddSelfClosingTagArray() 'mytag', 'othertag', ]); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -135,7 +135,7 @@ public function testLoadClosingTagRemoveSelfClosingTag() { $dom = new Dom(); $dom->removeSelfClosingTag('br'); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); } @@ -143,35 +143,35 @@ public function testLoadClosingTagClearSelfClosingTag() { $dom = new Dom(); $dom->clearSelfClosingTags(); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); } public function testLoadNoValueAttribute() { $dom = new Dom(); - $dom->load('
Main content here
'); + $dom->loadStr('
Main content here
'); $this->assertEquals('
Main content here
', $dom->innerHtml); } public function testLoadBackslashAttributeValue() { $dom = new Dom(); - $dom->load('
Main content here
'); + $dom->loadStr('
Main content here
'); $this->assertEquals('
Main content here
', $dom->innerHtml); } public function testLoadNoValueAttributeBefore() { $dom = new Dom(); - $dom->load('
Main content here
'); + $dom->loadStr('
Main content here
'); $this->assertEquals('
Main content here
', $dom->innerHtml); } public function testLoadUpperCase() { $dom = new Dom(); - $dom->load('

hEY BRO, CLICK HERE

'); + $dom->loadStr('

hEY BRO, CLICK HERE

'); $this->assertEquals('

hEY BRO, CLICK HERE

', $dom->find('div', 0)->innerHtml); } @@ -206,7 +206,7 @@ public function testLoadFromFileNotFound() public function testLoadUtf8() { $dom = new Dom(); - $dom->load('

Dzień

'); + $dom->loadStr('

Dzień

'); $this->assertEquals('Dzień', $dom->find('p', 0)->text); } @@ -268,56 +268,56 @@ public function testLoadFromUrl() public function testToStringMagic() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $this->assertEquals('

Hey bro, click here
:)

', (string) $dom); } public function testGetMagic() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $this->assertEquals('

Hey bro, click here
:)

', $dom->innerHtml); } public function testFirstChild() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('

Hey bro, click here

', $dom->firstChild()->outerHtml); } public function testLastChild() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('
', $dom->lastChild()->outerHtml); } public function testGetElementById() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('click here', $dom->getElementById('78')->outerHtml); } public function testGetElementsByTag() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('

Hey bro, click here

', $dom->getElementsByTag('p')[0]->outerHtml); } public function testGetElementsByClass() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('

Hey bro, click here

', $dom->getElementsByClass('all')[0]->innerHtml); } public function testScriptCleanerScriptTag() { $dom = new Dom(); - $dom->load(' + $dom->loadStr('

.....

', [ @@ -607,12 +607,12 @@ public function testUniqueIdForAllObjects() { // Create a dom which will be used as a parent/container for a paragraph $dom1 = new \PHPHtmlParser\Dom; - $dom1->load('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) + $dom1->loadStr('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) $div = $dom1->firstChild(); // Create a paragraph outside of the first dom $dom2 = new \PHPHtmlParser\Dom; - $dom2->load('

Our new paragraph.

'); // Resets the counter + $dom2->loadStr('

Our new paragraph.

'); // Resets the counter $paragraph = $dom2->firstChild(); $div->addChild($paragraph); @@ -623,7 +623,7 @@ public function testUniqueIdForAllObjects() public function testFindDescendantsOfMatch() { $dom = new Dom(); - $dom->load('

+ $dom->loadStr('

test testing @@ -641,7 +641,7 @@ public function testFindDescendantsOfMatch() public function testCompatibleWithWordPressShortcode() { $dom = new Dom(); - $dom->load('

+ $dom->loadStr('

[wprs_alert type="success" content="this is a short code" /]

'); diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index 677b280e..a4db8142 100755 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -500,7 +500,7 @@ public function testAncestorByTagFailure() public function testReplaceNode() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $id = $dom->find('p')[0]->id(); $newChild = new HtmlNode('h1'); $dom->find('p')[0]->getParent()->replaceChild($id, $newChild); @@ -510,7 +510,7 @@ public function testReplaceNode() public function testTextNodeFirstChild() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $p = $dom->find('p'); foreach ($p as $element) { $child = $element->firstChild(); diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 27dd03e9..d36eddc0 100755 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -57,7 +57,7 @@ public function testSetTextToTextNode() public function testSetText() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $a = $dom->find('a')[0]; $a->firstChild()->setText('biz baz'); $this->assertEquals('

Hey bro, biz baz
:)

', (string) $dom); diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index 0a8a9baf..b7e5325e 100755 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -76,7 +76,7 @@ public function testRemoveScriptsFalse() public function testSmartyScripts() { $dom = new Dom(); - $dom->load(' + $dom->loadStr(' aa={123} '); $this->assertEquals(' aa= ', $dom->innerHtml); @@ -88,7 +88,7 @@ public function testSmartyScriptsDisabled() $dom->setOptions([ 'removeSmartyScripts' => false, ]); - $dom->load(' + $dom->loadStr(' aa={123} '); $this->assertEquals(' aa={123} ', $dom->innerHtml); diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php index 3df7223e..ad095a38 100755 --- a/tests/Options/PreserveLineBreaks.php +++ b/tests/Options/PreserveLineBreaks.php @@ -13,7 +13,7 @@ public function testPreserveLineBreakTrue() $dom->setOptions([ 'preserveLineBreaks' => true, ]); - $dom->load('
+ $dom->loadStr('
'); $this->assertEquals("
\n
", (string) $dom); @@ -25,7 +25,7 @@ public function testPreserveLineBreakBeforeClosingTag() $dom->setOptions([ 'preserveLineBreaks' => true, ]); - $dom->load('
loadStr('
'); $this->assertEquals('
', (string) $dom); diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php index cb015981..96d457b7 100755 --- a/tests/Options/StrictTest.php +++ b/tests/Options/StrictTest.php @@ -14,7 +14,7 @@ public function testConfigStrict() $dom->setOptions([ 'strict' => true, ]); - $dom->load('

Hey you

Ya you!

'); + $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); } @@ -26,7 +26,7 @@ public function testConfigStrictMissingSelfClosing() ]); try { // should throw an exception - $dom->load('

Hey you


Ya you!

'); + $dom->loadStr('

Hey you


Ya you!

'); // we should not get here $this->assertTrue(false); } catch (StrictException $e) { @@ -42,7 +42,7 @@ public function testConfigStrictMissingAttribute() ]); try { // should throw an exception - $dom->load('

Hey you

Ya you!

'); + $dom->loadStr('

Hey you

Ya you!

'); // we should not get here $this->assertTrue(false); } catch (StrictException $e) { @@ -56,7 +56,7 @@ public function testConfigStrictBRTag() $dom->setOptions([ 'strict' => true, ]); - $dom->load('
'); + $dom->loadStr('
'); $this->assertTrue(true); } } diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php index 541fbec0..0097f28d 100755 --- a/tests/Options/WhitespaceTextNodeTest.php +++ b/tests/Options/WhitespaceTextNodeTest.php @@ -13,7 +13,7 @@ public function testConfigGlobalNoWhitespaceTextNode() $dom->setOptions([ 'whitespaceTextNode' => false, ]); - $dom->load('

Hey you

Ya you!

'); + $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text); } @@ -23,7 +23,7 @@ public function testConfigLocalOverride() $dom->setOptions([ 'whitespaceTextNode' => false, ]); - $dom->load('

Hey you

Ya you!

', [ + $dom->loadStr('

Hey you

Ya you!

', [ 'whitespaceTextNode' => true, ]); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index 899a0622..a78f508f 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -144,6 +144,6 @@ public function testUnknownOptionDom() ]); $this->expectException(UnknownOptionException::class); - $dom->load('
'); + $dom->loadStr('
'); } } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index 2fb225fb..fbc1a5bd 100755 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -25,16 +25,16 @@ public function testMountWithDom() $this->assertTrue($status); } - public function testLoad() + public function testloadStr() { - $dom = Dom::load('

Hey bro, click here
:)

'); + $dom = Dom::loadStr('

Hey bro, click here
:)

'); $div = $dom->find('div', 0); $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); } public function testLoadWithFile() { - $dom = Dom::load('tests/data/files/small.html'); + $dom = Dom::loadFromFile('tests/data/files/small.html'); $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); } @@ -47,14 +47,14 @@ public function testLoadFromFile() /** * @expectedException \PHPHtmlParser\Exceptions\NotLoadedException */ - public function testFindNoLoad() + public function testFindNoloadStr() { Dom::find('.post-user font', 0); } public function testFindI() { - Dom::load('tests/data/files/big.html'); + Dom::loadFromFile('tests/data/files/big.html'); $this->assertEquals('В кустах блестит металл
И искрится ток
Человечеству конец', Dom::find('i')[1]->innerHtml); } From e37e8ef9eda6bb44f50519b51fd80f0207f29585 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 00:03:46 +0000 Subject: [PATCH 175/200] Fixed #187 and added tests --- src/PHPHtmlParser/Content.php | 25 +++++++++++++++---- src/PHPHtmlParser/Dom.php | 20 ++++++++++++--- .../Exceptions/ContentLengthException.php | 14 +++++++++++ tests/DomTest.php | 8 ++++++ 4 files changed, 59 insertions(+), 8 deletions(-) create mode 100644 src/PHPHtmlParser/Exceptions/ContentLengthException.php diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 24bca182..66bc7794 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -4,6 +4,7 @@ namespace PHPHtmlParser; +use PHPHtmlParser\Exceptions\ContentLengthException; use PHPHtmlParser\Exceptions\LogicalException; /** @@ -74,14 +75,27 @@ public function char(?int $char = null): string * Moves the current position forward. * * @chainable + * @throws ContentLengthException */ public function fastForward(int $count): Content { + if (!$this->canFastForward()) { + // trying to go over the content length, throw exception + throw new ContentLengthException('Attempt to fastForward pass the length of the content.'); + } $this->pos += $count; return $this; } + /** + * Checks if we can move the position forward. + */ + public function canFastForward(): bool + { + return \strlen($this->content) > $this->pos; + } + /** * Moves the current position backward. * @@ -197,14 +211,15 @@ public function copyByToken(string $token, bool $char = false, bool $escape = fa /** * Skip a given set of characters. 
* - * @return Content|string + * @throws LogicalException */ - public function skip(string $string, bool $copy = false) + public function skip(string $string, bool $copy = false): string { $len = \strspn($this->content, $string, $this->pos); - - // make it chainable if they don't want a copy - $return = $this; + if ($len === false) { + throw new LogicalException('Strspn returned false with position ' . $this->pos . '.'); + } + $return = ''; if ($copy) { $return = \substr($this->content, $this->pos, $len); if ($return === false) { diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 1cba5050..d23110df 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -12,6 +12,7 @@ use PHPHtmlParser\Dom\TextNode; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; +use PHPHtmlParser\Exceptions\ContentLengthException; use PHPHtmlParser\Exceptions\CurlException; use PHPHtmlParser\Exceptions\LogicalException; use PHPHtmlParser\Exceptions\NotLoadedException; @@ -646,7 +647,13 @@ private function parseTag(): array } // check if this is a closing tag - if ($this->content->fastForward(1)->char() == '/') { + try { + $this->content->fastForward(1); + } catch (ContentLengthException $exception) { + // we are at the end of the file + return $return; + } + if ($this->content->char() == '/') { // end tag $tag = $this->content->fastForward(1) ->copyByToken('slash', true); @@ -683,7 +690,12 @@ private function parseTag(): array ) { $space = $this->content->skipByToken('blank', true); if (empty($space)) { - $this->content->fastForward(1); + try { + $this->content->fastForward(1); + } catch (ContentLengthException $exception) { + // reached the end of the content + break; + } continue; } @@ -764,7 +776,9 @@ private function parseTag(): array } } - $this->content->fastForward(1); + if ($this->content->canFastForward()) { + $this->content->fastForward(1); + } $return['status'] = true; $return['node'] = $node; 
diff --git a/src/PHPHtmlParser/Exceptions/ContentLengthException.php b/src/PHPHtmlParser/Exceptions/ContentLengthException.php new file mode 100644 index 00000000..83c9e771 --- /dev/null +++ b/src/PHPHtmlParser/Exceptions/ContentLengthException.php @@ -0,0 +1,14 @@ +assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml); } + + public function testBrokenHtml() + { + $dom = new Dom(); + $dom->loadStr('assertEquals('', $dom->outerHtml); + } } From b58c6da6c58e9da334de20b46f602e9cb70d5095 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 01:18:55 +0000 Subject: [PATCH 176/200] Removed magical option array --- CHANGELOG.md | 1 + composer.json | 3 +- src/PHPHtmlParser/Content.php | 22 +- src/PHPHtmlParser/Dom.php | 86 ++++--- src/PHPHtmlParser/Enum/StringToken.php | 21 ++ src/PHPHtmlParser/Options.php | 291 +++++++++++------------ src/PHPHtmlParser/Selector/Seeker.php | 3 +- src/PHPHtmlParser/StaticDom.php | 9 +- tests/ContentTest.php | 5 +- tests/DomTest.php | 30 ++- tests/Options/CleanupTest.php | 29 +-- tests/Options/PreserveLineBreaks.php | 10 +- tests/Options/StrictTest.php | 17 +- tests/Options/WhitespaceTextNodeTest.php | 13 +- tests/OptionsTest.php | 117 ++------- tests/StaticDomTest.php | 2 +- 16 files changed, 273 insertions(+), 386 deletions(-) create mode 100644 src/PHPHtmlParser/Enum/StringToken.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 9862beac..05d2146f 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Cleaned up the selector logic. - Fixed issue with greedy regex for charset detection. - Fixed bug causing infinite loops in some cases. +- Refactored the way we handle options. Removed the magical option array. ### Removed - Curl interface and curl implementation has been removed. 
diff --git a/composer.json b/composer.json index 79258c58..5549a5ee 100755 --- a/composer.json +++ b/composer.json @@ -20,7 +20,8 @@ "paquettg/string-encode": "~1.0.0", "php-http/httplug": "^2.1", "php-http/guzzle6-adapter": "^2.0", - "guzzlehttp/psr7": "^1.6" + "guzzlehttp/psr7": "^1.6", + "myclabs/php-enum": "^1.7" }, "require-dev": { "phpunit/phpunit": "^7.5.1", diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 66bc7794..fdb741c4 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -4,6 +4,7 @@ namespace PHPHtmlParser; +use PHPHtmlParser\Enum\StringToken; use PHPHtmlParser\Exceptions\ContentLengthException; use PHPHtmlParser\Exceptions\LogicalException; @@ -75,11 +76,12 @@ public function char(?int $char = null): string * Moves the current position forward. * * @chainable + * * @throws ContentLengthException */ public function fastForward(int $count): Content { - if (!$this->canFastForward()) { + if (!$this->canFastForward($count)) { // trying to go over the content length, throw exception throw new ContentLengthException('Attempt to fastForward pass the length of the content.'); } @@ -91,9 +93,9 @@ public function fastForward(int $count): Content /** * Checks if we can move the position forward. */ - public function canFastForward(): bool + public function canFastForward(int $count): bool { - return \strlen($this->content) > $this->pos; + return \strlen($this->content) >= $this->pos + $count; } /** @@ -175,8 +177,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal /** * Copies the content until the string is found and return it * unless the 'unless' is found in the substring. - * - * @return string */ public function copyUntilUnless(string $string, string $unless): string { @@ -197,13 +197,11 @@ public function copyUntilUnless(string $string, string $unless): string /** * Copies the content until it reaches the token string.,. 
* - * @return string - * * @uses $this->copyUntil() */ - public function copyByToken(string $token, bool $char = false, bool $escape = false) + public function copyByToken(StringToken $stringToken, bool $char = false, bool $escape = false): string { - $string = $this->$token; + $string = $stringToken->getValue(); return $this->copyUntil($string, $char, $escape); } @@ -236,13 +234,11 @@ public function skip(string $string, bool $copy = false): string /** * Skip a given token of pre-defined characters. * - * @return Content|string - * * @uses $this->skip() */ - public function skipByToken(string $token, bool $copy = false) + public function skipByToken(StringToken $skipToken, bool $copy = false): string { - $string = $this->$token; + $string = $skipToken->getValue(); return $this->skip($string, $copy); } diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index d23110df..d2db15e2 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -10,10 +10,10 @@ use PHPHtmlParser\Dom\Collection; use PHPHtmlParser\Dom\HtmlNode; use PHPHtmlParser\Dom\TextNode; +use PHPHtmlParser\Enum\StringToken; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\ContentLengthException; -use PHPHtmlParser\Exceptions\CurlException; use PHPHtmlParser\Exceptions\LogicalException; use PHPHtmlParser\Exceptions\NotLoadedException; use PHPHtmlParser\Exceptions\StrictException; @@ -72,9 +72,9 @@ class Dom /** * A global options array to be used by all load calls. 
* - * @var array + * @var ?Options */ - private $globalOptions = []; + private $globalOptions; /** * A persistent option object to be used for all options in the @@ -147,7 +147,7 @@ public function __get($name) * @throws StrictException * @throws LogicalException */ - public function loadFromFile(string $file, array $options = []): Dom + public function loadFromFile(string $file, ?Options $options = null): Dom { $content = @\file_get_contents($file); if ($content === false) { @@ -168,7 +168,7 @@ public function loadFromFile(string $file, array $options = []): Dom * @throws StrictException * @throws \Psr\Http\Client\ClientExceptionInterface */ - public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom + public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom { if ($client === null) { $client = new Client(); @@ -191,11 +191,15 @@ public function loadFromUrl(string $url, array $options = [], ?ClientInterface $ * @throws CircularException * @throws StrictException */ - public function loadStr(string $str, array $option = []): Dom + public function loadStr(string $str, ?Options $options = null): Dom { $this->options = new Options(); - $this->options->setOptions($this->globalOptions) - ->setOptions($option); + if ($this->globalOptions !== null) { + $this->options->setFromOptions($this->globalOptions); + } + if ($options !== null) { + $this->options->setFromOptions($options); + } $this->rawSize = \strlen($str); $this->raw = $str; @@ -216,7 +220,7 @@ public function loadStr(string $str, array $option = []): Dom * * @chainable */ - public function setOptions(array $options): Dom + public function setOptions(Options $options): Dom { $this->globalOptions = $options; @@ -235,9 +239,7 @@ public function find(string $selector, int $nth = null) { $this->isLoaded(); - $result = $this->root->find($selector, $nth); - - return 
$result; + return $this->root->find($selector, $nth); } /** @@ -463,7 +465,7 @@ private function isLoaded(): void */ private function clean(string $str): string { - if ($this->options->get('cleanupInput') != true) { + if ($this->options->isCleanupInput() != true) { // skip entire cleanup step return $str; } @@ -488,7 +490,7 @@ private function clean(string $str): string // clean out the \n\r $replace = ' '; - if ($this->options->get('preserveLineBreaks')) { + if ($this->options->isPreserveLineBreaks()) { $replace = ' '; } $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str); @@ -515,7 +517,7 @@ private function clean(string $str): string } // strip out "; $dom = new Dom(); - $dom->setOptions(['cleanupInput' => false]); + $dom->setOptions((new Options())->setCleanupInput(false)); $dom->loadStr($html); $this->assertSame($html, $dom->root->outerHtml()); } @@ -213,7 +214,7 @@ public function testLoadUtf8() public function testLoadFileWhitespace() { $dom = new Dom(); - $dom->setOptions(['cleanupInput' => false]); + $dom->setOptions((new Options())->setCleanupInput(false)); $dom->loadFromFile('tests/data/files/whitespace.html'); $this->assertEquals(1, \count($dom->find('.class'))); $this->assertEquals('', (string) $dom); @@ -237,7 +238,8 @@ public function testLoadFileBigTwice() public function testLoadFileBigTwicePreserveOption() { $dom = new Dom(); - $dom->loadFromFile('tests/data/files/big.html', ['preserveLineBreaks' => true]); + $dom->loadFromFile('tests/data/files/big.html', + (new Options)->setPreserveLineBreaks(true)); $post = $dom->find('.post-row', 0); $this->assertEquals( "

Журчанье воды
\nЧерно-белые тени
\nВновь на фонтане

", @@ -261,7 +263,7 @@ public function testLoadFromUrl() ->andReturn($responseMock); $dom = new Dom(); - $dom->loadFromUrl('http://google.com', [], $clientMock); + $dom->loadFromUrl('http://google.com', null, $clientMock); $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } @@ -397,9 +399,7 @@ public function testHasChildren() public function testWhitespaceInText() { $dom = new Dom(); - $dom->setOptions([ - 'removeDoubleSpace' => false, - ]); + $dom->setOptions((new Options())->setRemoveDoubleSpace(false)); $dom->loadStr('
    Hello world
'); $this->assertEquals('
    Hello world
', (string) $dom); } @@ -415,7 +415,7 @@ public function testGetComplexAttribute() public function testGetComplexAttributeHtmlSpecialCharsDecode() { $dom = new Dom(); - $dom->setOptions(['htmlSpecialCharsDecode' => true]); + $dom->setOptions((new Options())->setHtmlSpecialCharsDecode(true)); $dom->loadStr('Next >'); $a = $dom->find('a', 0); $this->assertEquals('Next >', $a->innerHtml); @@ -563,7 +563,7 @@ public function testLoadGetAttributeWithBackslash() public function test25ChildrenFound() { $dom = new Dom(); - $dom->setOptions(['whitespaceTextNode' => false]); + $dom->setOptions((new Options())->setWhitespaceTextNode(false)); $dom->loadFromFile('tests/data/files/51children.html'); $children = $dom->find('#red-line-g *'); $this->assertEquals(25, \count($children)); @@ -596,22 +596,21 @@ public function testLessThanCharacterInJavascript() $results = (new Dom())->loadStr('
', - [ - 'cleanupInput' => false, - 'removeScripts' => false - ])->find('body'); + (new Options())->setCleanupInput(false) + ->setRemoveScripts(false) + )->find('body'); $this->assertCount(1, $results); } public function testUniqueIdForAllObjects() { // Create a dom which will be used as a parent/container for a paragraph - $dom1 = new \PHPHtmlParser\Dom; + $dom1 = new \PHPHtmlParser\Dom(); $dom1->loadStr('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) $div = $dom1->firstChild(); // Create a paragraph outside of the first dom - $dom2 = new \PHPHtmlParser\Dom; + $dom2 = new \PHPHtmlParser\Dom(); $dom2->loadStr('

Our new paragraph.

'); // Resets the counter $paragraph = $dom2->firstChild(); @@ -647,7 +646,6 @@ public function testCompatibleWithWordPressShortcode() $node = $dom->find('p', 0); $this->assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml); - } public function testBrokenHtml() diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index b7e5325e..914078ac 100755 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class CleanupTest extends TestCase @@ -10,9 +11,7 @@ class CleanupTest extends TestCase public function testCleanupInputTrue() { $dom = new Dom(); - $dom->setOptions([ - 'cleanupInput' => true, - ]); + $dom->setOptions((new Options())->setCleanupInput(true)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(0, \count($dom->find('style'))); $this->assertEquals(0, \count($dom->find('script'))); @@ -21,9 +20,7 @@ public function testCleanupInputTrue() public function testCleanupInputFalse() { $dom = new Dom(); - $dom->setOptions([ - 'cleanupInput' => false, - ]); + $dom->setOptions((new Options())->setCleanupInput(false)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(1, \count($dom->find('style'))); $this->assertEquals(22, \count($dom->find('script'))); @@ -32,9 +29,7 @@ public function testCleanupInputFalse() public function testRemoveStylesTrue() { $dom = new Dom(); - $dom->setOptions([ - 'removeStyles' => true, - ]); + $dom->setOptions((new Options())->setRemoveStyles(true)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(0, \count($dom->find('style'))); } @@ -42,9 +37,7 @@ public function testRemoveStylesTrue() public function testRemoveStylesFalse() { $dom = new Dom(); - $dom->setOptions([ - 'removeStyles' => false, - ]); + $dom->setOptions((new Options())->setRemoveStyles(false)); 
$dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(1, \count($dom->find('style'))); $this->assertEquals('text/css', @@ -54,9 +47,7 @@ public function testRemoveStylesFalse() public function testRemoveScriptsTrue() { $dom = new Dom(); - $dom->setOptions([ - 'removeScripts' => true, - ]); + $dom->setOptions((new Options())->setRemoveScripts(true)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(0, \count($dom->find('script'))); } @@ -64,9 +55,7 @@ public function testRemoveScriptsTrue() public function testRemoveScriptsFalse() { $dom = new Dom(); - $dom->setOptions([ - 'removeScripts' => false, - ]); + $dom->setOptions((new Options())->setRemoveScripts(false)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(22, \count($dom->find('script'))); $this->assertEquals('text/javascript', @@ -85,9 +74,7 @@ public function testSmartyScripts() public function testSmartyScriptsDisabled() { $dom = new Dom(); - $dom->setOptions([ - 'removeSmartyScripts' => false, - ]); + $dom->setOptions((new Options())->setRemoveSmartyScripts(false)); $dom->loadStr(' aa={123} '); diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php index ad095a38..be396490 100755 --- a/tests/Options/PreserveLineBreaks.php +++ b/tests/Options/PreserveLineBreaks.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class PreserveLineBreaks extends TestCase @@ -10,9 +11,8 @@ class PreserveLineBreaks extends TestCase public function testPreserveLineBreakTrue() { $dom = new Dom(); - $dom->setOptions([ - 'preserveLineBreaks' => true, - ]); + $dom->setOptions((new Options())->setPreserveLineBreaks(true)); + $dom->loadStr('
'); @@ -22,9 +22,7 @@ public function testPreserveLineBreakTrue() public function testPreserveLineBreakBeforeClosingTag() { $dom = new Dom(); - $dom->setOptions([ - 'preserveLineBreaks' => true, - ]); + $dom->setOptions((new Options())->setPreserveLineBreaks(true)); $dom->loadStr('
'); diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php index 96d457b7..709f292d 100755 --- a/tests/Options/StrictTest.php +++ b/tests/Options/StrictTest.php @@ -4,6 +4,7 @@ use PHPHtmlParser\Dom; use PHPHtmlParser\Exceptions\StrictException; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class StrictTest extends TestCase @@ -11,9 +12,7 @@ class StrictTest extends TestCase public function testConfigStrict() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); } @@ -21,9 +20,7 @@ public function testConfigStrict() public function testConfigStrictMissingSelfClosing() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); try { // should throw an exception $dom->loadStr('

Hey you


Ya you!

'); @@ -37,9 +34,7 @@ public function testConfigStrictMissingSelfClosing() public function testConfigStrictMissingAttribute() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); try { // should throw an exception $dom->loadStr('

Hey you

Ya you!

'); @@ -53,9 +48,7 @@ public function testConfigStrictMissingAttribute() public function testConfigStrictBRTag() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); $dom->loadStr('
'); $this->assertTrue(true); } diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php index 0097f28d..245ef7f0 100755 --- a/tests/Options/WhitespaceTextNodeTest.php +++ b/tests/Options/WhitespaceTextNodeTest.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class WhitespaceTextNodeTest extends TestCase @@ -10,9 +11,7 @@ class WhitespaceTextNodeTest extends TestCase public function testConfigGlobalNoWhitespaceTextNode() { $dom = new Dom(); - $dom->setOptions([ - 'whitespaceTextNode' => false, - ]); + $dom->setOptions((new Options())->setWhitespaceTextNode(false)); $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text); } @@ -20,12 +19,8 @@ public function testConfigGlobalNoWhitespaceTextNode() public function testConfigLocalOverride() { $dom = new Dom(); - $dom->setOptions([ - 'whitespaceTextNode' => false, - ]); - $dom->loadStr('

Hey you

Ya you!

', [ - 'whitespaceTextNode' => true, - ]); + $dom->setOptions((new Options())->setWhitespaceTextNode(false)); + $dom->loadStr('

Hey you

Ya you!

', (new Options())->setWhitespaceTextNode(true)); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); } } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index a78f508f..f7406a14 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -2,8 +2,6 @@ declare(strict_types=1); -use PHPHtmlParser\Dom; -use PHPHtmlParser\Exceptions\UnknownOptionException; use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; @@ -13,137 +11,62 @@ public function testDefaultWhitespaceTextNode() { $options = new Options(); - $this->assertTrue($options->whitespaceTextNode); + $this->assertTrue($options->isWhitespaceTextNode()); } public function testSettingOption() { $options = new Options(); - $options->setOptions([ - 'strict' => true, - ]); - - $this->assertTrue($options->strict); - } - - public function testAddingOption() - { - $this->expectException(UnknownOptionException::class); + $options->setStrict(true); - $options = new Options(); - $options->setOptions([ - 'test' => true, - ]); + $this->assertTrue($options->isStrict()); } public function testOverwritingOption() { $options = new Options(); - $options->setOptions([ - 'strict' => false, - ])->setOptions([ - 'strict' => true, - 'whitespaceTextNode' => false, - ]); - - $this->assertTrue($options->get('strict')); - $this->assertFalse($options->get('whitespaceTextNode')); - } + $options->setStrict(false); + $options2 = new Options(); + $options2->setStrict(true); + $options2->setWhitespaceTextNode(false); + $options->setFromOptions($options2); - public function testGettingNoOption() - { - $options = new Options(); - $this->assertEquals(null, $options->get('doesnotexist')); + $this->assertTrue($options->isStrict()); + $this->assertFalse($options->isWhitespaceTextNode()); } public function testSetters() { $options = new Options(); - $options->setOptions([ - 'whitespaceTextNode' => false, - 'strict' => false, - 'enforceEncoding' => null, - 'cleanupInput' => false, - 'removeScripts' => false, 
- 'removeStyles' => false, - 'preserveLineBreaks' => false, - 'removeDoubleSpace' => false, - 'removeSmartyScripts' => false, - 'htmlSpecialCharsDecode' => false, - ]); - $options->setWhitespaceTextNode(true); - $this->assertTrue($options->get('whitespaceTextNode')); + $this->assertTrue($options->isWhitespaceTextNode()); $options->setStrict(true); - $this->assertTrue($options->get('strict')); + $this->assertTrue($options->isStrict()); $options->setEnforceEncoding('utf8'); - $this->assertEquals('utf8', $options->get('enforceEncoding')); + $this->assertEquals('utf8', $options->getEnforceEncoding()); $options->setCleanupInput(true); - $this->assertTrue($options->get('cleanupInput')); + $this->assertTrue($options->isCleanupInput()); $options->setRemoveScripts(true); - $this->assertTrue($options->get('removeScripts')); + $this->assertTrue($options->isRemoveScripts()); $options->setRemoveStyles(true); - $this->assertTrue($options->get('removeStyles')); + $this->assertTrue($options->isRemoveStyles()); $options->setPreserveLineBreaks(true); - $this->assertTrue($options->get('preserveLineBreaks')); + $this->assertTrue($options->isPreserveLineBreaks()); $options->setRemoveDoubleSpace(true); - $this->assertTrue($options->get('removeDoubleSpace')); + $this->assertTrue($options->isRemoveDoubleSpace()); $options->setRemoveSmartyScripts(true); - $this->assertTrue($options->get('removeSmartyScripts')); + $this->assertTrue($options->isRemoveSmartyScripts()); $options->setHtmlSpecialCharsDecode(true); - $this->assertTrue($options->get('htmlSpecialCharsDecode')); - - // now reset to false - - $options->setWhitespaceTextNode(false); - $this->assertFalse($options->get('whitespaceTextNode')); - - $options->setStrict(false); - $this->assertFalse($options->get('strict')); - - $options->setEnforceEncoding(null); - $this->assertNull($options->get('enforceEncoding')); - - $options->setCleanupInput(false); - $this->assertFalse($options->get('cleanupInput')); - - 
$options->setRemoveScripts(false); - $this->assertFalse($options->get('removeScripts')); - - $options->setRemoveStyles(false); - $this->assertFalse($options->get('removeStyles')); - - $options->setPreserveLineBreaks(false); - $this->assertFalse($options->get('preserveLineBreaks')); - - $options->setRemoveDoubleSpace(false); - $this->assertFalse($options->get('removeDoubleSpace')); - - $options->setRemoveSmartyScripts(false); - $this->assertFalse($options->get('removeSmartyScripts')); - - $options->setHtmlSpecialCharsDecode(false); - $this->assertFalse($options->get('htmlSpecialCharsDecode')); - } - - public function testUnknownOptionDom() - { - $dom = new Dom(); - $dom->setOptions([ - 'unknown_option' => true, - ]); - - $this->expectException(UnknownOptionException::class); - $dom->loadStr('
'); + $this->assertTrue($options->isHtmlSpecialCharsDecode()); } } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index fbc1a5bd..73453633 100755 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -73,7 +73,7 @@ public function testLoadFromUrl() ->once() ->andReturn($responseMock); - Dom::loadFromUrl('http://google.com', [], $clientMock); + Dom::loadFromUrl('http://google.com', null, $clientMock); $this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text); } } From 7bba8adf4348c42e877275a6740689b90ef5707d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 02:09:17 +0000 Subject: [PATCH 177/200] Cleaned up the code --- src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 4 +- src/PHPHtmlParser/DTO/TagDTO.php | 70 +++++++++ src/PHPHtmlParser/Dom.php | 166 +++------------------ src/PHPHtmlParser/Options.php | 134 +++++++++++++++++ tests/DomTest.php | 46 +----- tests/Options/NoSlashTest.php | 45 ++++++ tests/Options/SelfClosingTest.php | 45 ++++++ 7 files changed, 321 insertions(+), 189 deletions(-) create mode 100644 src/PHPHtmlParser/DTO/TagDTO.php create mode 100644 tests/Options/NoSlashTest.php create mode 100644 tests/Options/SelfClosingTest.php diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 1f15c492..6ac22197 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -17,12 +17,12 @@ final class AttributeDTO /** * @var bool */ - private $doubleQuote = true; + private $doubleQuote; public function __construct(array $values) { $this->value = $values['value']; - $this->doubleQuote = $values['doubleQuote']; + $this->doubleQuote = $values['doubleQuote'] ?? 
true; } public function getValue(): ?string diff --git a/src/PHPHtmlParser/DTO/TagDTO.php b/src/PHPHtmlParser/DTO/TagDTO.php new file mode 100644 index 00000000..d1c365e1 --- /dev/null +++ b/src/PHPHtmlParser/DTO/TagDTO.php @@ -0,0 +1,70 @@ +status = $values['status'] ?? false; + $this->closing = $values['closing'] ?? false; + $this->node = $values['node'] ?? null; + $this->tag = $values['tag'] ?? null; + } + + /** + * @return bool + */ + public function isStatus(): bool + { + return $this->status; + } + + /** + * @return bool + */ + public function isClosing(): bool + { + return $this->closing; + } + + /** + * @return mixed + */ + public function getNode(): ?HtmlNode + { + return $this->node; + } + + /** + * @return mixed + */ + public function getTag(): ?string + { + return $this->tag; + } +} diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index d2db15e2..68c8d144 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -10,6 +10,7 @@ use PHPHtmlParser\Dom\Collection; use PHPHtmlParser\Dom\HtmlNode; use PHPHtmlParser\Dom\TextNode; +use PHPHtmlParser\DTO\TagDTO; use PHPHtmlParser\Enum\StringToken; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; @@ -84,46 +85,17 @@ class Dom */ private $options; - /** - * A list of tags which will always be self closing. - * - * @var array - */ - private $selfClosing = [ - 'area', - 'base', - 'basefont', - 'br', - 'col', - 'embed', - 'hr', - 'img', - 'input', - 'keygen', - 'link', - 'meta', - 'param', - 'source', - 'spacer', - 'track', - 'wbr', - ]; - - /** - * A list of tags where there should be no /> at the end (html5 style). - * - * @var array - */ - private $noSlash = []; - /** * Returns the inner html of the root node. 
* * @throws ChildNotFoundException * @throws UnknownChildTypeException + * @throws NotLoadedException */ public function __toString(): string { + $this->isLoaded(); + return $this->root->innerHtml(); } @@ -132,10 +104,14 @@ public function __toString(): string * * @param string $name * + * @throws NotLoadedException + * * @return mixed */ public function __get($name) { + $this->isLoaded(); + return $this->root->$name; } @@ -242,100 +218,6 @@ public function find(string $selector, int $nth = null) return $this->root->find($selector, $nth); } - /** - * Adds the tag (or tags in an array) to the list of tags that will always - * be self closing. - * - * @param string|array $tag - * @chainable - */ - public function addSelfClosingTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - foreach ($tag as $value) { - $this->selfClosing[] = $value; - } - - return $this; - } - - /** - * Removes the tag (or tags in an array) from the list of tags that will - * always be self closing. - * - * @param string|array $tag - * @chainable - */ - public function removeSelfClosingTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - $this->selfClosing = \array_diff($this->selfClosing, $tag); - - return $this; - } - - /** - * Sets the list of self closing tags to empty. - * - * @chainable - */ - public function clearSelfClosingTags(): Dom - { - $this->selfClosing = []; - - return $this; - } - - /** - * Adds a tag to the list of self closing tags that should not have a trailing slash. - * - * @param $tag - * @chainable - */ - public function addNoSlashTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - foreach ($tag as $value) { - $this->noSlash[] = $value; - } - - return $this; - } - - /** - * Removes a tag from the list of no-slash tags. 
- * - * @param $tag - * @chainable - */ - public function removeNoSlashTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - $this->noSlash = \array_diff($this->noSlash, $tag); - - return $this; - } - - /** - * Empties the list of no-slash tags. - * - * @chainable - */ - public function clearNoSlashTags(): Dom - { - $this->noSlash = []; - - return $this; - } - /** * Simple wrapper function that returns the first child. * @@ -574,18 +456,18 @@ private function parse(): void $str = $this->content->copyUntil('<'); } if ($str == '') { - $info = $this->parseTag(); - if (!$info['status']) { + $tagDTO = $this->parseTag(); + if (!$tagDTO->isStatus()) { // we are done here $activeNode = null; continue; } // check if it was a closing tag - if ($info['closing']) { + if ($tagDTO->isClosing()) { $foundOpeningTag = true; $originalNode = $activeNode; - while ($activeNode->getTag()->name() != $info['tag']) { + while ($activeNode->getTag()->name() != $tagDTO->getTag()) { $activeNode = $activeNode->getParent(); if ($activeNode === null) { // we could not find opening tag @@ -600,12 +482,12 @@ private function parse(): void continue; } - if (!isset($info['node'])) { + if ($tagDTO->getNode() === null) { continue; } /** @var AbstractNode $node */ - $node = $info['node']; + $node = $tagDTO->getNode(); $activeNode->addChild($node); // check if node is self closing @@ -628,7 +510,7 @@ private function parse(): void * * @throws StrictException */ - private function parseTag(): array + private function parseTag(): TagDTO { $return = [ 'status' => false, @@ -637,7 +519,7 @@ private function parseTag(): array ]; if ($this->content->char() != '<') { // we are not at the beginning of a tag - return $return; + return new TagDTO(); } // check if this is a closing tag @@ -645,7 +527,7 @@ private function parseTag(): array $this->content->fastForward(1); } catch (ContentLengthException $exception) { // we are at the end of the file - return $return; + return new TagDTO(); } if 
($this->content->char() == '/') { // end tag @@ -657,22 +539,22 @@ private function parseTag(): array // check if this closing tag counts $tag = \strtolower($tag); - if (\in_array($tag, $this->selfClosing, true)) { + if (\in_array($tag, $this->options->getSelfClosing(), true)) { $return['status'] = true; - return $return; + return new TagDTO($return); } $return['status'] = true; $return['closing'] = true; $return['tag'] = \strtolower($tag); - return $return; + return new TagDTO($return); } $tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true)); if (\trim($tag) == '') { // no tag found, invalid < found - return $return; + return new TagDTO(); } $node = new HtmlNode($tag); $node->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode()); @@ -754,7 +636,7 @@ private function parseTag(): array // self closing tag $node->getTag()->selfClosing(); $this->content->fastForward(1); - } elseif (\in_array($tag, $this->selfClosing, true)) { + } elseif (\in_array($tag, $this->options->getSelfClosing(), true)) { // Should be a self closing tag, check if we are strict if ($this->options->isStrict()) { $character = $this->content->getPosition(); @@ -765,7 +647,7 @@ private function parseTag(): array $node->getTag()->selfClosing(); // Should this tag use a trailing slash? - if (\in_array($tag, $this->noSlash, true)) { + if (\in_array($tag, $this->options->getNoSlash(), true)) { $node->getTag()->noTrailingSlash(); } } @@ -777,7 +659,7 @@ private function parseTag(): array $return['status'] = true; $return['node'] = $node; - return $return; + return new TagDTO($return); } /** diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index 9d1dfbfd..d995dbc8 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -88,6 +88,38 @@ class Options */ private $htmlSpecialCharsDecode = false; + /** + * A list of tags which will always be self closing. 
+ * + * @var array + */ + private $selfClosing = [ + 'area', + 'base', + 'basefont', + 'br', + 'col', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'spacer', + 'track', + 'wbr', + ]; + + /** + * A list of tags where there should be no /> at the end (html5 style). + * + * @var array + */ + private $noSlash = []; + public function isWhitespaceTextNode(): bool { return $this->whitespaceTextNode; @@ -208,6 +240,106 @@ public function setHtmlSpecialCharsDecode(bool $htmlSpecialCharsDecode): Options return $this; } + public function getSelfClosing(): array + { + return $this->selfClosing; + } + + public function setSelfClosing(array $selfClosing): Options + { + $this->selfClosing = $selfClosing; + + return $this; + } + + /** + * Adds the tag to the list of tags that will always be self closing. + */ + public function addSelfClosingTag(string $tag): Options + { + $this->selfClosing[] = $tag; + + return $this; + } + + /** + * Adds the tags to the list of tags that will always be self closing. + * + * @param string[] $tags + */ + public function addSelfClosingTags(array $tags): Options + { + foreach ($tags as $tag) { + $this->selfClosing[] = $tag; + } + + return $this; + } + + /** + * Removes the tag from the list of tags that will always be self closing. + */ + public function removeSelfClosingTag(string $tag): Options + { + $tags = [$tag]; + $this->selfClosing = \array_diff($this->selfClosing, $tags); + + return $this; + } + + /** + * Sets the list of self closing tags to empty. + */ + public function clearSelfClosingTags(): Options + { + $this->selfClosing = []; + + return $this; + } + + public function getNoSlash(): array + { + return $this->noSlash; + } + + public function setNoSlash(array $noSlash): Options + { + $this->noSlash = $noSlash; + + return $this; + } + + /** + * Adds a tag to the list of self closing tags that should not have a trailing slash. 
+ */ + public function addNoSlashTag(string $tag): Options + { + $this->noSlash[] = $tag; + + return $this; + } + + /** + * Removes a tag from the list of no-slash tags. + */ + public function removeNoSlashTag(string $tag): Options + { + $tags = [$tag]; + $this->noSlash = \array_diff($this->noSlash, $tags); + + return $this; + } + + /** + * Empties the list of no-slash tags. + */ + public function clearNoSlashTags(): Options + { + $this->noSlash = []; + + return $this; + } + public function setFromOptions(Options $options): void { $this->setCleanupInput($options->isCleanupInput()); @@ -220,5 +352,7 @@ public function setFromOptions(Options $options): void $this->setRemoveStyles($options->isRemoveStyles()); $this->setStrict($options->isStrict()); $this->setWhitespaceTextNode($options->isWhitespaceTextNode()); + $this->setSelfClosing($options->getSelfClosing()); + $this->setNoSlash($options->getNoSlash()); } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 81396962..96756d6c 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -104,50 +104,6 @@ public function testLoadClosingTagOnSelfClosing() $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } - public function testLoadClosingTagOnSelfClosingNoSlash() - { - $dom = new Dom(); - $dom->addNoSlashTag('br'); - - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagAddSelfClosingTag() - { - $dom = new Dom(); - $dom->addSelfClosingTag('mytag'); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagAddSelfClosingTagArray() - { - $dom = new Dom(); - $dom->addSelfClosingTag([ - 'mytag', - 'othertag', - ]); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagRemoveSelfClosingTag() - { - $dom = new Dom(); - $dom->removeSelfClosingTag('br'); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagClearSelfClosingTag() - { - $dom = new Dom(); - $dom->clearSelfClosingTags(); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); - } - public function testLoadNoValueAttribute() { $dom = new Dom(); @@ -239,7 +195,7 @@ public function testLoadFileBigTwicePreserveOption() { $dom = new Dom(); $dom->loadFromFile('tests/data/files/big.html', - (new Options)->setPreserveLineBreaks(true)); + (new Options())->setPreserveLineBreaks(true)); $post = $dom->find('.post-row', 0); $this->assertEquals( "

Журчанье воды
\nЧерно-белые тени
\nВновь на фонтане

", diff --git a/tests/Options/NoSlashTest.php b/tests/Options/NoSlashTest.php new file mode 100644 index 00000000..93370b56 --- /dev/null +++ b/tests/Options/NoSlashTest.php @@ -0,0 +1,45 @@ +setOptions((new Options())->addNoSlashTag('br')); + + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagOnSelfClosingRemoveNoSlash() + { + $dom = new Dom(); + $dom->setOptions( + (new Options()) + ->addNoSlashTag('br') + ->removeNoSlashTag('br') + ); + + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagOnSelfClosingClearNoSlash() + { + $dom = new Dom(); + $dom->setOptions( + (new Options()) + ->addNoSlashTag('br') + ->clearNoSlashTags() + ); + + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } +} diff --git a/tests/Options/SelfClosingTest.php b/tests/Options/SelfClosingTest.php new file mode 100644 index 00000000..f1349821 --- /dev/null +++ b/tests/Options/SelfClosingTest.php @@ -0,0 +1,45 @@ +setOptions((new Options())->addSelfClosingTag('mytag')); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagAddSelfClosingTagArray() + { + $dom = new Dom(); + $dom->setOptions((new Options())->addSelfClosingTags([ + 'mytag', + 'othertag', + ])); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagRemoveSelfClosingTag() + { + $dom = new Dom(); + $dom->setOptions((new Options())->removeSelfClosingTag('br')); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagClearSelfClosingTag() + { + $dom = new Dom(); + $dom->setOptions((new Options())->clearSelfClosingTags()); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); + } +} From 8f43f08fe9f5f4d002e4cf8124a6e48c0d582652 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 03:13:13 +0000 Subject: [PATCH 178/200] Fixed #215 - Added support for options = new Options(); if ($this->globalOptions !== null) { - $this->options->setFromOptions($this->globalOptions); + $this->options = $this->options->setFromOptions($this->globalOptions); } if ($options !== null) { - $this->options->setFromOptions($options); + $this->options = $this->options->setFromOptions($options); } $this->rawSize = \strlen($str); @@ -194,7 +195,7 @@ public function loadStr(string $str, ?Options $options = null): Dom /** * Sets a global options array to be used by all load calls. * - * @chainable + * */ public function setOptions(Options $options): Dom { @@ -512,11 +513,7 @@ private function parse(): void */ private function parseTag(): TagDTO { - $return = [ - 'status' => false, - 'closing' => false, - 'node' => null, - ]; + $return = []; if ($this->content->char() != '<') { // we are not at the beginning of a tag return new TagDTO(); @@ -549,12 +546,20 @@ private function parseTag(): TagDTO $return['tag'] = \strtolower($tag); return new TagDTO($return); - } - - $tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true)); - if (\trim($tag) == '') { - // no tag found, invalid < found - return new TagDTO(); + } elseif ($this->content->char() == '?') { + // special setting tag + $tag = $this->content->fastForward(1) + ->copyByToken(StringToken::SLASH(), true); + $tag = (new Tag($tag)) + ->setOpening('setClosing(' ?>') + ->selfClosing(); + } else { + $tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true)); + if (\trim($tag) == '') { + // no tag found, invalid < found + return new TagDTO(); + } } $node = new HtmlNode($tag); $node->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode()); @@ -631,23 +636,22 @@ private function parseTag(): TagDTO } 
$this->content->skipByToken(StringToken::BLANK()); - $tag = \strtolower($tag); if ($this->content->char() == '/') { // self closing tag $node->getTag()->selfClosing(); $this->content->fastForward(1); - } elseif (\in_array($tag, $this->options->getSelfClosing(), true)) { + } elseif (\in_array($node->getTag()->name(), $this->options->getSelfClosing(), true)) { // Should be a self closing tag, check if we are strict if ($this->options->isStrict()) { $character = $this->content->getPosition(); - throw new StrictException("Tag '$tag' is not self closing! (character #$character)"); + throw new StrictException("Tag '".$node->getTag()->name()."' is not self closing! (character #$character)"); } // We force self closing on this tag. $node->getTag()->selfClosing(); // Should this tag use a trailing slash? - if (\in_array($tag, $this->options->getNoSlash(), true)) { + if (\in_array($node->getTag()->name(), $this->options->getNoSlash(), true)) { $node->getTag()->noTrailingSlash(); } } diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 3d67ab5c..9c89b5ce 100755 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -304,7 +304,7 @@ public function getTag(): Tag * Replaces the tag for this node. * * @param string|Tag $tag - * @chainable + * */ public function setTag($tag): AbstractNode { @@ -365,7 +365,7 @@ public function hasAttribute(string $key): bool * A wrapper method that simply calls the setAttribute method * on the tag of this node. * - * @chainable + * */ public function setAttribute(string $key, ?string $value, bool $doubleQuote = true): AbstractNode { diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 60c120d7..07f16c7d 100755 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -205,7 +205,7 @@ public function insertAfter(AbstractNode $child, int $id): bool /** * Removes the child by id. 
* - * @chainable + * */ public function removeChild(int $id): InnerNode { diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 8fc84874..a98590e4 100755 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -58,6 +58,19 @@ class Tag */ private $HtmlSpecialCharsDecode = false; + /** + * What the opening of this tag will be. + * + * @var string + */ + private $opening = '<'; + + /** + * What the closing tag for self-closing elements should be. + * @var string + */ + private $closing = ' />'; + /** * Sets up the tag with a name. * @@ -79,25 +92,37 @@ public function name(): string /** * Sets the tag to be self closing. * - * @chainable + * */ public function selfClosing(): Tag { $this->selfClosing = true; - return $this; + return clone $this; + } + + public function setOpening(string $opening): Tag + { + $this->opening = $opening; + return clone $this; + } + + public function setClosing(string $closing): Tag + { + $this->closing = $closing; + return clone $this; } /** * Sets the tag to not use a trailing slash. * - * @chainable + * */ public function noTrailingSlash(): Tag { $this->trailingSlash = false; - return $this; + return clone $this; } /** @@ -131,7 +156,7 @@ public function noise(string $noise): Tag { $this->noise = $noise; - return $this; + return clone $this; } /** @@ -148,7 +173,7 @@ public function setAttribute(string $key, ?string $attributeValue, bool $doubleQ } $this->attr[\strtolower($key)] = $attributeDTO; - return $this; + return clone $this; } /** @@ -296,7 +321,7 @@ public function hasAttribute(string $key) */ public function makeOpeningTag() { - $return = '<' . $this->name; + $return = $this->opening . $this->name; // add the attributes foreach (\array_keys($this->attr) as $key) { @@ -317,7 +342,7 @@ public function makeOpeningTag() } if ($this->selfClosing && $this->trailingSlash) { - return $return . ' />'; + return $return . $this->closing; } return $return . 
'>'; diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index d995dbc8..ffd3c734 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -91,7 +91,7 @@ class Options /** * A list of tags which will always be self closing. * - * @var array + * @var string[] */ private $selfClosing = [ 'area', @@ -116,7 +116,7 @@ class Options /** * A list of tags where there should be no /> at the end (html5 style). * - * @var array + * @var string[] */ private $noSlash = []; @@ -129,7 +129,7 @@ public function setWhitespaceTextNode(bool $whitespaceTextNode): Options { $this->whitespaceTextNode = $whitespaceTextNode; - return $this; + return clone $this; } public function isStrict(): bool @@ -141,7 +141,7 @@ public function setStrict(bool $strict): Options { $this->strict = $strict; - return $this; + return clone $this; } public function getEnforceEncoding(): ?string @@ -153,7 +153,7 @@ public function setEnforceEncoding(?string $enforceEncoding): Options { $this->enforceEncoding = $enforceEncoding; - return $this; + return clone $this; } public function isCleanupInput(): bool @@ -165,7 +165,7 @@ public function setCleanupInput(bool $cleanupInput): Options { $this->cleanupInput = $cleanupInput; - return $this; + return clone $this; } public function isRemoveScripts(): bool @@ -177,7 +177,7 @@ public function setRemoveScripts(bool $removeScripts): Options { $this->removeScripts = $removeScripts; - return $this; + return clone $this; } public function isRemoveStyles(): bool @@ -189,7 +189,7 @@ public function setRemoveStyles(bool $removeStyles): Options { $this->removeStyles = $removeStyles; - return $this; + return clone $this; } public function isPreserveLineBreaks(): bool @@ -201,7 +201,7 @@ public function setPreserveLineBreaks(bool $preserveLineBreaks): Options { $this->preserveLineBreaks = $preserveLineBreaks; - return $this; + return clone $this; } public function isRemoveDoubleSpace(): bool @@ -213,7 +213,7 @@ public function 
setRemoveDoubleSpace(bool $removeDoubleSpace): Options { $this->removeDoubleSpace = $removeDoubleSpace; - return $this; + return clone $this; } public function isRemoveSmartyScripts(): bool @@ -225,7 +225,7 @@ public function setRemoveSmartyScripts(bool $removeSmartyScripts): Options { $this->removeSmartyScripts = $removeSmartyScripts; - return $this; + return clone $this; } public function isHtmlSpecialCharsDecode(): bool @@ -237,9 +237,12 @@ public function setHtmlSpecialCharsDecode(bool $htmlSpecialCharsDecode): Options { $this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode; - return $this; + return clone $this; } + /** + * @return string[] + */ public function getSelfClosing(): array { return $this->selfClosing; @@ -249,7 +252,7 @@ public function setSelfClosing(array $selfClosing): Options { $this->selfClosing = $selfClosing; - return $this; + return clone $this; } /** @@ -259,7 +262,7 @@ public function addSelfClosingTag(string $tag): Options { $this->selfClosing[] = $tag; - return $this; + return clone $this; } /** @@ -273,7 +276,7 @@ public function addSelfClosingTags(array $tags): Options $this->selfClosing[] = $tag; } - return $this; + return clone $this; } /** @@ -284,7 +287,7 @@ public function removeSelfClosingTag(string $tag): Options $tags = [$tag]; $this->selfClosing = \array_diff($this->selfClosing, $tags); - return $this; + return clone $this; } /** @@ -294,19 +297,25 @@ public function clearSelfClosingTags(): Options { $this->selfClosing = []; - return $this; + return clone $this; } + /** + * @return string[] + */ public function getNoSlash(): array { return $this->noSlash; } + /** + * @param string[] $noSlash + */ public function setNoSlash(array $noSlash): Options { $this->noSlash = $noSlash; - return $this; + return clone $this; } /** @@ -316,7 +325,7 @@ public function addNoSlashTag(string $tag): Options { $this->noSlash[] = $tag; - return $this; + return clone $this; } /** @@ -327,7 +336,7 @@ public function removeNoSlashTag(string $tag): 
Options $tags = [$tag]; $this->noSlash = \array_diff($this->noSlash, $tags); - return $this; + return clone $this; } /** @@ -337,22 +346,24 @@ public function clearNoSlashTags(): Options { $this->noSlash = []; - return $this; + return clone $this; } - public function setFromOptions(Options $options): void + public function setFromOptions(Options $options): Options { - $this->setCleanupInput($options->isCleanupInput()); - $this->setEnforceEncoding($options->getEnforceEncoding()); - $this->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode()); - $this->setPreserveLineBreaks($options->isPreserveLineBreaks()); - $this->setRemoveDoubleSpace($options->isRemoveDoubleSpace()); - $this->setRemoveScripts($options->isRemoveScripts()); - $this->setRemoveSmartyScripts($options->isRemoveSmartyScripts()); - $this->setRemoveStyles($options->isRemoveStyles()); - $this->setStrict($options->isStrict()); - $this->setWhitespaceTextNode($options->isWhitespaceTextNode()); - $this->setSelfClosing($options->getSelfClosing()); - $this->setNoSlash($options->getNoSlash()); + $newOptions = $this->setCleanupInput($options->isCleanupInput()) + ->setEnforceEncoding($options->getEnforceEncoding()) + ->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode()) + ->setPreserveLineBreaks($options->isPreserveLineBreaks()) + ->setRemoveDoubleSpace($options->isRemoveDoubleSpace()) + ->setRemoveScripts($options->isRemoveScripts()) + ->setRemoveSmartyScripts($options->isRemoveSmartyScripts()) + ->setRemoveStyles($options->isRemoveStyles()) + ->setStrict($options->isStrict()) + ->setWhitespaceTextNode($options->isWhitespaceTextNode()) + ->setSelfClosing($options->getSelfClosing()) + ->setNoSlash($options->getNoSlash()); + + return $newOptions; } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 96756d6c..fed044b8 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -611,4 +611,12 @@ public function testBrokenHtml() $this->assertEquals('', $dom->outerHtml); } + + public function 
testXMLOpeningToken() + { + $dom = new Dom(); + $dom->loadStr('

fun time

'); + + $this->assertEquals('

fun time

', $dom->outerHtml); + } } diff --git a/tests/Node/TagTest.php b/tests/Node/TagTest.php index 601e82eb..f14de7e2 100755 --- a/tests/Node/TagTest.php +++ b/tests/Node/TagTest.php @@ -159,8 +159,8 @@ public function testMakeOpeningTagSelfClosing() ], ]; - $tag = new Tag('div'); - $tag->selfClosing() + $tag = (new Tag('div')) + ->selfClosing() ->setAttributes($attr); $this->assertEquals('
', $tag->makeOpeningTag()); } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index f7406a14..43dd6fb6 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -29,7 +29,7 @@ public function testOverwritingOption() $options2 = new Options(); $options2->setStrict(true); $options2->setWhitespaceTextNode(false); - $options->setFromOptions($options2); + $options = $options->setFromOptions($options2); $this->assertTrue($options->isStrict()); $this->assertFalse($options->isWhitespaceTextNode()); From a78054fec726427e723bd31fcf10f21403e5e4d7 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 03:56:54 +0000 Subject: [PATCH 179/200] Cleaned up code --- .travis.yml | 4 ++-- src/PHPHtmlParser/Contracts/DomInterface.php | 23 ++++++++++++++++++++ src/PHPHtmlParser/Dom.php | 5 +++-- src/PHPHtmlParser/Dom/InnerNode.php | 11 ++++++++++ src/PHPHtmlParser/Selector/Seeker.php | 11 +++++----- src/PHPHtmlParser/Selector/Selector.php | 2 -- tests/Selector/SeekerTest.php | 2 +- 7 files changed, 46 insertions(+), 12 deletions(-) create mode 100644 src/PHPHtmlParser/Contracts/DomInterface.php diff --git a/.travis.yml b/.travis.yml index 9ffb2529..a7abcac9 100755 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ script: - mkdir -p build/logs - php vendor/bin/phpunit --coverage-clover build/logs/clover.xml -after_script: - - travis_retry php vendor/bin/coveralls +after_success: + - travis_retry php vendor/bin/php-coveralls -v - wget https://scrutinizer-ci.com/ocular.phar - php ocular.phar code-coverage:upload --format=php-clover build/logs/clover.xml diff --git a/src/PHPHtmlParser/Contracts/DomInterface.php b/src/PHPHtmlParser/Contracts/DomInterface.php new file mode 100644 index 00000000..b803f8f2 --- /dev/null +++ b/src/PHPHtmlParser/Contracts/DomInterface.php @@ -0,0 +1,23 @@ +root; while ($activeNode !== null) { if ($activeNode && $activeNode->tag->name() === 'script' - && $this->options->isCleanupInput() != true + && 
$this->options->isCleanupInput() !== true ) { $str = $this->content->copyUntil('children = $combination; // tell child I am the new parent @@ -300,6 +305,8 @@ public function isChild(int $id): bool /** * Removes the child with id $childId and replace it with the new child * $newChild. + * + * @throws LogicalException */ public function replaceChild(int $childId, AbstractNode $newChild): void { @@ -312,6 +319,10 @@ public function replaceChild(int $childId, AbstractNode $newChild): void $index = \array_search($childId, $keys, true); $keys[$index] = $newChild->id(); $combination = \array_combine($keys, $this->children); + if ($combination === false) { + // The number of elements for each array isn't equal or if the arrays are empty. + throw new LogicalException('array combine failed during replace child method call.'); + } $this->children = $combination; $this->children[$newChild->id()] = [ 'prev' => $oldChild['prev'], diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index ca92cb29..523aa42e 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -63,12 +63,12 @@ public function seek(array $nodes, RuleDTO $rule, array $options): array } $pass = $this->checkTag($rule, $child); - if ($pass && $rule->getKey() != null) { + if ($pass && $rule->getKey() !== null) { $pass = $this->checkKey($rule, $child); } if ($pass && - $rule->getKey() != null && - $rule->getValue() != null && + $rule->getKey() !== null && + $rule->getValue() !== null && $rule->getValue() != '*' ) { $pass = $this->checkComparison($rule, $child); @@ -238,8 +238,9 @@ private function checkNodeValue( ): bool { $check = false; if ( - $rule->getValue() != null && - \is_string($rule->getValue()) + $rule->getValue() !== null && + \is_string($rule->getValue()) && + $nodeValue !== null ) { $check = $this->match($rule->getOperator(), $rule->getValue(), $nodeValue); } diff --git a/src/PHPHtmlParser/Selector/Selector.php 
b/src/PHPHtmlParser/Selector/Selector.php index 4c45da01..7179ee1f 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -48,8 +48,6 @@ public function __construct(string $selector, ?ParserInterface $parser = null, ? /** * Returns the selectors that where found in __construct. - * - * @return array */ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO { diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index 4e2d9e4f..a5106e98 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -19,7 +19,7 @@ public function testSeekReturnEmptyArray() 'alterNext' => false, ]); $seeker = new Seeker(); - $results = $seeker->seek([], $ruleDTO, [], false); + $results = $seeker->seek([], $ruleDTO, []); $this->assertCount(0, $results); } } From 3f1f6d60e4572c43c50378b14f102bc9caf33f30 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 17 Jul 2020 03:14:48 +0000 Subject: [PATCH 180/200] Cleaned up tests --- .travis.yml | 1 - phpunit.xml | 45 +++++++------ src/PHPHtmlParser/Dom.php | 2 +- tests/Dom/LoadTest.php | 99 ++++++++++++++++++++++++++++ tests/Dom/NotLoadedTest.php | 34 ++++++++++ tests/DomTest.php | 127 ++++-------------------------------- 6 files changed, 170 insertions(+), 138 deletions(-) create mode 100644 tests/Dom/LoadTest.php create mode 100644 tests/Dom/NotLoadedTest.php diff --git a/.travis.yml b/.travis.yml index a7abcac9..25ba270f 100755 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: php php: - - 7.1 - 7.2 - 7.3 - 7.4 diff --git a/phpunit.xml b/phpunit.xml index d0aa7db8..04b1d77d 100755 --- a/phpunit.xml +++ b/phpunit.xml @@ -1,26 +1,29 @@ - - - ./tests/ - - + + + ./tests/ + + - - - src - - vendor - - - + + + src + + vendor + + + diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 10d19144..df5b344e 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -349,7 +349,7 @@ private 
function isLoaded(): void */ private function clean(string $str): string { - if ($this->options->isCleanupInput() != true) { + if (!$this->options->isCleanupInput()) { // skip entire cleanup step return $str; } diff --git a/tests/Dom/LoadTest.php b/tests/Dom/LoadTest.php new file mode 100644 index 00000000..6079d2f5 --- /dev/null +++ b/tests/Dom/LoadTest.php @@ -0,0 +1,99 @@ +loadStr('

Hey bro, click here


'); + $this->dom = $dom; + } + + public function tearDown() + { + Mockery::close(); + } + + public function testLoadEscapeQuotes() + { + $a = $this->dom->find('a', 0); + $this->assertEquals('click here', $a->outerHtml); + } + + public function testLoadNoClosingTag() + { + $p = $this->dom->find('p', 0); + $this->assertEquals('Hey bro, click here', $p->innerHtml); + } + + public function testLoadClosingTagOnSelfClosing() + { + $this->assertCount(2, $this->dom->find('br')); + } + + public function testIncorrectAccess() + { + $div = $this->dom->find('div', 0); + $this->assertEquals(null, $div->foo); + } + + public function testLoadAttributeOnSelfClosing() + { + $br = $this->dom->find('br', 1); + $this->assertEquals('both', $br->getAttribute('class')); + } + + public function testToStringMagic() + { + $this->assertEquals('

Hey bro, click here


', (string) $this->dom); + } + + public function testGetMagic() + { + $this->assertEquals('

Hey bro, click here


', $this->dom->innerHtml); + } + + public function testFirstChild() + { + $this->assertEquals('

Hey bro, click here

', $this->dom->firstChild()->outerHtml); + } + + public function testLastChild() + { + $this->assertEquals('
', $this->dom->lastChild()->outerHtml); + } + + public function testGetElementById() + { + $this->assertEquals('click here', $this->dom->getElementById('78')->outerHtml); + } + + public function testGetElementsByTag() + { + $this->assertEquals('

Hey bro, click here

', $this->dom->getElementsByTag('p')[0]->outerHtml); + } + + public function testGetElementsByClass() + { + $this->assertEquals('

Hey bro, click here

', $this->dom->getElementsByClass('all')[0]->innerHtml); + } + + public function testDeleteNode() + { + $a = $this->dom->find('a')[0]; + $a->delete(); + unset($a); + $this->assertEquals('

Hey bro,


', (string) $this->dom); + } +} diff --git a/tests/Dom/NotLoadedTest.php b/tests/Dom/NotLoadedTest.php new file mode 100644 index 00000000..a8cc42ff --- /dev/null +++ b/tests/Dom/NotLoadedTest.php @@ -0,0 +1,34 @@ +dom = $dom; + } + + public function tearDown() + { + Mockery::close(); + } + + public function testNotLoaded() + { + $this->expectException(NotLoadedException::class); + $div = $this->dom->find('div', 0); + } +} + + diff --git a/tests/DomTest.php b/tests/DomTest.php index fed044b8..fb740235 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Exceptions\NotLoadedException; use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; @@ -25,31 +26,6 @@ public function testParsingCData() $this->assertSame($html, $dom->root->outerHtml()); } - public function testloadStr() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $div = $dom->find('div', 0); - $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); - } - - /** - * @expectedException \PHPHtmlParser\Exceptions\NotLoadedException - */ - public function testNotLoaded() - { - $dom = new Dom(); - $div = $dom->find('div', 0); - } - - public function testIncorrectAccess() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $div = $dom->find('div', 0); - $this->assertEquals(null, $div->foo); - } - public function testLoadSelfclosingAttr() { $dom = new Dom(); @@ -66,14 +42,6 @@ public function testLoadSelfclosingAttrToString() $this->assertEquals('
', (string) $br); } - public function testLoadEscapeQuotes() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here

'); - $div = $dom->find('div', 0); - $this->assertEquals('

Hey bro, click here

', $div->outerHtml); - } - public function testLoadNoOpeningTag() { $dom = new Dom(); @@ -81,29 +49,6 @@ public function testLoadNoOpeningTag() $this->assertEquals('content', $dom->find('.content', 0)->text); } - public function testLoadNoClosingTag() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $root = $dom->find('div', 0)->getParent(); - $this->assertEquals('

Hey bro, click here


', $root->outerHtml); - } - - public function testLoadAttributeOnSelfClosing() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $br = $dom->find('br', 0); - $this->assertEquals('both', $br->getAttribute('class')); - } - - public function testLoadClosingTagOnSelfClosing() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - public function testLoadNoValueAttribute() { $dom = new Dom(); @@ -223,55 +168,6 @@ public function testLoadFromUrl() $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } - public function testToStringMagic() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $this->assertEquals('

Hey bro, click here
:)

', (string) $dom); - } - - public function testGetMagic() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $this->assertEquals('

Hey bro, click here
:)

', $dom->innerHtml); - } - - public function testFirstChild() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('

Hey bro, click here

', $dom->firstChild()->outerHtml); - } - - public function testLastChild() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('
', $dom->lastChild()->outerHtml); - } - - public function testGetElementById() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('click here', $dom->getElementById('78')->outerHtml); - } - - public function testGetElementsByTag() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('

Hey bro, click here

', $dom->getElementsByTag('p')[0]->outerHtml); - } - - public function testGetElementsByClass() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('

Hey bro, click here

', $dom->getElementsByClass('all')[0]->innerHtml); - } - public function testScriptCleanerScriptTag() { $dom = new Dom(); @@ -321,16 +217,6 @@ public function testCodeTag() $this->assertEquals('hello$foo = "bar";', (string) $dom); } - public function testDeleteNode() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $a = $dom->find('a')[0]; - $a->delete(); - unset($a); - $this->assertEquals('

Hey bro,
:)

', (string) $dom); - } - public function testCountChildren() { $dom = new Dom(); @@ -619,4 +505,15 @@ public function testXMLOpeningToken() $this->assertEquals('

fun time

', $dom->outerHtml); } + + /** + * Test to cover issue found in ticket #221 + */ + public function testRandomTagInMiddleOfText() + { + $dom = new Dom(); + $dom->loadStr('

Hello, this is just a test in which <55 names with some other text > should be interpreted as text

'); + + $this->assertEquals('

Hello, this is just a test in which <55 names with some other text> should be interpreted as text

', $dom->outerHtml); + } } From c487fce3c8f931a7dbbbe630666316625c7d247a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 17 Jul 2020 04:07:21 +0000 Subject: [PATCH 181/200] Cleaned up code base --- README.md | 12 +- .../Contracts/Dom/CleanerInterface.php | 16 + .../Contracts/Dom/ParserInterface.php | 33 + .../Contracts/Selector/SelectorInterface.php | 4 +- src/PHPHtmlParser/DTO/TagDTO.php | 2 +- .../Discovery/CleanerDiscovery.php | 25 + .../Discovery/DomParserDiscovery.php | 25 + ...covery.php => SelectorParserDiscovery.php} | 2 +- src/PHPHtmlParser/Dom.php | 582 ++---------------- src/PHPHtmlParser/Dom/Cleaner.php | 108 ++++ .../Dom/{ => Node}/AbstractNode.php | 3 +- .../Dom/{ => Node}/ArrayNode.php | 2 +- .../Dom/{ => Node}/Collection.php | 2 +- src/PHPHtmlParser/Dom/{ => Node}/HtmlNode.php | 3 +- .../Dom/{ => Node}/InnerNode.php | 2 +- src/PHPHtmlParser/Dom/{ => Node}/LeafNode.php | 2 +- src/PHPHtmlParser/Dom/{ => Node}/TextNode.php | 3 +- src/PHPHtmlParser/Dom/Parser.php | 332 ++++++++++ src/PHPHtmlParser/Dom/RootAccessTrait.php | 100 +++ src/PHPHtmlParser/Dom/Tag.php | 0 src/PHPHtmlParser/Finder.php | 4 +- src/PHPHtmlParser/Selector/Seeker.php | 6 +- src/PHPHtmlParser/Selector/Selector.php | 8 +- tests/CollectionTest.php | 4 +- tests/Dom/LoadTest.php | 1 - tests/DomTest.php | 5 +- tests/Node/ChildrenTest.php | 2 +- tests/Node/HtmlTest.php | 6 +- tests/Node/ParentTest.php | 2 +- tests/Node/TextTest.php | 2 +- tests/Selector/SelectorTest.php | 2 +- tests/data/MockNode.php | 2 +- 32 files changed, 729 insertions(+), 573 deletions(-) create mode 100644 src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php create mode 100644 src/PHPHtmlParser/Contracts/Dom/ParserInterface.php create mode 100644 src/PHPHtmlParser/Discovery/CleanerDiscovery.php create mode 100644 src/PHPHtmlParser/Discovery/DomParserDiscovery.php rename src/PHPHtmlParser/Discovery/{ParserDiscovery.php => SelectorParserDiscovery.php} (93%) create mode 100644 src/PHPHtmlParser/Dom/Cleaner.php 
rename src/PHPHtmlParser/Dom/{ => Node}/AbstractNode.php (99%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/ArrayNode.php (95%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/Collection.php (98%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/HtmlNode.php (98%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/InnerNode.php (99%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/LeafNode.php (76%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/TextNode.php (98%) mode change 100755 => 100644 create mode 100644 src/PHPHtmlParser/Dom/Parser.php create mode 100644 src/PHPHtmlParser/Dom/RootAccessTrait.php mode change 100755 => 100644 src/PHPHtmlParser/Dom/Tag.php diff --git a/README.md b/README.md index cbd64800..c46d9913 100755 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ You can find many examples of how to use the dom parser and any of its parts (wh ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadStr('

Hey bro, click here
:)

'); @@ -46,7 +46,7 @@ You may also seamlessly load a file into the dom instead of a string, which is m ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadFromFile('tests/data/big.html'); @@ -79,7 +79,7 @@ Loading a url is very similar to the way you would load the html from a file. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadFromUrl('http://google.com'); @@ -95,7 +95,7 @@ What makes the loadFromUrl method note worthy is the `PHPHtmlParser\CurlInterfac ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; use App\Services\Connector; $dom = new Dom; @@ -113,7 +113,7 @@ Loading a string directly, with out the checks in `load()` is also easily done. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadStr('String', []); @@ -130,7 +130,7 @@ You can also set parsing option that will effect the behavior of the parsing eng ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->setOptions([ diff --git a/src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php b/src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php new file mode 100644 index 00000000..e2a4f111 --- /dev/null +++ b/src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php @@ -0,0 +1,16 @@ +isLoaded(); + if ($domParser === null) { + $domParser = DomParserDiscovery::find(); + } + if ($domCleaner === null) { + $domCleaner = CleanerDiscovery::find(); + } - return $this->root->innerHtml(); + $this->domParser = $domParser; + $this->domCleaner = $domCleaner; } /** - * A simple wrapper around the root node. 
- * - * @param string $name + * Returns the inner html of the root node. * + * @throws ChildNotFoundException + * @throws UnknownChildTypeException * @throws NotLoadedException - * - * @return mixed */ - public function __get($name) + public function __toString(): string { $this->isLoaded(); - return $this->root->$name; + return $this->root->innerHtml(); } /** @@ -122,8 +92,9 @@ public function __get($name) * * @throws ChildNotFoundException * @throws CircularException - * @throws StrictException + * @throws Exceptions\ContentLengthException * @throws LogicalException + * @throws StrictException */ public function loadFromFile(string $file, ?Options $options = null): Dom { @@ -139,12 +110,12 @@ public function loadFromFile(string $file, ?Options $options = null): Dom * Use a curl interface implementation to attempt to load * the content from a url. * - * @param ClientInterface $client - * * @throws ChildNotFoundException * @throws CircularException + * @throws Exceptions\ContentLengthException + * @throws LogicalException * @throws StrictException - * @throws \Psr\Http\Client\ClientExceptionInterface + * @throws ClientExceptionInterface */ public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom { @@ -167,28 +138,26 @@ public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $cl * * @throws ChildNotFoundException * @throws CircularException + * @throws Exceptions\ContentLengthException + * @throws LogicalException * @throws StrictException */ public function loadStr(string $str, ?Options $options = null): Dom { - $this->options = new Options(); + $localOptions = new Options(); if ($this->globalOptions !== null) { - $this->options = $this->options->setFromOptions($this->globalOptions); + $localOptions = $localOptions->setFromOptions($this->globalOptions); } if ($options !== null) { - $this->options = $this->options->setFromOptions($options); + $localOptions = 
$localOptions->setFromOptions($options); } - $this->rawSize = \strlen($str); - $this->raw = $str; - - $html = $this->clean($str); + $html = $this->domCleaner->clean($str, $localOptions); - $this->size = \strlen($str); $this->content = new Content($html); - $this->parse(); - $this->detectCharset(); + $this->root = $this->domParser->parse($localOptions, $this->content, strlen($str)); + $this->domParser->detectCharset($localOptions, $this->defaultCharset, $this->root); return $this; } @@ -208,78 +177,16 @@ public function setOptions(Options $options): Dom /** * Find elements by css selector on the root node. * - * @throws ChildNotFoundException - * @throws NotLoadedException - * * @return mixed|Collection|null - */ - public function find(string $selector, int $nth = null) - { - $this->isLoaded(); - - return $this->root->find($selector, $nth); - } - - /** - * Simple wrapper function that returns the first child. - * - * @throws ChildNotFoundException * @throws NotLoadedException - */ - public function firstChild(): AbstractNode - { - $this->isLoaded(); - - return $this->root->firstChild(); - } - - /** - * Simple wrapper function that returns the last child. * * @throws ChildNotFoundException - * @throws NotLoadedException - */ - public function lastChild(): AbstractNode - { - $this->isLoaded(); - - return $this->root->lastChild(); - } - - /** - * Simple wrapper function that returns count of child elements. - * - * @throws NotLoadedException - */ - public function countChildren(): int - { - $this->isLoaded(); - - return $this->root->countChildren(); - } - - /** - * Get array of children. - * - * @throws NotLoadedException */ - public function getChildren(): array - { - $this->isLoaded(); - - return $this->root->getChildren(); - } - - /** - * Check if node have children nodes. 
- * - * @throws NotLoadedException - */ - public function hasChildren(): bool + public function find(string $selector, int $nth = null) { $this->isLoaded(); - return $this->root->hasChildren(); + return $this->root->find($selector, $nth); } /** @@ -288,10 +195,10 @@ public function hasChildren(): bool * * @param $id * - * @throws ChildNotFoundException + * @return mixed|Collection|null * @throws NotLoadedException * - * @return mixed|Collection|null + * @throws ChildNotFoundException */ public function getElementById($id) { @@ -304,10 +211,10 @@ public function getElementById($id) * Simple wrapper function that returns all elements by * tag name. * - * @throws ChildNotFoundException + * @return mixed|Collection|null * @throws NotLoadedException * - * @return mixed|Collection|null + * @throws ChildNotFoundException */ public function getElementsByTag(string $name) { @@ -320,10 +227,10 @@ public function getElementsByTag(string $name) * Simple wrapper function that returns all elements by * class name. * - * @throws ChildNotFoundException + * @return mixed|Collection|null * @throws NotLoadedException * - * @return mixed|Collection|null + * @throws ChildNotFoundException */ public function getElementsByClass(string $class) { @@ -343,395 +250,4 @@ private function isLoaded(): void throw new NotLoadedException('Content is not loaded!'); } } - - /** - * Cleans the html of any none-html information. - */ - private function clean(string $str): string - { - if (!$this->options->isCleanupInput()) { - // skip entire cleanup step - return $str; - } - - $is_gzip = 0 === \mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, 'US-ASCII'); - if ($is_gzip) { - $str = \gzdecode($str); - if ($str === false) { - throw new LogicalException('gzdecode returned false. 
Error when trying to decode the string.'); - } - } - - // remove white space before closing tags - $str = \mb_eregi_replace("'\s+>", "'>", $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.'); - } - $str = \mb_eregi_replace('"\s+>', '">', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.'); - } - - // clean out the \n\r - $replace = ' '; - if ($this->options->isPreserveLineBreaks()) { - $replace = ' '; - } - $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str); - if ($str === false) { - throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.'); - } - - // strip the doctype - $str = \mb_eregi_replace('', '', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.'); - } - - // strip out comments - $str = \mb_eregi_replace('', '', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.'); - } - - // strip out cdata - $str = \mb_eregi_replace("", '', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out cdata.'); - } - - // strip out + + + + + + + + + An Introduction to Custom Fields – WordPress.tv + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + +

An Introduction to Custom Fields

+ +
+ +
+ +
+
+ +
+
+
+
+

+ 16 responses on “An Introduction to Custom Fields

+ +
    +
  1. + + Alex (Viper007Bond) + +
    + + + + + + +
    +
  2. + +
  3. + + Consciência Planetária + +
    + + + + + +
    + +

    Nice explanation!

    +

    I always wondered how these thumbnails are added to magazine layout themes and I had no idea! Indeed I was more interested on the PHP code of how making it work 😛

    +

    It seems that with custom fields we can do pratically any customization…
    +But I have 2 questions

    +

    Are custom fields limited to varchar values, or can we use any kind of data there? Does it have any size limit?

    +

    Ans what about these themes that have a stick post with a bigger image? Does it have 2 custom fields 1 for thumnail and 1 for stick image?

    +

    tnx again!

    +

    Like

    +
    +
    +
  4. + +
  5. + + Vid + +
    + + + + + +
    + +

    Hi, Scott

    +

    This is very helpful for those of us who aren’t programmers but want to maximize WordPress. Thanks for taking the time to submit this tip.

    +

    Like

    +
    +
    +
  6. + +
  7. + + Thomas Clausen + +
    + + + + + +
    + +

    Justin Tadlocks Get the Image plugin can also help you to solve this task (also without CSS stuff in the php file 😉 ).

    +

    Like

    +
    +
    +
  8. + +
  9. + + driz + +
    + + + + + +
    + +

    I would like to see a follow-up video explaining about that Custom Field Template that you told us to ignore. I know it’s created by coding in some WP hooks in functions.php as I have done it myself, but it would be cool to see your interpretation of doing this, might pick up some additional tips.

    +

    Like

    +
    +
    +
  10. + +
  11. + + Scott Ellis + +
    + + + + + +
    + +

    Thomas, that is a good plugin, Justin does great stuff but custom fileds can be used for a lot of things, images were just an easy example, hopefully viewers will find other creative uses. Not sure what you mean about all the css in the php?

    +

    Like

    +
    +
    +
  12. +
      +
    • + + Thomas Clausen + +
      + + + + + +
      + +

      I was just wondering about the styling, that you’ve got about 3 minutes into the video.

      +

      But I didn’t mean to take our eyes of off things. You’re absolutely right the custom fields can be a powerful tool. And I hadn’t really grasped how easy it is, before I saw your video. Good job.

      +

      Like

      +
      +
      +
    • + +
    + +
  13. + + Karen + +
    + + + + + +
    + +

    This is very hard to follow. I admit, I’m not a newbie, but I am not terribly advanced, so I’m not sure where the problem is. One, I think you’re moving too fast. Two, when exactly should you first see the picture appear in the custom field? After you copy the url to the value field and update? or after you go into the home.php and change the code? That is completely unclear. This is such a great topic, so I hope if I post a few questions, it will become clear how to do this. I really do appreciate that you’ve posted this. Thanks!

    +

    Like

    +
    +
    +
  14. + +
  15. + + Karen + +
    + + + + + +
    + +

    also, my home.php doesn’t have the code you show. I am working in the theme Constructor. below is all the text in the home.php file:

    +

    Like

    +
    +
    +
  16. + +
  17. + + Karen + +
    + + + + + +
    + +

    Woopsie & sorry. below is the code in my file:

    +

    Like

    +
    +
    +
  18. + +
  19. + + Scott Ellis + +
    + + + + + +
    + +

    Consciência, you could use two custom fields for that if you wanted to. It would work just fine. I’ve read about some potential performance issues if you use a lot of custom fields but haven’t experienced it myself. We use several on citycrush.com including for the post thumbnail and the image in the post after you click through. The custom field type in the DB is “longtext” maximum size 4GB.

    +

    Driz – We used a plugin for that previously but moved to adding it to the functions.php and using wp hooks so it sounds like you are doing exactly what we would.

    +

    Thomas, most of what was in there was actual logic or just spitting out the html, not css styling, we keep all of that in the stylesheet. Glad you enjoyed the video.

    +

    Karen, sorry it felt fast, if you look at an example and watch the video I’m sure you’ll pick it up quickly. It took me a couple of rounds the first time I started playing with custom fields. Justin Tadlock has a good explanation here as well: http://justintadlock.com/archives/2007/10/24/using-wordpress-custom-fields-introduction.
    +The pictures will appear on the page where your custom field spits them out once you save the image url in the appropriate custom field. FYI, you code didn’t show up so visit http://www.vsellis.com/wordpress-how-to/using-custom-fields-in-wordpress/ and leave a comment and I’ll take a closer look.

    +

    Like

    +
    +
    +
  20. + +
  21. + + Consciência Planetária + +
    + + + + + +
    + +

    Thanks for the reply!

    +

    I’d like to suggest a subject for a future tutorial.

    +

    I love Drupal’s ability to use blocks above and below main content area. It is much easier to implement than WordPress widgets.

    +

    But I know it can be done in WordPress too. I’ve seen some magazine themes that have a “horizontal sidebar” on the botton of the page, and recently I’ve also seen a premium theme that has a “top horizontal sidebar” and a “bottom horizontal sidebar”, together with standard right and left ones.

    +

    It would be great if we had a tutorial teaching how to do it!

    +

    Like

    +
    +
    +
  22. + +
  23. + + PNaw10 + +
    + + + + + +
    + +

    Hello all, just wanted to add one extra tidbit of info.

    +

    The first time you use custom fields, the “name” field is blank, so yes, you would be typing in “thumbnail” as seen in the video. But after your very first use, the “name” field will appear as a pulldown menu which displays ALL previously-used names. So you really don’t have to worry about typing it the right way every single time — just as long as you get it right the first time, you can just select it from the menu. Much faster, and it ensures you’re spelling it the same way every time.

    +

    I realize everyone will discover this on their own as they try it, but thought I’d mention it in case anyone was daunted by the prospect of having to be extra-careful about typing out those case-sensitive field names every time.

    +

    Case-sensitive is definitely important though… for one website I run, cnyradio.com , I originally used Tadlock’s “Newspaperize” theme, which used the custom keywords “thumbnail” and “image.” Later, I upgraded to a newer theme of his, but the theme was designed to seek out “Thumbnail” and “Image” with capital letters at the beginning. Rather than go through all my old posts to change the custom keywords (would have taken forever) I just changed the uppercase letters to lowercase in the theme templates.

    +

    If you want a good example of how different custom fields can help with your site design, check out cnyradio.com. It’s not as complex as the site shown in the video, but it’s (hopefully) still simple enough for newbies (like I was just 2 years ago) to understand.

    +

    My “loop” pages (home page, category pages, etc.) show 128×96 images invoked by the “thumbnail” custom field. When you click to read the full text of any post, a larger 200×150 image appears, invoked by the “image” custom field. If either field is blank or missing, then the site simply doesn’t display an image — the text takes up the entire width of the space.

    +

    Yes, it’s more work because I have to create 2 custom fields for each post, and I create 2 separate images. I do the latter for two main reasons. One, I don’t like relying on web browsers to resize images on-the-fly. Even if it looks OK on my computer, it may appear choppy on someone else’s.

    +

    Two, and more importantly, an image at 200×150 doesn’t always look so good when you simply resize it to 128×96. For example, the “fullsize” version of any mugshots I use will often include the subject’s name and a “courtesy line” to credit the photo source. But that text would be cluttered and tiny when the size is reduced, so when I make the thumbnail, I usually delete the “courtesy” line and bump up the text size of the person’s last name so it’s less cluttered and easier to read.

    +

    If anyone reading this does look at my site to see what I’m talking about, just a note that any “Picture of the Week” posts are done entirely differently. I won’t get into details, just wanted to avoid any confusion.

    +

    Like

    +
    +
    +
  24. + +
  25. + + Sarfraz Ahmed + +
    + + + + + +
    + +

    can we add custom fields to wordpress.com blogs?

    +

    Like

    +
    +
    +
  26. + + +
  27. + + votar fotos + +
    + + + + + +
    + +

    I guess never say never, huh?

    +

    Like

    +
    +
    +
  28. + +
+
+

Continue the discussion

+ + +
+ +
+
+ +
+
+

Fill in your details below or click an icon to log in:

+ +
+ +
+
+
+ Gravatar +
+ +
+ +
+ +
+
+
+ +
+
+
+ +
+
+ +
+
+
+ WordPress.com Logo +
+ +
+ + + +

+ + You are commenting using your WordPress.com account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ +
+
+
+ Google photo +
+ +
+ + + +

+ + You are commenting using your Google account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ +
+
+
+ Twitter picture +
+ +
+ + + +

+ + You are commenting using your Twitter account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ +
+
+
+ Facebook photo +
+ +
+ + + +

+ + You are commenting using your Facebook account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ + +
+ +

Connecting to %s

+
+ +
+ + + +
+

+ + + + +

+ +

+ +

+
+
+
+ +
+
Published
+

August 29, 2009

+ +

Using custom fields can be confusing to new WordPress users. Scott Ellis provides an introductory explanation of how to use custom fields for image placement and the components that go into making custom fields work from front end placement to back end utilization and code.

+

Rate this:

+
Speakers

Scott Ellis 3

Tags

Custom Fields 23

Language

English 8849

Download
+
+MP4: Low, Med
OGG: Low
+
Subtitles
Subtitle this video → +
Producer
+ + + +
+
+ + +
+ + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + +
    + + + + + + diff --git a/tests/data/files/mvEregiReplaceFailure.html b/tests/data/files/mvEregiReplaceFailure.html new file mode 100644 index 00000000..d9a559d4 --- /dev/null +++ b/tests/data/files/mvEregiReplaceFailure.html @@ -0,0 +1,1117 @@ + + + + + + + + + + + + + + + + + + + + An Introduction to Custom Fields – WordPress.tv + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    +
    + + +

    An Introduction to Custom Fields

    + +
    + +
    + +
    +
    + +
    +
    +
    +
    +

    + 16 responses on “An Introduction to Custom Fields

    + +
      +
    1. + + Alex (Viper007Bond) + +
      + + + + + + +
      +
    2. + +
    3. + + Consciência Planetária + +
      + + + + + +
      + +

      Nice explanation!

      +

      I always wondered how these thumbnails are added to magazine layout themes and I had no idea! Indeed I was more interested on the PHP code of how making it work 😛

      +

      It seems that with custom fields we can do pratically any customization…
      +But I have 2 questions

      +

      Are custom fields limited to varchar values, or can we use any kind of data there? Does it have any size limit?

      +

      Ans what about these themes that have a stick post with a bigger image? Does it have 2 custom fields 1 for thumnail and 1 for stick image?

      +

      tnx again!

      +

      Like

      +
      +
      +
    4. + +
    5. + + Vid + +
      + + + + + +
      + +

      Hi, Scott

      +

      This is very helpful for those of us who aren’t programmers but want to maximize WordPress. Thanks for taking the time to submit this tip.

      +

      Like

      +
      +
      +
    6. + +
    7. + + Thomas Clausen + +
      + + + + + +
      + +

      Justin Tadlocks Get the Image plugin can also help you to solve this task (also without CSS stuff in the php file 😉 ).

      +

      Like

      +
      +
      +
    8. + +
    9. + + driz + +
      + + + + + +
      + +

      I would like to see a follow-up video explaining about that Custom Field Template that you told us to ignore. I know it’s created by coding in some WP hooks in functions.php as I have done it myself, but it would be cool to see your interpretation of doing this, might pick up some additional tips.

      +

      Like

      +
      +
      +
    10. + +
    11. + + Scott Ellis + +
      + + + + + +
      + +

      Thomas, that is a good plugin, Justin does great stuff but custom fileds can be used for a lot of things, images were just an easy example, hopefully viewers will find other creative uses. Not sure what you mean about all the css in the php?

      +

      Like

      +
      +
      +
    12. +
        +
      • + + Thomas Clausen + +
        + + + + + +
        + +

        I was just wondering about the styling, that you’ve got about 3 minutes into the video.

        +

        But I didn’t mean to take our eyes of off things. You’re absolutely right the custom fields can be a powerful tool. And I hadn’t really grasped how easy it is, before I saw your video. Good job.

        +

        Like

        +
        +
        +
      • + +
      + +
    13. + + Karen + +
      + + + + + +
      + +

      This is very hard to follow. I admit, I’m not a newbie, but I am not terribly advanced, so I’m not sure where the problem is. One, I think you’re moving too fast. Two, when exactly should you first see the picture appear in the custom field? After you copy the url to the value field and update? or after you go into the home.php and change the code? That is completely unclear. This is such a great topic, so I hope if I post a few questions, it will become clear how to do this. I really do appreciate that you’ve posted this. Thanks!

      +

      Like

      +
      +
      +
    14. + +
    15. + + Karen + +
      + + + + + +
      + +

      also, my home.php doesn’t have the code you show. I am working in the theme Constructor. below is all the text in the home.php file:

      +

      Like

      +
      +
      +
    16. + +
    17. + + Karen + +
      + + + + + +
      + +

      Woopsie & sorry. below is the code in my file:

      +

      Like

      +
      +
      +
    18. + +
    19. + + Scott Ellis + +
      + + + + + +
      + +

      Consciência, you could use two custom fields for that if you wanted to. It would work just fine. I’ve read about some potential performance issues if you use a lot of custom fields but haven’t experienced it myself. We use several on citycrush.com including for the post thumbnail and the image in the post after you click through. The custom field type in the DB is “longtext” maximum size 4GB.

      +

      Driz – We used a plugin for that previously but moved to adding it to the functions.php and using wp hooks so it sounds like you are doing exactly what we would.

      +

      Thomas, most of what was in there was actual logic or just spitting out the html, not css styling, we keep all of that in the stylesheet. Glad you enjoyed the video.

      +

      Karen, sorry it felt fast, if you look at an example and watch the video I’m sure you’ll pick it up quickly. It took me a couple of rounds the first time I started playing with custom fields. Justin Tadlock has a good explanation here as well: http://justintadlock.com/archives/2007/10/24/using-wordpress-custom-fields-introduction.
      +The pictures will appear on the page where your custom field spits them out once you save the image url in the appropriate custom field. FYI, you code didn’t show up so visit http://www.vsellis.com/wordpress-how-to/using-custom-fields-in-wordpress/ and leave a comment and I’ll take a closer look.

      +

      Like

      +
      +
      +
    20. + +
    21. + + Consciência Planetária + +
      + + + + + +
      + +

      Thanks for the reply!

      +

      I’d like to suggest a subject for a future tutorial.

      +

      I love Drupal’s ability to use blocks above and below main content area. It is much easier to implement than WordPress widgets.

      +

      But I know it can be done in WordPress too. I’ve seen some magazine themes that have a “horizontal sidebar” on the botton of the page, and recently I’ve also seen a premium theme that has a “top horizontal sidebar” and a “bottom horizontal sidebar”, together with standard right and left ones.

      +

      It would be great if we had a tutorial teaching how to do it!

      +

      Like

      +
      +
      +
    22. + +
    23. + + PNaw10 + +
      + + + + + +
      + +

      Hello all, just wanted to add one extra tidbit of info.

      +

      The first time you use custom fields, the “name” field is blank, so yes, you would be typing in “thumbnail” as seen in the video. But after your very first use, the “name” field will appear as a pulldown menu which displays ALL previously-used names. So you really don’t have to worry about typing it the right way every single time — just as long as you get it right the first time, you can just select it from the menu. Much faster, and it ensures you’re spelling it the same way every time.

      +

      I realize everyone will discover this on their own as they try it, but thought I’d mention it in case anyone was daunted by the prospect of having to be extra-careful about typing out those case-sensitive field names every time.

      +

      Case-sensitive is definitely important though… for one website I run, cnyradio.com , I originally used Tadlock’s “Newspaperize” theme, which used the custom keywords “thumbnail” and “image.” Later, I upgraded to a newer theme of his, but the theme was designed to seek out “Thumbnail” and “Image” with capital letters at the beginning. Rather than go through all my old posts to change the custom keywords (would have taken forever) I just changed the uppercase letters to lowercase in the theme templates.

      +

      If you want a good example of how different custom fields can help with your site design, check out cnyradio.com. It’s not as complex as the site shown in the video, but it’s (hopefully) still simple enough for newbies (like I was just 2 years ago) to understand.

      +

      My “loop” pages (home page, category pages, etc.) show 128×96 images invoked by the “thumbnail” custom field. When you click to read the full text of any post, a larger 200×150 image appears, invoked by the “image” custom field. If either field is blank or missing, then the site simply doesn’t display an image — the text takes up the entire width of the space.

      +

      Yes, it’s more work because I have to create 2 custom fields for each post, and I create 2 separate images. I do the latter for two main reasons. One, I don’t like relying on web browsers to resize images on-the-fly. Even if it looks OK on my computer, it may appear choppy on someone else’s.

      +

      Two, and more importantly, an image at 200×150 doesn’t always look so good when you simply resize it to 128×96. For example, the “fullsize” version of any mugshots I use will often include the subject’s name and a “courtesy line” to credit the photo source. But that text would be cluttered and tiny when the size is reduced, so when I make the thumbnail, I usually delete the “courtesy” line and bump up the text size of the person’s last name so it’s less cluttered and easier to read.

      +

      If anyone reading this does look at my site to see what I’m talking about, just a note that any “Picture of the Week” posts are done entirely differently. I won’t get into details, just wanted to avoid any confusion.

      +

      Like

      +
      +
      +
    24. + +
    25. + + Sarfraz Ahmed + +
      + + + + + +
      + +

      can we add custom fields to wordpress.com blogs?

      +

      Like

      +
      +
      +
    26. + + +
    27. + + votar fotos + +
      + + + + + +
      + +

      I guess never say never, huh?

      +

      Like

      +
      +
      +
    28. + +
    +
    +

    Continue the discussion

    + + +
    + +
    +
    + +
    +
    +

    Fill in your details below or click an icon to log in:

    + +
    + +
    +
    +
    + Gravatar +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    + +
    +
    + +
    +
    +
    + WordPress.com Logo +
    + +
    + + + +

    + + You are commenting using your WordPress.com account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + +
    +
    +
    + Google photo +
    + +
    + + + +

    + + You are commenting using your Google account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + +
    +
    +
    + Twitter picture +
    + +
    + + + +

    + + You are commenting using your Twitter account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + +
    +
    +
    + Facebook photo +
    + +
    + + + +

    + + You are commenting using your Facebook account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + + +
    + +

    Connecting to %s

    +
    + +
    + + + +
    +

    + + + + +

    + +

    + +

    +
    +
    +
    + +
    +
    Published
    +

    August 29, 2009

    + +

    Using custom fields can be confusing to new WordPress users. Scott Ellis provides an introductory explanation of how to use custom fields for image placement and the components that go into making custom fields work from front end placement to back end utilization and code.

    +

    Rate this:

    +
    Speakers

    Scott Ellis 3

    Tags

    Custom Fields 23

    Language

    English 8849

    Download
    +
    +MP4: Low, Med
    OGG: Low
    +
    Subtitles
    Subtitle this video → +
    Producer
    + + + +
    +
    + + +
    + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + +
      + + + + + + From c116346b2fcc038d4ab6615f4328346a4eebf121 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 19 Jul 2020 18:28:37 +0000 Subject: [PATCH 184/200] Updated documentation --- CHANGELOG.md | 4 +- README.md | 82 ++++++++++++++++++++++----------------- tests/Dom/CleanerTest.php | 2 +- 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05d2146f..25f862dc 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## 3.0.0 ### Added - Support for PSR7 HTTP clients and requests for URL calls has been added. @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - PHP-CS-Fixer added. - Support for html5 charset detection. - Added the ability to match both parent and children. +- Added character set conversion in load. ### Changed - Fixed issue with \ causing an infite loop. @@ -28,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Removed support for the depth first search option. - `findById()` method removed from Dom object. - Removed `load()` method in Dom object. +- Removed support for php 7.1. ## 2.2.0 diff --git a/README.md b/README.md index c46d9913..32853b91 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 2.2.1 +Version 3.0.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) @@ -18,7 +18,7 @@ Install the latest version using composer. 
$ composer require paquettg/php-html-parser ``` -This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, 7.3, and 7.4. +This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.2, 7.3, and 7.4. Usage ----- @@ -28,7 +28,7 @@ You can find many examples of how to use the dom parser and any of its parts (wh ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; $dom->loadStr('

      Hey bro, click here
      :)

      '); @@ -46,7 +46,7 @@ You may also seamlessly load a file into the dom instead of a string, which is m ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; $dom->loadFromFile('tests/data/big.html'); @@ -69,8 +69,6 @@ foreach ($contents as $content) This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of methods that a node has available. -Alternativly, you can always use the `load()` method to load the file. It will attempt to find the file using `file_exists` and, if successful, will call `loadFromFile()` for you. The same applies to a URL and `loadFromUrl()` method. - Loading Url ---------------- @@ -79,7 +77,7 @@ Loading a url is very similar to the way you would load the html from a file. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; $dom->loadFromUrl('http://google.com'); @@ -90,38 +88,36 @@ $dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // same result as the first example ``` -What makes the loadFromUrl method note worthy is the `PHPHtmlParser\CurlInterface` parameter, an optional second parameter. By default, we use the `PHPHtmlParser\Curl` class to get the contents of the url. On the other hand, though, you can inject your own implementation of CurlInterface and we will attempt to load the url using what ever tool/settings you want, up to you. +loadFromUrl will, by default, use an implementation of the `\Psr\Http\Client\ClientInterface` to do the HTTP request and a default implementation of `\Psr\Http\Message\RequestInterface` to create the body of the request. You can easely implement your own version of either the client or request to use a custom HTTP connection when using loadFromUrl. 
```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; -use App\Services\Connector; +use PHPHtmlParser\Dom; +use App\Services\MyClient; $dom = new Dom; -$dom->loadFromUrl('http://google.com', [], new Connector); +$dom->loadFromUrl('http://google.com', null, new MyClient()); $html = $dom->outerHtml; ``` -As long as the Connector object implements the `PHPHtmlParser\CurlInterface` interface properly it will use that object to get the content of the url instead of the default `PHPHtmlParser\Curl` class. +As long as the client object implements the interface properly it will use that object to get the content of the url. Loading Strings --------------- -Loading a string directly, with out the checks in `load()` is also easily done. +Loading a string directly is also easily done. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; -$dom->loadStr('String', []); +$dom->loadStr('String'); $html = $dom->outerHtml; ``` -If the string is to long, depending on your file system, the `load()` method will throw a warning. If this happens you can just call the above method to bypass the `is_file()` check in the `load()` method. - Options ------- @@ -130,21 +126,24 @@ You can also set parsing option that will effect the behavior of the parsing eng ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; $dom = new Dom; -$dom->setOptions([ - 'strict' => true, // Set a global option to enable strict html parsing. -]); +$dom->setOptions( + // this is set as the global option level. + (new Options()) + ->setStrict(true) +); -$dom->loadFromUrl('http://google.com', [ - 'whitespaceTextNode' => false, // Only applies to this load. 
-]); +$dom->loadFromUrl('http://google.com', + (new Options())->setWhitespaceTextNode(false) // only applies to this load. +); $dom->loadFromUrl('http://gmail.com'); // will not have whitespaceTextNode set to false. ``` -At the moment we support 8 options. +At the moment we support 12 options. **Strict** @@ -182,15 +181,17 @@ Set this to `false` if you want to preserve whitespace inside of text nodes. It Set this to `false` if you want to preserve smarty script found in the html content. It is set to `true` by default. -**depthFirstSearch** +**htmlSpecialCharsDecode** + +By default this is set to `false`. Setting this to `true` will apply the php function `htmlspecialchars_decode` too all attribute values and text nodes. -By default this is set to `false` for legacy support. Setting this to `true` will change the behavior of find to order elements by depth first. This will properly preserve the order of elements as they where in the HTML. +**selfClosing** -This option is depricated and will be removed in version `3.0.0` with the new behavior being as if it was set to `true`. +This option contains an array of all self closing tags. These tags must be self closing and the parser will force them to be so if you have strict turned on. You can update this list with any additional tags that can be used as a self closing tag when using strict. You can also remove tags from this array or clear it out completly. -**htmlSpecialCharsDecode** +**noSlash** -By default this is set to `false`. Setting this to `true` will apply the php function `htmlspecialchars_decode` too all attribute values and text nodes. +This option contains an array of all tags that can not be self closing. The list starts off as empty but you can add elements as you wish. Static Facade ------------- @@ -200,7 +201,7 @@ You can also mount a static facade for the Dom object. 
```PHP PHPHtmlParser\StaticDom::mount(); -Dom::load('tests/big.hmtl'); +Dom::loadFromFile('tests/big.hmtl'); $objects = Dom::find('.content-border'); ``` @@ -213,8 +214,10 @@ Modifying The Dom You can always modify the dom that was created from any loading method. To change the attribute of any node you can just call the `setAttribute` method. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); $a = $dom->find('a')[0]; $a->setAttribute('class', 'foo'); echo $a->getAttribute('class'); // "foo" @@ -223,8 +226,11 @@ echo $a->getAttribute('class'); // "foo" You may also get the `PHPHtmlParser\Dom\Tag` class directly and manipulate it as you see fit. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); +/** @var Dom\Node\AbstractNode $a */ $a = $dom->find('a')[0]; $tag = $a->getTag(); $tag->setAttribute('class', 'foo'); @@ -234,8 +240,11 @@ echo $a->getAttribute('class'); // "foo" It is also possible to remove a node from the tree. Simply call the `delete` method on any node to remove it from the tree. It is important to note that you should unset the node after removing it from the `DOM``, it will still take memory as long as it is not unset. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); +/** @var Dom\Node\AbstractNode $a */ $a = $dom->find('a')[0]; $a->delete(); unset($a); @@ -245,8 +254,11 @@ echo $dom; // '

      Hey bro,
      :)

      '); You can modify the text of `TextNode` objects easely. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); +/** @var Dom\Node\InnerNode $a */ $a = $dom->find('a')[0]; $a->firstChild()->setText('biz baz'); echo $dom; // '

      Hey bro, biz baz
      :)

      ' diff --git a/tests/Dom/CleanerTest.php b/tests/Dom/CleanerTest.php index 8473eaff..3ff32506 100644 --- a/tests/Dom/CleanerTest.php +++ b/tests/Dom/CleanerTest.php @@ -8,7 +8,7 @@ class CleanerTest extends TestCase { - public function testLoadByURL() + public function testCleanEregiFailureFile() { $cleaner = new Cleaner(); $string = $cleaner->clean(\file_get_contents('tests/data/files/mvEregiReplaceFailure.html'), new Options(), 'utf-8'); From 382b98c8de5c6b7d1fd59831e88ddcb82f4f6fbb Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:15:44 +0000 Subject: [PATCH 185/200] fixed #228 - Fixed documentation and added typew hint --- src/PHPHtmlParser/Dom/Node/AbstractNode.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Dom/Node/AbstractNode.php b/src/PHPHtmlParser/Dom/Node/AbstractNode.php index 254ca31f..a2c29274 100644 --- a/src/PHPHtmlParser/Dom/Node/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/Node/AbstractNode.php @@ -154,9 +154,9 @@ public function id(): int /** * Returns the parent of node. 
* - * @return AbstractNode + * @return InnerNode */ - public function getParent() + public function getParent(): ?InnerNode { return $this->parent; } From cf0bb680b39a8ad59444d54e67462527c4ba455a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:17:12 +0000 Subject: [PATCH 186/200] Small refactor - Added private methods for DTO --- .gitattributes | 1 + .../Selector/ParsedSelectorCollectionDTO.php | 19 +++++++++++--- .../DTO/Selector/ParsedSelectorDTO.php | 19 +++++++++++--- src/PHPHtmlParser/DTO/Selector/RuleDTO.php | 26 +++++++++++-------- src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 10 ++++++- src/PHPHtmlParser/DTO/TagDTO.php | 12 ++++++++- src/PHPHtmlParser/Dom/Parser.php | 22 +++++----------- src/PHPHtmlParser/Dom/Tag.php | 8 +++--- src/PHPHtmlParser/Selector/Parser.php | 22 ++++++++-------- tests/Selector/SeekerTest.php | 16 ++++++------ 10 files changed, 95 insertions(+), 60 deletions(-) diff --git a/.gitattributes b/.gitattributes index ebfea7c7..77b544ff 100755 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,5 @@ /tests export-ignore +/tests linguist-documentation /.scrutinizar.yml export-ignore /.travis.yml export-ignore /.gitignore export-ignore diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php index 128db514..870262cc 100644 --- a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php @@ -11,15 +11,26 @@ final class ParsedSelectorCollectionDTO */ private $parsedSelectorDTO = []; - public function __construct(array $values) + /** + * @param ParsedSelectorDTO[] $parsedSelectorDTOs + */ + private function __construct(array $parsedSelectorDTOs) { - foreach ($values as $value) { - if ($value instanceof ParsedSelectorDTO) { - $this->parsedSelectorDTO[] = $value; + foreach ($parsedSelectorDTOs as $parsedSelectorDTO) { + if ($parsedSelectorDTO instanceof 
ParsedSelectorDTO) { + $this->parsedSelectorDTO[] = $parsedSelectorDTO; } } } + /** + * @param ParsedSelectorDTO[] $parsedSelectorDTOs + */ + public static function makeCollection(array $parsedSelectorDTOs): ParsedSelectorCollectionDTO + { + return new ParsedSelectorCollectionDTO($parsedSelectorDTOs); + } + /** * @return ParsedSelectorDTO[] */ diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php index 5424e2a7..bce0721f 100644 --- a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php @@ -11,15 +11,26 @@ final class ParsedSelectorDTO */ private $rules = []; - public function __construct(array $values) + /** + * @param RuleDTO[] $ruleDTOs + */ + private function __construct(array $ruleDTOs) { - foreach ($values as $value) { - if ($value instanceof RuleDTO) { - $this->rules[] = $value; + foreach ($ruleDTOs as $ruleDTO) { + if ($ruleDTO instanceof RuleDTO) { + $this->rules[] = $ruleDTO; } } } + /** + * @param RuleDTO[] $ruleDTOs + */ + public static function makeFromRules(array $ruleDTOs): ParsedSelectorDTO + { + return new ParsedSelectorDTO($ruleDTOs); + } + /** * @return RuleDTO[] */ diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php index 1c336149..5299e3a0 100644 --- a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php @@ -36,7 +36,7 @@ final class RuleDTO */ private $alterNext; - public function __construct(array $values) + private function __construct(array $values) { $this->tag = $values['tag']; $this->operator = $values['operator']; @@ -47,16 +47,26 @@ public function __construct(array $values) } /** - * @return string + * @param string|array|null $key + * @param string|array|null $value */ + public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext): RuleDTO + { + return new RuleDTO([ + 
'tag' => $tag, + 'operator' => $operator, + 'key' => $key, + 'value' => $value, + 'noKey' => $noKey, + 'alterNext' => $alterNext, + ]); + } + public function getTag(): string { return $this->tag; } - /** - * @return string - */ public function getOperator(): string { return $this->operator; @@ -78,17 +88,11 @@ public function getValue() return $this->value; } - /** - * @return bool - */ public function isNoKey(): bool { return $this->noKey; } - /** - * @return bool - */ public function isAlterNext(): bool { return $this->alterNext; diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 6ac22197..3e7e1824 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -19,12 +19,20 @@ final class AttributeDTO */ private $doubleQuote; - public function __construct(array $values) + private function __construct(array $values) { $this->value = $values['value']; $this->doubleQuote = $values['doubleQuote'] ?? true; } + public static function makeFromPrimitives(?string $value, bool $doubleQuote = true): AttributeDTO + { + return new AttributeDTO([ + 'value' => $value, + 'doubleQuote' => $doubleQuote, + ]); + } + public function getValue(): ?string { return $this->value; diff --git a/src/PHPHtmlParser/DTO/TagDTO.php b/src/PHPHtmlParser/DTO/TagDTO.php index e9d182db..71f0ec1c 100644 --- a/src/PHPHtmlParser/DTO/TagDTO.php +++ b/src/PHPHtmlParser/DTO/TagDTO.php @@ -28,7 +28,7 @@ final class TagDTO */ private $tag; - public function __construct(array $values = []) + private function __construct(array $values = []) { $this->status = $values['status'] ?? false; $this->closing = $values['closing'] ?? false; @@ -36,6 +36,16 @@ public function __construct(array $values = []) $this->tag = $values['tag'] ?? 
null; } + public static function makeFromPrimitives(bool $status = false, bool $closing = false, ?HtmlNode $node = null, ?string $tag = null): TagDTO + { + return new TagDTO([ + 'status' => $status, + 'closing' => $closing, + 'node' => $node, + 'tag' => $tag, + ]); + } + public function isStatus(): bool { return $this->status; diff --git a/src/PHPHtmlParser/Dom/Parser.php b/src/PHPHtmlParser/Dom/Parser.php index 0d4573f4..418e535c 100644 --- a/src/PHPHtmlParser/Dom/Parser.php +++ b/src/PHPHtmlParser/Dom/Parser.php @@ -160,10 +160,9 @@ public function detectCharset(Options $options, string $defaultCharset, Abstract */ private function parseTag(Options $options, Content $content, int $size): TagDTO { - $return = []; if ($content->char() != '<') { // we are not at the beginning of a tag - return new TagDTO(); + return TagDTO::makeFromPrimitives(); } // check if this is a closing tag @@ -171,7 +170,7 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO $content->fastForward(1); } catch (ContentLengthException $exception) { // we are at the end of the file - return new TagDTO(); + return TagDTO::makeFromPrimitives(); } if ($content->char() == '/') { return $this->makeEndTag($content, $options); @@ -188,7 +187,7 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true)); if (\trim($tag) == '') { // no tag found, invalid < found - return new TagDTO(); + return TagDTO::makeFromPrimitives(); } } $node = new HtmlNode($tag); @@ -220,10 +219,7 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO $content->fastForward(1); } - $return['status'] = true; - $return['node'] = $node; - - return new TagDTO($return); + return TagDTO::makeFromPrimitives(true, false, $node); } /** @@ -249,7 +245,6 @@ private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool */ private function makeEndTag(Content $content, Options 
$options): TagDTO { - $return = []; $tag = $content->fastForward(1) ->copyByToken(StringToken::SLASH(), true); // move to end of tag @@ -259,15 +254,10 @@ private function makeEndTag(Content $content, Options $options): TagDTO // check if this closing tag counts $tag = \strtolower($tag); if (\in_array($tag, $options->getSelfClosing(), true)) { - $return['status'] = true; - - return new TagDTO($return); + return TagDTO::makeFromPrimitives(true); } - $return['status'] = true; - $return['closing'] = true; - $return['tag'] = \strtolower($tag); - return new TagDTO($return); + return TagDTO::makeFromPrimitives(true, true, null, \strtolower($tag)); } /** diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index be974c1e..29b68bf7 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -163,10 +163,10 @@ public function noise(string $noise): Tag */ public function setAttribute(string $key, ?string $attributeValue, bool $doubleQuote = true): Tag { - $attributeDTO = new AttributeDTO([ - 'value' => $attributeValue, - 'doubleQuote' => $doubleQuote, - ]); + $attributeDTO = AttributeDTO::makeFromPrimitives( + $attributeValue, + $doubleQuote + ); if ($this->HtmlSpecialCharsDecode) { $attributeDTO->htmlspecialcharsDecode(); } diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index a70a7a5e..4643c467 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -92,25 +92,25 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD $noKey = true; } - $rules[] = new RuleDTO([ - 'tag' => $tag, - 'key' => $key, - 'value' => $value, - 'operator' => $operator, - 'noKey' => $noKey, - 'alterNext' => $alterNext, - ]); + $rules[] = RuleDTO::makeFromPrimitives( + $tag, + $operator, + $key, + $value, + $noKey, + $alterNext + ); if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') { - $selectors[] = new 
ParsedSelectorDTO($rules); + $selectors[] = ParsedSelectorDTO::makeFromRules($rules); $rules = []; } } // save last results if (\count($rules) > 0) { - $selectors[] = new ParsedSelectorDTO($rules); + $selectors[] = ParsedSelectorDTO::makeFromRules($rules); } - return new ParsedSelectorCollectionDTO($selectors); + return ParsedSelectorCollectionDTO::makeCollection($selectors); } } diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index a5106e98..d9e0e824 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -10,14 +10,14 @@ class SeekerTest extends TestCase { public function testSeekReturnEmptyArray() { - $ruleDTO = new RuleDTO([ - 'tag' => 'tag', - 'key' => 1, - 'value' => null, - 'operator' => null, - 'noKey' => false, - 'alterNext' => false, - ]); + $ruleDTO = RuleDTO::makeFromPrimitives( + 'tag', + '=', + null, + null, + false, + false + ); $seeker = new Seeker(); $results = $seeker->seek([], $ruleDTO, []); $this->assertCount(0, $results); From 77a7eb18f003dba0b0d82f969e93f2896947121e Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:30:20 +0000 Subject: [PATCH 187/200] fixed #229 - Added documentation to reflect read only property --- .php_cs.dist | 3 ++- src/PHPHtmlParser/Dom/Node/AbstractNode.php | 13 ++++++------- src/PHPHtmlParser/Dom/Node/ArrayNode.php | 8 ++++++++ src/PHPHtmlParser/Dom/Node/HtmlNode.php | 7 +++++++ src/PHPHtmlParser/Dom/Node/InnerNode.php | 8 ++++++++ src/PHPHtmlParser/Dom/Node/LeafNode.php | 9 +++++++++ src/PHPHtmlParser/Dom/Node/TextNode.php | 7 +++++++ tests/Node/HtmlTest.php | 2 +- 8 files changed, 48 insertions(+), 9 deletions(-) diff --git a/.php_cs.dist b/.php_cs.dist index 56af284d..2ead7195 100644 --- a/.php_cs.dist +++ b/.php_cs.dist @@ -90,6 +90,7 @@ return PhpCsFixer\Config::create() 'method', 'param', 'property', + 'property-read', 'return', 'throws', 'type', @@ -100,7 +101,7 @@ return PhpCsFixer\Config::create() 'phpdoc_indent' => true, 
'phpdoc_inline_tag' => true, 'phpdoc_no_access' => true, - 'phpdoc_no_alias_tag' => true, + 'phpdoc_no_alias_tag' => false, 'phpdoc_no_package' => true, 'phpdoc_no_useless_inheritdoc' => true, 'phpdoc_order' => true, diff --git a/src/PHPHtmlParser/Dom/Node/AbstractNode.php b/src/PHPHtmlParser/Dom/Node/AbstractNode.php index a2c29274..897445b0 100644 --- a/src/PHPHtmlParser/Dom/Node/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/Node/AbstractNode.php @@ -17,13 +17,12 @@ /** * Dom node object. * - * @property string $outerhtml - * @property string $innerhtml - * @property string $text - * @property int $prev - * @property int $next - * @property Tag $tag - * @property InnerNode $parent + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class AbstractNode { diff --git a/src/PHPHtmlParser/Dom/Node/ArrayNode.php b/src/PHPHtmlParser/Dom/Node/ArrayNode.php index fb8ed4c2..87e8bd51 100644 --- a/src/PHPHtmlParser/Dom/Node/ArrayNode.php +++ b/src/PHPHtmlParser/Dom/Node/ArrayNode.php @@ -7,10 +7,18 @@ use ArrayIterator; use Countable; use IteratorAggregate; +use PHPHtmlParser\Dom\Tag; /** * Dom node object which will allow users to use it as * an array. + * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class ArrayNode extends AbstractNode implements IteratorAggregate, Countable { diff --git a/src/PHPHtmlParser/Dom/Node/HtmlNode.php b/src/PHPHtmlParser/Dom/Node/HtmlNode.php index 0d78b8ff..2acb2592 100644 --- a/src/PHPHtmlParser/Dom/Node/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/Node/HtmlNode.php @@ -10,6 +10,13 @@ /** * Class HtmlNode. 
+ * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ class HtmlNode extends InnerNode { diff --git a/src/PHPHtmlParser/Dom/Node/InnerNode.php b/src/PHPHtmlParser/Dom/Node/InnerNode.php index 911e10a0..448057a7 100644 --- a/src/PHPHtmlParser/Dom/Node/InnerNode.php +++ b/src/PHPHtmlParser/Dom/Node/InnerNode.php @@ -4,6 +4,7 @@ namespace PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom\Tag; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\LogicalException; @@ -11,6 +12,13 @@ /** * Inner node of the html tree, might have children. + * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class InnerNode extends ArrayNode { diff --git a/src/PHPHtmlParser/Dom/Node/LeafNode.php b/src/PHPHtmlParser/Dom/Node/LeafNode.php index 7a2a7386..f74414a0 100644 --- a/src/PHPHtmlParser/Dom/Node/LeafNode.php +++ b/src/PHPHtmlParser/Dom/Node/LeafNode.php @@ -4,8 +4,17 @@ namespace PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom\Tag; + /** * Class LeafNode. + * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class LeafNode extends AbstractNode { diff --git a/src/PHPHtmlParser/Dom/Node/TextNode.php b/src/PHPHtmlParser/Dom/Node/TextNode.php index a5bd934e..1c8b646c 100644 --- a/src/PHPHtmlParser/Dom/Node/TextNode.php +++ b/src/PHPHtmlParser/Dom/Node/TextNode.php @@ -9,6 +9,13 @@ /** * Class TextNode. 
+ * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ class TextNode extends LeafNode { diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index 153f7da5..592003b4 100755 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -334,7 +334,7 @@ public function testInnerText() $node->addChild($anode); $node->addChild($span_node); - $this->assertEquals($node->innerText(), '123 456789 101112'); + $this->assertEquals($node->innerText, '123 456789 101112'); } public function testTextLookInChildrenAndNoChildren() From 93ec62003ad6ebbe3136fa897e346aad7b377a0d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:59:19 +0000 Subject: [PATCH 188/200] Version 3.0.1 --- CHANGELOG.md | 6 ++++++ README.md | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25f862dc..ef3240dd 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.0.1 + +### Changed +- Updated all DTOs to make them immutable. +- Updated documentation. 
+ ## 3.0.0 ### Added diff --git a/README.md b/README.md index 32853b91..c194c8be 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 3.0.0 +Version 3.0.1 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) From 2db46371c180c504a2d16e44e4800455f8a5c801 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Thu, 27 Aug 2020 10:05:23 -0400 Subject: [PATCH 189/200] Create FUNDING.yml --- .github/FUNDING.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..8fe59770 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +# github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +# patreon: # Replace with a single Patreon username +# open_collective: # Replace with a single Open Collective username +# ko_fi: # Replace with a single Ko-fi username +tidelift: "packagist/paquettg/php-html-parser" +# community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +# liberapay: # Replace with a single Liberapay username +# issuehunt: # Replace with a single IssueHunt username +# otechie: # Replace with a single Otechie username +# custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] From 590a1b7ea822f31f2739277ef4402e88714ba61a Mon Sep 17 00:00:00 2001 From: Raphael Krut-Landau Date: Wed, 16 Sep 2020 11:31:25 -0400 Subject: [PATCH 190/200] Fixed a few stray typos --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c194c8be..26f6d20b 100755 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ This 
package can be found on [packagist](https://packagist.org/packages/paquettg Usage ----- -You can find many examples of how to use the dom parser and any of its parts (which you will most likely never touch) in the tests directory. The tests are done using PHPUnit and are very small, a few lines each, and are a great place to start. Given that, I'll still be showing a few examples of how the package should be used. The following example is a very simplistic usage of the package. +You can find many examples of how to use the DOM parser and any of its parts (which you will most likely never touch) in the tests directory. The tests are done using PHPUnit and are very small, a few lines each, and are a great place to start. Given that, I'll still be showing a few examples of how the package should be used. The following example is a very simplistic usage of the package. ```php // Assuming you installed from Composer: @@ -36,12 +36,12 @@ $a = $dom->find('a')[0]; echo $a->text; // "click here" ``` -The above will output "click here". Simple no? There are many ways to get the same result from the dome, such as `$dom->getElementsbyTag('a')[0]` or `$dom->find('a', 0)` which can all be found in the tests or in the code itself. +The above will output "click here". Simple, no? There are many ways to get the same result from the DOM, such as `$dom->getElementsbyTag('a')[0]` or `$dom->find('a', 0)`, which can all be found in the tests or in the code itself. Loading Files ------------------ -You may also seamlessly load a file into the dom instead of a string, which is much more convenient and is how I except most developers will be loading the html. The following example is taken from our test and uses the "big.html" file found there. +You may also seamlessly load a file into the DOM instead of a string, which is much more convenient and is how I expect most developers will be loading the HTML. The following example is taken from our test and uses the "big.html" file found there. 
```php // Assuming you installed from Composer: @@ -67,12 +67,12 @@ foreach ($contents as $content) } ``` -This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of methods that a node has available. +This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of the methods that a node has available. -Loading Url +Loading URLs ---------------- -Loading a url is very similar to the way you would load the html from a file. +Loading a URL is very similar to the way you would load the HTML from a file. ```php // Assuming you installed from Composer: @@ -88,7 +88,7 @@ $dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // same result as the first example ``` -loadFromUrl will, by default, use an implementation of the `\Psr\Http\Client\ClientInterface` to do the HTTP request and a default implementation of `\Psr\Http\Message\RequestInterface` to create the body of the request. You can easely implement your own version of either the client or request to use a custom HTTP connection when using loadFromUrl. +loadFromUrl will, by default, use an implementation of the `\Psr\Http\Client\ClientInterface` to do the HTTP request and a default implementation of `\Psr\Http\Message\RequestInterface` to create the body of the request. You can easily implement your own version of either the client or request to use a custom HTTP connection when using loadFromUrl. ```php // Assuming you installed from Composer: @@ -101,7 +101,7 @@ $dom->loadFromUrl('http://google.com', null, new MyClient()); $html = $dom->outerHtml; ``` -As long as the client object implements the interface properly it will use that object to get the content of the url. 
+As long as the client object implements the interface properly, it will use that object to get the content of the url. Loading Strings --------------- From 7d0468794e6f13874ecf134316ab39eb172d3455 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:17:01 +0000 Subject: [PATCH 191/200] Fixed #235 - Removed guzzle 6 and added guzzle 7 support --- composer.json | 3 +-- src/PHPHtmlParser/Dom.php | 2 +- src/PHPHtmlParser/StaticDom.php | 2 +- tests/DomTest.php | 7 +++++++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/composer.json b/composer.json index 5549a5ee..f8ca8450 100755 --- a/composer.json +++ b/composer.json @@ -19,14 +19,13 @@ "ext-curl": "*", "paquettg/string-encode": "~1.0.0", "php-http/httplug": "^2.1", - "php-http/guzzle6-adapter": "^2.0", + "guzzlehttp/guzzle": "^7.0", "guzzlehttp/psr7": "^1.6", "myclabs/php-enum": "^1.7" }, "require-dev": { "phpunit/phpunit": "^7.5.1", "mockery/mockery": "^1.2", - "php-coveralls/php-coveralls": "^2.1", "infection/infection": "^0.13.4", "phan/phan": "^2.4", "friendsofphp/php-cs-fixer": "^2.16" diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 0b3a0730..816b1b3b 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -5,7 +5,7 @@ namespace PHPHtmlParser; use GuzzleHttp\Psr7\Request; -use Http\Adapter\Guzzle6\Client; +use GuzzleHttp\Client; use PHPHtmlParser\Contracts\Dom\CleanerInterface; use PHPHtmlParser\Contracts\Dom\ParserInterface; use PHPHtmlParser\Contracts\DomInterface; diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index 95d01073..78950204 100755 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -5,7 +5,7 @@ namespace PHPHtmlParser; use GuzzleHttp\Psr7\Request; -use Http\Adapter\Guzzle6\Client; +use GuzzleHttp\Client; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\NotLoadedException; diff --git 
a/tests/DomTest.php b/tests/DomTest.php index 9fbb1529..519d5594 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -525,4 +525,11 @@ public function testRandomTagInMiddleOfText() $this->assertEquals('

      Hello, this is just a test in which <55 names with some other text> should be interpreted as text

      ', $dom->outerHtml); } + + public function testHttpCall() + { + $dom = new Dom(); + $dom->loadFromUrl('http://google.com'); + $this->assertNotEmpty($dom->outerHtml); + } } From b523b1d785f65e414a4bf865b776c0d3dcacba5d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:17:40 +0000 Subject: [PATCH 192/200] Added tifelift verbage --- CHANGELOG.md | 14 ++++++++++++++ README.md | 11 +++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef3240dd..beb5ec6f 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.1.0 + +- Updated to include Tidelift subscription option. +- Removed php-coverall. +- Removed Guzzle 6 Adapter. +- Added support for Guzzle 7. + ## 3.0.1 ### Changed @@ -37,6 +44,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Removed `load()` method in Dom object. - Removed support for php 7.1. +## 2.2.1 + +### Added +- Added php_cs. +- Added support for PSR7 requests. +- Added the attribute type dto. + ## 2.2.0 ### Added diff --git a/README.md b/README.md index c194c8be..b8f4b7ae 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 3.0.1 +Version 3.0.2 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) @@ -20,7 +20,7 @@ $ composer require paquettg/php-html-parser This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). 
We support php 7.2, 7.3, and 7.4. -Usage +Basic Usage ----- You can find many examples of how to use the dom parser and any of its parts (which you will most likely never touch) in the tests directory. The tests are done using PHPUnit and are very small, a few lines each, and are a great place to start. Given that, I'll still be showing a few examples of how the package should be used. The following example is a very simplistic usage of the package. @@ -38,6 +38,13 @@ echo $a->text; // "click here" The above will output "click here". Simple no? There are many ways to get the same result from the dome, such as `$dom->getElementsbyTag('a')[0]` or `$dom->find('a', 0)` which can all be found in the tests or in the code itself. +Support PHP Html Parser Financially +-------------- + +Get supported Monolog and help fund the project with the [Tidelift Subscription](https://tidelift.com/subscription/pkg/packagist-paquettg-php-html-parser?utm_source=packagist-paquettg-php-html-parser&utm_medium=referral&utm_campaign=enterprise). + +Tidelift delivers commercial support and maintenance for the open source dependencies you use to build your applications. Save time, reduce risk, and improve code health, while paying the maintainers of the exact dependencies you use. 
+ Loading Files ------------------ From b200ae9894af3ba309115c120799f3f273d3e2aa Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:27:31 +0000 Subject: [PATCH 193/200] Updated readme versions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b30a95d..c89f3bdf 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 3.0.2 +Version 3.1.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) From f5c2dd9b8abd03cfe9383efe2575c7ff0d9711ea Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:35:11 +0000 Subject: [PATCH 194/200] Create SECURITY.md --- SECURITY.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..3fc4dfcc --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +## Supported Versions + +We only support the most recent version with security fixes. + +## Reporting a Vulnerability + +If you have found any issues that might have security implications, please refer to https://tidelift.com/security + +Do not report security reports publicly. From 9851d9875721109d0f5ee07f389f673500670a16 Mon Sep 17 00:00:00 2001 From: RajaTaimur7 <72160749+RajaTaimur7@users.noreply.github.com> Date: Sat, 3 Oct 2020 00:23:08 +0500 Subject: [PATCH 195/200] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c89f3bdf..2d30b978 100755 --- a/README.md +++ b/README.md @@ -258,7 +258,7 @@ unset($a); echo $dom; // '

      Hey bro,
      :)

      '); ``` -You can modify the text of `TextNode` objects easely. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. +You can modify the text of `TextNode` objects easily. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. ```php use PHPHtmlParser\Dom; From 5572180311dc69ba195e7e6945570a73395ef9c9 Mon Sep 17 00:00:00 2001 From: Leon Kessler Date: Mon, 26 Oct 2020 15:55:22 +0000 Subject: [PATCH 196/200] Fixes #247 numbers in comments can cause php fatal errors. --- src/PHPHtmlParser/Dom/Tag.php | 2 ++ tests/Node/TextTest.php | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 29b68bf7..2aeb6aa8 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -329,6 +329,8 @@ public function makeOpeningTag() } catch (AttributeNotFoundException $e) { // attribute that was in the array not found in the array... let's continue. 
continue; + } catch (\TypeError $e) { + $val = null; } $val = $attributeDTO->getValue(); if (\is_null($val)) { diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 44298fc9..ce7f0f59 100755 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -4,6 +4,7 @@ use PHPHtmlParser\Dom; use PHPHtmlParser\Dom\Node\TextNode; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; use stringEncode\Encode; @@ -74,4 +75,14 @@ public function testSetTextEncoded() $node->setText('biz baz'); $this->assertEquals('biz baz', $node->text()); } + + public function testCommentWithNumbers() { + $dom = new Dom; + $options = new Options(); + $options->setCleanupInput(false); + $dom->setOptions($options); + $dom->loadStr(''); + $output = $dom->outerHtml; + $this->assertContains('', $output); + } } From 81341e1cfb9ce843ce50bd9b3715733ec5e5abfb Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 19:45:49 +0000 Subject: [PATCH 197/200] Updated change log to reflect fix --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index beb5ec6f..eafc6357 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.1.1 + +### Changed +- Fixed issue with numbers in comments + ## 3.1.0 +### Changed - Updated to include Tidelift subscription option. - Removed php-coverall. - Removed Guzzle 6 Adapter. From ec1bc10b6acfa69cff1a7259a5abd051ad7f42d4 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 19:50:43 +0000 Subject: [PATCH 198/200] fixed #246 Fixed php version dependency. 
--- CHANGELOG.md | 3 ++- composer.json | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eafc6357..065e5b36 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 3.1.1 ### Changed -- Fixed issue with numbers in comments +- Fixed issue with numbers in comments. +- Updated minimume php version to correct version. ## 3.1.0 diff --git a/composer.json b/composer.json index f8ca8450..166886f7 100755 --- a/composer.json +++ b/composer.json @@ -13,7 +13,7 @@ } ], "require": { - "php": ">=7.1", + "php": ">=7.2", "ext-mbstring": "*", "ext-zlib": "*", "ext-curl": "*", From 40c335b512969bbfeb819771eabd130a40170338 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 20:15:40 +0000 Subject: [PATCH 199/200] fix #233 - Made comments self-closing --- CHANGELOG.md | 1 + src/PHPHtmlParser/Content.php | 16 ++++++++++++ src/PHPHtmlParser/Dom/Parser.php | 8 ++++++ src/PHPHtmlParser/Enum/StringToken.php | 2 ++ tests/Dom/CommentTest.php | 34 ++++++++++++++++++++++++++ tests/Node/TextTest.php | 10 -------- 6 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 tests/Dom/CommentTest.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 065e5b36..3fbf0bb4 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Fixed issue with numbers in comments. - Updated minimume php version to correct version. +- Comment tags are now self-closing when cleanup input is set to false. ## 3.1.0 diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 888a6039..f1332175 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -72,6 +72,22 @@ public function char(?int $char = null): string return $this->content[$char ?? $this->pos] ?? 
''; } + /** + * Gets a string from the current character position. + * + * @param int $length + * @return string + */ + public function string(int $length = 1): string + { + $string = ''; + $position = $this->pos; + do { + $string .= $this->char($position++); + } while ($position < $this->pos + $length); + return $string; + } + /** * Moves the current position forward. * diff --git a/src/PHPHtmlParser/Dom/Parser.php b/src/PHPHtmlParser/Dom/Parser.php index 418e535c..7ed310cb 100644 --- a/src/PHPHtmlParser/Dom/Parser.php +++ b/src/PHPHtmlParser/Dom/Parser.php @@ -183,6 +183,14 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO ->setOpening('setClosing(' ?>') ->selfClosing(); + } elseif($content->string(3) == '!--') { + // comment tag + $tag = $content->fastForward(3) + ->copyByToken(StringToken::CLOSECOMMENT(), true); + $tag = (new Tag($tag)) + ->setOpening('') + ->selfClosing(); } else { $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true)); if (\trim($tag) == '') { diff --git a/src/PHPHtmlParser/Enum/StringToken.php b/src/PHPHtmlParser/Enum/StringToken.php index 6b60d520..7a209e00 100644 --- a/src/PHPHtmlParser/Enum/StringToken.php +++ b/src/PHPHtmlParser/Enum/StringToken.php @@ -11,6 +11,7 @@ * @method static StringToken EQUAL() * @method static StringToken SLASH() * @method static StringToken ATTR() + * @method static StringToken CLOSECOMMENT() */ class StringToken extends Enum { @@ -18,4 +19,5 @@ class StringToken extends Enum private const EQUAL = ' =/>'; private const SLASH = " />\r\n\t"; private const ATTR = ' >'; + private const CLOSECOMMENT = '-->'; } diff --git a/tests/Dom/CommentTest.php b/tests/Dom/CommentTest.php new file mode 100644 index 00000000..3f10696e --- /dev/null +++ b/tests/Dom/CommentTest.php @@ -0,0 +1,34 @@ +setCleanupInput(false); + $dom->loadStr('', $options); + $this->dom = $dom; + } + + public function tearDown() + { + Mockery::close(); + } + + public function testLoadCommentInnerHtml() 
+ { + $this->assertEquals('', $this->dom->innerHtml); + } +} diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index ce7f0f59..f94c4962 100755 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -75,14 +75,4 @@ public function testSetTextEncoded() $node->setText('biz baz'); $this->assertEquals('biz baz', $node->text()); } - - public function testCommentWithNumbers() { - $dom = new Dom; - $options = new Options(); - $options->setCleanupInput(false); - $dom->setOptions($options); - $dom->loadStr(''); - $output = $dom->outerHtml; - $this->assertContains('', $output); - } } From 7c05e4192a918cb72902499d275a5b9fa7779d7e Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 20:33:18 +0000 Subject: [PATCH 200/200] Removed version number from readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 2d30b978..6889b079 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ PHP Html Parser ========================== -Version 3.1.0 - [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/paquettg/php-html-parser/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/paquettg/php-html-parser/?branch=master)