/ Gists / PHP-PHPDOM

Gists - PHP-PHPDOM

On gists

Parser - PsiDetektiv

PHP-PHPDOM

parser.php #

<?php

// Scrapes the "lost animals" listing of psidetektiv.cz and dumps one record
// per catalog item (link, name, breed, place of loss, region, sex, colour,
// size, image URL).

// Collect libxml parse problems internally instead of emitting PHP warnings.
libxml_use_internal_errors(true);

$sourceUrl = 'https://www.psidetektiv.cz/ztracena-zvirata/';

// Defined up front so the final dump never touches an undefined variable,
// whichever branch below runs.
$savedItems = [];

$data = file_get_contents($sourceUrl);

if ($data === false || $data === '') {
    // loadHTML() cannot work with a failed/empty download, so stop here.
    echo "Download failed: {$sourceUrl}\n";
} else {
    // Load HTML into DOMDocument
    $dom = new DOMDocument();
    $dom->loadHTML($data, LIBXML_NOERROR | LIBXML_NOWARNING);
    $finder = new DOMXPath($dom);

    /**
     * Returns the trimmed text node that follows a label <span> inside a
     * catalog item, or '' when the label (or the text after it) is missing.
     *
     * @param DOMNode $item           Catalog item used as the query context.
     * @param string  $containerClass Class fragment of the wrapping <div> ('name' or 'line').
     * @param string  $label          Label text, e.g. 'Plemeno:'. Must not contain an apostrophe,
     *                                because it is embedded in a single-quoted XPath literal.
     *
     * @return string
     */
    $readLabeledValue = function (DOMNode $item, $containerClass, $label) use ($finder) {
        $expression = ".//div[contains(@class, '{$containerClass}')]"
            . "/span[contains(@class, 'label') and contains(text(), '{$label}')]"
            . "/following::text()[1]";

        $matches = $finder->query($expression, $item);
        $textNode = $matches !== false ? $matches->item(0) : null;

        return $textNode !== null ? trim($textNode->nodeValue) : '';
    };

    $wrapper = $finder->query("//div[@id='category-list']");

    if ($wrapper->length > 0) {
        $catalogItems = $finder->query(".//div[contains(@class, 'catalog-item')]", $wrapper->item(0));

        foreach ($catalogItems as $item) {
            // Detail link; stays '' when the item has no matching anchor.
            $linkElement = $finder->query(".//a[contains(@href, '/zvire/')]", $item)->item(0);
            $link = $linkElement ? $linkElement->getAttribute('href') : '';

            // The picture is only available as a CSS background on a <span>.
            $bgImageElement = $finder->query(".//span[@class='bg-image']", $item)->item(0);
            $bgImageStyle = $bgImageElement ? $bgImageElement->getAttribute('style') : '';

            // Extract image URL from the inline style
            $imageUrl = '';
            if (preg_match('/background-image:url\((.*?)\)/', $bgImageStyle, $matches) === 1) {
                $imageUrl = $matches[1];
            }

            $savedItems[] = [
                'odkaz' => $link,
                'jmeno' => $readLabeledValue($item, 'name', 'Jméno:'),
                'plemeno' => $readLabeledValue($item, 'line', 'Plemeno:'),
                'misto_ztraty' => $readLabeledValue($item, 'line', 'Místo ztráty:'),
                'kraj' => $readLabeledValue($item, 'line', 'Kraj:'),
                'pohlavi' => $readLabeledValue($item, 'line', 'Pohlaví:'),
                'barva' => $readLabeledValue($item, 'line', 'Barva:'),
                'velikost' => $readLabeledValue($item, 'line', 'Velikost:'),
                'obrazek' => $imageUrl,
            ];
        }
    } else {
        echo "No elements found\n";
        foreach (libxml_get_errors() as $error) {
            echo "Line {$error->line}: {$error->message}\n";
        }
    }
}

// The dump contains text scraped from a third-party page, so escape it before
// placing it inside HTML.
echo "<pre>";
echo htmlspecialchars(print_r($savedItems, true), ENT_QUOTES, 'UTF-8');
echo "</pre>";




On gists

PHP DOM - saveHtml (solution without entities and html,body tags)

PHP PHP-PHPDOM

dom.php #

<?php
 
// Round-trips an HTML fragment through DOMDocument and prints it back without
// the <html>/<body> wrapper that libxml adds around it.
$string = '<div>ěščřžýáíé</div><p>testik</p> <p>testik2</p>';

// Wrapper markup that surrounds the serialized fragment and must be cut off.
$openingWrapper = '<html><body>';
$closingWrapper = '</body></html>';

$dom = new \DOMDocument('1.0', 'UTF-8');

// Non-ASCII characters are passed to loadHTML() as numeric entities so they
// survive parsing regardless of the charset libxml assumes.
// mb_encode_numericentity() replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...),
// which is deprecated as of PHP 8.2.
$dom->loadHTML(
    '<body>' . mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8') . '</body>'
);

// make changes to $dom here

// Serialize the root element and strip the wrapper by its known length
// (12 leading and 14 trailing characters, as in the original offsets).
echo substr(
    $dom->saveHTML($dom->documentElement),
    strlen($openingWrapper),
    -strlen($closingWrapper)
);

On gists

XPath Cheatsheet --> https://github.com/LeCoupa/awesome-cheatsheets

Cheatsheets PHP-PHPDOM

xpath-cheatsheet.js #

// XPath CheatSheet
// To test XPath in your Chrome Debugger: $x('/html/body')
// http://www.jittuu.com/2012/2/14/Testing-XPath-In-Chrome/


// 0. XPath Examples.
// More: http://xpath.alephzarro.com/content/cheatsheet.html


'//hr[@class="edge" and position()=1]'                // every first hr of 'edge' class
'//table[count(tr)=1 and count(tr/td)=2]'             // all tables with 1 row and 2 cols
'//div/form/parent::*'                                // all divs that have form
'./div/b'                                             // a relative path
'//table[parent::div[@class="pad"] and not(@id)]//a'  // any anchor in a table without id, contained in a div of "pad" class
'/html/body/div/*[preceding-sibling::h4]'             // give me whatever after h4
'//tr/td[font[@class="head" and text()="TRACK"]]'     // all td that has font of a "head" class and text "TRACK"
'./table/tr[last()]'                                  // the last row of a table
'//rdf:Seq/rdf:li/em:id'                              // using namespaces
'//a/@href'                                           // hrefs of all anchors
'//*[count(*)=3]'                                     // all nodes with 3 children
'//var|//acronym'                                     // all vars and acronyms


// 1. General.


'/html'                     // whole web page (css: html)
'/html/body'                // whole web page body (css: body)
'//text()'                  // all text nodes of web page
'/html/body/.../.../.../E'  // element <E> by absolute reference (css: body > … > … > … > E)


// 2. Tag.


'//E'                                        // element <E> by relative reference (css: E)
'(//E)[2]'                                   // second <E> element anywhere on page
'//img'                                      // image element (css: img)
'//E[@A]'                                    // element <E> with attribute A (css: E[A])
'//E[@A="t"]'                                // element <E> with attribute A containing text 't' exactly (css: E[A='t'])
'//E[contains(@A,"t")]'                      // element <E> with attribute A containing text 't' (css: E[A*='t'])
'//E[starts-with(@A, "t")]'                  // element <E> whose attribute A begins with 't' (css: E[A^='t'])
'//E[ends-with(@A, "t")]'                    // element <E> whose attribute A ends with 't' (css: E[A$='t']) - XPath 2.0 only
'//E[contains(concat(" ", @A, " "), " w ")]' // element <E> with attribute A containing word 'w' (css: E[A~='w'])
'//E[matches(@A, "r")]'                      // element <E> with attribute A matching regex 'r' - XPath 2.0 only
'//E1[@id=I1] | //E2[@id=I2]'                // element <E1> with id I1 or element <E2> with id I2 (css: E1#I1, E2#I2)
'//E1[@id=I1 or @id=I2]'                     // element <E1> with id I1 or id I2 (css: E1#I1, E1#I2)


// 3. Attribute.


'//E/@A'                    // attribute A of element <E> (css: E@A)
'//*/@A'                    // attribute A of any element (css: *@A)
'//E[@A2="t"]/@A1'          // attribute A1 of element <E> where attribute A2 is 't' exactly (css: E[A2='t']@A1)
'//E[contains(@A,"t")]/@A'  // attribute A of element <E> where A contains 't' (css: E[A*='t']@A)


// 4. ID & Name.


'//*[@id="I"]'                // element with id I (css: #I)
'//E[@id="I"]'                // element <E> with id I (css: E#I)
'//*[@name="N"]'              // element with name (css: [name='N'])
'//E[@name="N"]'              // element <E> with name (css: E[name='N'])
'//*[@id="X" or @name="X"]'   // element with id X or, failing that, a name X
'//*[@name="N"][v+1]'         // element with name N & specified 0-based index ‘v’ (css: [name='N']:nth-child(v+1))
'//*[@name="N"][@value="v"]'  // element with name N & specified value 'v' (css: *[name='N'][value='v'])


// 5. Lang & Class.


'//E[@lang="L" or starts-with(@lang, concat("L", "-"))]'  // element <E> is explicitly in language L or subcode (css: E[lang|=L])
'//*[contains(concat(" ", @class, " "), " C ")]'          // element with a class C (css: .C)
'//E[contains(concat(" ", @class, " "), " C ")]'          // element <E> with a class C (css: E.C)


// 6. Text & Link.


'//*[.="t"]'                  // element containing text 't' exactly
'//E[contains(text(), "t")]'  // element <E> containing text 't' (css: E:contains('t'))
'//a'                         // link element (css: a)
'//a[.="t"]'                  // element <a> containing text 't' exactly
'//a[contains(text(), "t")]'  // element <a> containing text 't' (css: a:contains('t'))
'//a[@href="url"]'            // <a> with target link 'url' (css: a[href='url'])
'//a[.="t"]/@href'            // link URL labeled with text 't' exactly


// 7. Parent & Child.


'//E/*[1]'                                                        // first child of element <E> (css: E > *:first-child)
'//E[1]'                                                          // first <E> child (css: E:first-of-type)
'//E/*[last()]'                                                   // last child of element E (css: E *:last-child)
'//E[last()]'                                                     // last <E> child (css: E:last-of-type)
'//E[2]'                                                          // second <E> child (css: E:nth-of-type(2))
'//*[2][name()="E"]'                                              // second child that is an <E> element (css: E:nth-child(2))
'//E[last()-1]'                                                   // second-to-last <E> child (css: E:nth-last-of-type(2))
'//*[last()-1][name()="E"]'                                       // second-to-last child that is an <E> element (css: E:nth-last-child(2))
'//E1[E2 and not(*[not(self::E2)])]'                              // element <E1> with only <E2> children
'//E/..'                                                          // parent of element <E>
'//*[@id="I"]/.../.../.../E'                                      // descendant <E> of element with id I using specific path (css: #I > … > … > … > E)
'//*[@id="I"]//E'                                                 // descendant <E> of element with id I using unspecified path (css: #I E)
'//E[count(*)=0]'                                                 // element <E> with no children (E:empty)
'//E[count(*)=1]'                                                 // element <E> with an only child
'//E[count(preceding-sibling::*)+count(following-sibling::*)=0]'  // element <E> that is an only child (css: E:only-child)
'//E[count(../E) = 1]'                                            // element <E> with no <E> siblings (css: E:only-of-type)
'//E[position() mod N = M + 1]'                                   // every Nth element starting with the (M+1)th (css: E:nth-child(Nn+M))


// 8. Sibling.


'//E2/following-sibling::E1'                 // element <E1> following some sibling <E2> (css: E2 ~ E1)
'//E2/following-sibling::*[1][name()="E1"]'  // element <E1> immediately following sibling <E2> (css: E2 + E1)
'//E2/following-sibling::*[2][name()="E1"]'  // element <E1> following sibling <E2> with one intermediary (css: E2 + * + E1)
'//E/following-sibling::*[1]'                // sibling element immediately following <E> (css: E + *)
'//E2/preceding-sibling::E1'                 // element <E1> preceding some sibling <E2>
'//E2/preceding-sibling::*[1][name()="E1"]'  // element <E1> immediately preceding sibling <E2>
'//E2/preceding-sibling::*[2][name()="E1"]'  // element <E1> preceding sibling <E2> with one intermediary
'//E/preceding-sibling::*[1]'                // sibling element immediately preceding <E>


// 9. Table Cell.


'//*[@id="TestTable"]//tr[3]//td[2]'          // cell by row and column (e.g. 3rd row, 2nd column) (css: #TestTable tr:nth-child(3) td:nth-child(2))
'//td[preceding-sibling::td="t"]'               // cell that comes after (as a later sibling of) a cell containing 't' exactly
'//td[preceding-sibling::td[contains(.,"t")]]'  // cell that comes after (as a later sibling of) a cell containing 't' (css: td:contains('t') ~ td)


// 10. Dynamic.


'//E[@disabled]'       // user interface element <E> that is disabled (css: E:disabled)
'//*[not(@disabled)]'  // user interface element that is enabled (css: E:enabled)
'//*[@checked]'        // checkbox (or radio button) that is checked (css: *:checked)


// 11. XPath Functions.
// https://developer.mozilla.org/en-US/docs/Web/XPath/Functions


// 11.1. Conversion.


boolean(expression)  // evaluates an expression and returns true or false.
string([object])     // converts the given argument to a string.
number([object])     // converts an object to a number and returns the number.


// 11.2. Math.


ceiling(number)  // evaluates a decimal number and returns the smallest integer greater than or equal to the decimal number.
floor(number)    // evaluates a decimal number and returns the largest integer less than or equal to the decimal number.
round(decimal)   // returns a number that is the nearest integer to the given number.
sum(node-set)    // returns a number that is the sum of the numeric values of each node in a given node-set.


// 11.3. Logic.


true()           // returns a boolean value of true.
false()          // returns boolean false.
not(expression)  // evaluates a boolean expression and returns the opposite value.


// 11.4. Node.


lang(string)               // determines whether the context node matches the given language and returns boolean true or false.
name([node-set])           // returns a string representing the QName of the first node in a given node-set.
namespace-uri([node-set])  // returns a string representing the namespace URI of the first node in a given node-set.


// 11.5. Context.


count(node-set)           // counts the number of nodes in a node-set and returns an integer.
function-available(name)  // determines if a given function is available and returns boolean true or false.
last()                    // returns a number equal to the context size from the expression evaluation context.
position()                // returns a number equal to the context position from the expression evaluation context.


// 11.6. String.


contains(haystack-string, needle-string)  // determines whether the first argument string contains the second argument string and returns boolean true or false.
concat(string1, string2 [stringn]*)       // concatenates two or more strings and returns the resulting string.
normalize-space(string)                   // strips leading and trailing white-space from a string, replaces sequences of whitespace characters by a single space, and returns the resulting string.
starts-with(haystack, needle)             // checks whether the first string starts with the second string and returns true or false.
string-length([string])                   // returns a number equal to the number of characters in a given string.
substring(string, start [length])         // returns a part of a given string.
substring-after(haystack, needle)         // returns a string that is the rest of a given string after a given substring.
substring-before(haystack, needle)        // returns the part of a given string that precedes a given substring.
translate(string, abc, XYZ)               // evaluates a string and a set of characters to translate and returns the translated string.


// 12. XPath Axes.

ancestor            // indicates all the ancestors of the context node beginning with the parent node and traveling through to the root node.
ancestor-or-self    // indicates the context node and all of its ancestors, including the root node.
attribute (@)       // indicates the attributes of the context node. Only elements have attributes. This axis can be abbreviated with the at sign (@).
child (/)           // indicates the children of the context node. If an XPath expression does not specify an axis, this is understood by default. Since only the root node or element nodes have children, any other use will select nothing.
descendant (//)     // indicates all of the children of the context node, and all of their children, and so forth. Attribute and namespace nodes are not included - the parent of an attribute node is an element node, but attribute nodes are not the children of their parents.
descendant-or-self  // indicates the context node and all of its descendants. Attribute and namespace nodes are not included - the parent of an attribute node is an element node, but attribute nodes are not the children of their parents.
following           // indicates all the nodes that appear after the context node, except any descendant, attribute, and namespace nodes.
following-sibling   // indicates all the nodes that have the same parent as the context node and appear after the context node in the source document.
parent(..)          // indicates the single node that is the parent of the context node. It can be abbreviated as two periods (..).
preceding           // indicates all the nodes that precede the context node in the document except any ancestor, attribute and namespace nodes.
preceding-sibling   // indicates all the nodes that have the same parent as the context node and appear before the context node in the source document.
self (.)            // indicates the context node itself. It can be abbreviated as a single period (.).

On gists

Xpath - ukázky

PHP-PHPDOM

readme.txt #

@url: https://funkcionalne.k47.cz/2015/01/xpath-co-proc-a-hlavne-jak.html
@url: https://funkcionalne.k47.cz/2014/05/php-dom-simplexml-a-matcher.html

================================================================

/html – Vybere ele­ment html, který je ko­ře­no­vým ele­men­tem stromu.

/html/body - Odpovídá všem elementům body, které jsou přímými potomky kořenového elementu html.

//h2 – Všechny ele­menty h2, které se v do­ku­mentu vy­sky­tují, včetně vlast­ních pod­stromů. Pokud jeden h2 kde­koli v sobě ob­sa­huje jiný ele­ment h2, vý­sled­kem budou všechny tyto ele­menty.

//div/span//a – Se­lek­tory je možno li­bo­volně ře­tě­zit a tento najde všechny a jako po­tomky span, které jsou přímými po­tomky li­bo­vol­ného divu.

./div nebo div – Vybere všechny ele­menty div které jsou přímými po­tomky právě ak­tiv­ního ele­mentu. Jde o re­la­tivní polohu a ne o polohu fi­xo­va­nou ke kořenu do­ku­mentu.

.//div – To samé jako před­chozí výraz, ale při­pouští i ne­přímé po­tomky.

div[@class="asdf"] – div, jehož atri­but class je „asdf“. XPath ne­pod­po­ruje CSS třídy a k obsahu atri­butů při­stu­puje jako ke strin­gům a proto, tento výraz není stejný jako css se­lek­tor div.asdf. XPath pod­mínce @class="asdf" vyhoví jen ty ele­menty je­jichž atri­but class je jen a pouze string „asdf“. Na toto je si třeba dávat pozor, pro­tože jde o nej­větší od­chylku od cho­vání, které by čekal člověk od­ko­jený CSS se­lek­tory.

div[@class != "asdf"] nebo div[not(@class = "asdf")] – Od­po­vídá divům je­jichž třída není „asdf“.

div[contains(@class, "asdf")] – divy je­jichž atri­but class ob­sa­huje string „asdf“. Na­pří­klad: Třída „xasdfy“ také vy­ho­vuje tomuto pre­di­kátu.

div[starts-with(@class, "asdf")] – divy je­jichž atri­but class začíná na „asdf“.

div[contains(concat(" ", normalize-space(@class), " "), " asdf ")] – Tento výraz má stejný význam jako CSS se­lek­tor div.asdf (tedy kromě toho, že CSS ne­při­hlíží v ve­li­kosti písmen a XPath ano). Na­štěstí takové kon­strukce nejsou skoro nikdy po­třeba, pro­tože vět­ši­nou stačí jed­no­du­ché @class="asdf" nebo funkce contains().

div[@class] – divy, které mají nějaký atri­but class

div[span] – divy, které mají jako pří­mého po­tomka span

div[.//span] – divy, které ob­sa­hují span

div[span[@class]] – divy, které mají pří­mého po­tomka span, který má na­sta­ve­nou li­bo­vol­nou třídu

div[@class and span] – divy, které mají na­sta­ven atri­but class a zá­ro­veň ob­sa­hují span jako po­tomka. Stejně jako and se dá použít i lo­gické or nebo funkce not().

div[1] – První div v pořadí v jakém se vy­sky­tuje v do­ku­mentu.

div[last()] – Po­slední div.

div[position() mod 2 = 0] – Každý sudý div.

Na pořadí pre­di­kátů v hra­na­tých zá­vor­kách záleží:

div[@class="asdf"][1] – Vybere všechny divy s třídou „asdf“ a z nich pak vybere ten první. Pokud exis­tují nějaké divy s touto třídou, vý­sled­kem bude vždycky první z nich.

div[1][@class="asdf"] – Vybere první div a ten ve vý­sledné mno­žině po­ne­chá jen pokud má třídu „asdf“. Když exis­tují divy s touto třídou, ale první ji nemá, vý­sled­kem je prázdná mno­žina.

div/text()[1] – Z divu vybere první kus textu, který není ob­sa­žen v žádném jiném ele­mentu.

preceding-sibling::h2 – Vybere ele­menty h2, které před­chá­zejí ak­tiv­nímu ele­mentu a zá­ro­veň jsou jeho sou­ro­zenci (tj. jsou přímými po­tomky stej­ného ele­mentu jako ak­tivní ele­ment).

a[@class="active"]/following-sibling::a[1] – Vybere první odkaz, který ná­sle­duje (a je sou­ro­ze­nec) po odkazu s třídou „active“.


On gists

Ukázka parsování stringu - PHP DOM

PHP-PHPDOM

example.php #

<?php
// Examples taken from https://github.com/kaja47/Matcher

$htmlString = '
    <div class="post" id="prvni">
        <div class="date">DNES</div>
        <h2>prvni nadpis</h2>
    </div>
    <div class="post" id="druhy">
        <div class="date">ZITRA</div>
        <h2>druhy nadpis</h2>
    </div>

    <textarea name="message"></textarea>
';

$dom = new DOMDocument();
$dom->loadHTML($htmlString);
$xpath = new DOMXPath($dom);

// Text content of the first node that $expression selects relative to $context.
$firstText = function ($expression, DOMNode $context) use ($xpath) {
    return $xpath->query($expression, $context)->item(0)->textContent;
};

// One record per <div class="post">: its id attribute, date and heading.
$res = [];
foreach ($xpath->query('//div[@class="post"]') as $post) {
    $record = [];
    $record['id'] = $firstText('@id', $post);
    $record['date'] = $firstText('div[@class="date"]', $post);
    $record['title'] = $firstText('h2', $post);

    $res[] = $record;
}

// The expression is absolute, so the context node does not narrow the search.
$res['textarea'] = $firstText('//textarea/@name', $dom);

print_r($res);

/*

Array
(
    [0] => Array
        (
            [id] => prvni
            [date] => DNES
            [title] => prvni nadpis
        )

    [1] => Array
        (
            [id] => druhy
            [date] => ZITRA
            [title] => druhy nadpis
        )

    [textarea] => message
)


*/

On gists

Parsing og tags

PHP PHP-PHPDOM

parser.php #

<?php 


/**
 * Converts a string of unknown encoding to UTF-8, guessing between UTF-8,
 * WINDOWS-1250 and ISO-8859-2.
 *
 * @param string $s Input text in one of the three encodings.
 *
 * @return string|false The input unchanged when it is taken to be UTF-8,
 *                      otherwise whatever iconv() returns for the conversion.
 */
function autoUTF($s)
{
    // A successful match in Unicode mode is taken as proof of UTF-8 input.
    $isUtf8 = preg_match('#[\x80-\x{1FF}\x{2000}-\x{3FFF}]#u', $s);
    if ($isUtf8) {
        return $s;
    }

    // These bytes are treated as the WINDOWS-1250 marker; anything else is
    // assumed to be ISO-8859-2.
    $sourceEncoding = preg_match('#[\x7F-\x9F\xBC]#', $s)
        ? 'WINDOWS-1250'
        : 'ISO-8859-2';

    return iconv($sourceEncoding, 'UTF-8', $s);
}
 

// Downloads a page and dumps its Open Graph <meta> tags as property => content.
$url = 'https://navratdoreality.cz/uzdraveny-fotbalista-8853.html';
$html = file_get_contents($url);

// Keep libxml parse problems in its internal buffer instead of raising PHP
// warnings (the alternative to prefixing loadHTML() with @).
libxml_use_internal_errors(true);

$doc = new DOMDocument();
$doc->loadHTML($html);
$xpath = new DOMXPath($doc);

// Every <meta> element whose "property" attribute starts with "og:".
$query = "//*/meta[starts-with(@property, 'og:')]";
$metas = $xpath->query($query);

$rmetas = [];
foreach ($metas as $meta) {
    $property = $meta->getAttribute('property');
    $content = $meta->getAttribute('content');

    // Normalise the value to UTF-8 first, then resolve any HTML entities.
    $rmetas[$property] = html_entity_decode(autoUTF($content));
}

var_dump($rmetas);



// $pattern='/<\s*meta\s+property="og:([^"]+)"\s+content="([^"]*)/i';
// preg_match_all($pattern, $html, $out);
// var_dump(array_combine($out[1], $out[2]));

On gists

PHP DOM examples

PHP-PHPDOM

example.php #

<?php


// Without this call, loading HTML5 markup raises warnings such as:
// Warning:  DOMDocument::loadHTML(): Tag header invalid in Entity
libxml_use_internal_errors(true);

$data = file_get_contents("test.html");

$dom = new DOMDocument();
// Non-ASCII characters are handed over as numeric entities.
// mb_encode_numericentity() replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...),
// which is deprecated as of PHP 8.2.
$dom->loadHTML(mb_encode_numericentity($data, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
$finder = new DOMXPath($dom);


// Elements
$nodesByElement = $finder->query("//a");       # CSS: a
$nodesByElement = $finder->query("//a/span");  # CSS: a > span
$nodesByElement = $finder->query("//a//span"); # CSS: a span

// IDs and attributes
$nodeById = $finder->query("//*[@id='testId']");   # CSS: #testId
$nodeById = $finder->query("//div[@id='testId']"); # CSS: div#testId
$nodeByAttr = $finder->query("//*[@data-city]");   # CSS: [data-city]

// Classes - one attribute may hold several of them, which complicates matching
// CSS: .inactive
$classToFind = "inactive";
$byClass = $finder->query("//*[contains(concat(' ', normalize-space(@class), ' '), ' ".$classToFind." ')]");

// XPath does not behave exactly like CSS in these cases: the position is
// counted among the <a> children only, not among all children.
$byIndex = $finder->query("//div/a[2]");      # CSS: div > a:nth-child(2)
$lastNode = $finder->query("//div/a[last()]");# CSS: div > a:last-child

// Searching only inside a previously found element: the context must be a
// single node (not the node list), and the path must be relative (".//"),
// otherwise "//a" still scans the whole document.
$linksInside = $finder->query(".//a", $nodeById->item(0));

// Parent, children, previous sibling, next sibling
// NOTE: text nodes are counted as well
$parentNode = $nodeById->item(0)->parentNode;
$previousSibling = $nodeById->item(0)->previousSibling;
$nextSibling = $nodeById->item(0)->nextSibling;
$children = $nodeById->item(0)->childNodes;


// Getting the HTML of the node found by the selector
$htmlPart = $dom->saveHtml($nodeById->item(0));

// Node lists used by the manipulation examples below. The original snippet
// never defined them; these selectors are placeholders - adjust them to the
// document being processed.
$linkNode = $finder->query("//a");
$nodeList = $finder->query("//div");

// Reading an attribute value
$href = $linkNode->item(0)->getAttribute("href");

// Changing an attribute
$linkNode->item(0)->setAttribute("href", "/");

// Removing an <img> tag that is a direct child
$toRemove = $finder->query("img", $nodeList->item(2));
$removedItem = $nodeList->item(2)->removeChild($toRemove->item(0));

// Inserting the (just removed) element somewhere else
$nodeList->item(0)->appendChild($removedItem);

On gists

PHPDOM - examples

PHP PHP-PHPDOM

responsive-images.php #

  // Wraps every <img> of $html in <div class="responsive-img"> and returns
  // the resulting markup.

  // Create a DOMDocument
  $dom = new DOMDocument();

  // Load the HTML with non-ASCII characters (e.g. Hebrew) passed as numeric
  // entities. mb_encode_numericentity() replaces
  // mb_convert_encoding(..., 'HTML-ENTITIES', ...), deprecated as of PHP 8.2.
  $dom->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));

  // Create the div wrapper that serves as a template for every image
  $div = $dom->createElement('div');
  $div->setAttribute('class', 'responsive-img');

  // Get all the images. The node list is live, so take a snapshot before the
  // loop starts moving nodes around in the tree.
  $images = iterator_to_array($dom->getElementsByTagName('img'), false);

  // Loop the images
  foreach ($images as $image) {
    // Clone our created div (shallow clone; the template has no children)
    $new_div_clone = $div->cloneNode();

    // Replace image with wrapper div
    $image->parentNode->replaceChild($new_div_clone, $image);

    // Append image to wrapper div
    $new_div_clone->appendChild($image);
  }

  // Save the HTML
  $html = $dom->saveHTML();

  return $html;

On gists

From https://diskuse.jakpsatweb.cz/?action=vthread&forum=9&topic=109986#7 fotbal-dom-parsování

PHP-PHPDOM

parsovani-dom.php #

$data = <<< DATA
  <tr>
    <td rowspan=6 align="center" valign="middle"><font size=-2>12. KOLO</font></td>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>17:00</font></td>
    <td align="left"><font size=-2>FC ŠEBÁNEK</font></td>
    <td align="left"><font size=-2>ROZJETEJ STROJ</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>17:53</font></td>
    <td align="left"><font size=-2>BOMBERS TEAM</font></td>
    <td align="left"><font size=-2>STATUS QUO</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>18:46</font></td>
    <td align="left"><font size=-2>ELITA CHASERS</font></td>
    <td align="left"><font size=-2>VIPER TEAM</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>19:39</font></td>
    <td align="left"><font size=-2>PROŠLÝ MLÍKA</font></td>
    <td align="left"><font size=-2>RVHP</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>20:32</font></td>
    <td align="left"><font size=-2>OSPLPPOTR.</font></td>
    <td align="left"><font size=-2>FC KEBOURY</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>21:25</font></td>
    <td align="left"><font size=-2>AC PELYŇKOVÝ DESTILÁT UNITED</font></td>
    <td align="left"><font size=-2>LOSERS OF UFTALAND</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td rowspan=6 align="center" valign="middle"><font size=-2>13. KOLO</font></td>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>17:00</font></td>
    <td align="left"><font size=-2>LOSERS OF UFTALAND</font></td>
    <td align="left"><font size=-2>OSPLPPOTR.</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>17:53</font></td>
    <td align="left"><font size=-2>FC KEBOURY</font></td>
    <td align="left"><font size=-2>PROŠLÝ MLÍKA</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>18:46</font></td>
    <td align="left"><font size=-2>AC PELYŇKOVÝ DESTILÁT UNITED</font></td>
    <td align="left"><font size=-2>FC ŠEBÁNEK</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>19:39</font></td>
    <td align="left"><font size=-2>RVHP</font></td>
    <td align="left"><font size=-2>ELITA CHASERS</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>20:32</font></td>
    <td align="left"><font size=-2>VIPER TEAM</font></td>
    <td align="left"><font size=-2>BOMBERS TEAM</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>21:25</font></td>
    <td align="left"><font size=-2>STATUS QUO</font></td>
    <td align="left"><font size=-2>ROZJETEJ STROJ</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
DATA;
 
// Result grid: $ary[row index][cell index] = text of the first <font> in that cell.
$ary = [];

$dom = new DOMDocument();
// The fixture is prefixed with an XML encoding declaration before parsing.
$dom->loadHTML('<?xml encoding="UTF-8">' . $data);

foreach ($dom->getElementsByTagName('tr') as $rowIndex => $row) {
    foreach ($row->getElementsByTagName('td') as $cellIndex => $cell) {
        $firstFont = $cell->getElementsByTagName('font')->item(0);
        $ary[$rowIndex][$cellIndex] = $firstFont->nodeValue;
    }
}

var_dump($ary);

On gists

From http://programujte.com/forum/vlakno/30409-csfd-api/

PHP-PHPDOM

csfd-dom-xpath.php #

<?php

// Scrapes a film page of csfd.cz into plain variables ($names_cs, $names_other,
// $hodnoceni, $podrobnosti, $genre, $hraji, $popis, $poster_url).
// $csfd_id is not set here; it is expected to be defined before this code runs.
$dom = new DOMDocument();

$csfd = file_get_contents("http://www.csfd.cz/film/$csfd_id");
if ($csfd === false) {
    // Treat a failed download as an empty page instead of indexing into false.
    $csfd = '';
}

// A gzip stream starts with the two magic bytes 0x1f 0x8b; decompress only then.
$html = (strncmp($csfd, "\x1f\x8b", 2) === 0) ? gzdecode($csfd) : $csfd;
if ($html === false) {
    $html = '';
}

// Buffer libxml parse problems instead of silencing everything with @, then
// restore the previous libxml setting.
$previousLibxmlSetting = libxml_use_internal_errors(true);
if ($html !== '') {
    $dom->loadHTML($html);
}
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);
// NOTE(review): kept after the load as in the original; presumably it does not
// affect the already parsed document - confirm.
$dom->preserveWhiteSpace = false;

$xpath = new DOMXPath($dom);

// Value of the first node in a list, or '' when the list is empty.
$firstValue = function (DOMNodeList $list) {
    $node = $list->item(0);

    return $node !== null ? $node->nodeValue : '';
};

$nazvy = [];
$zeme = [];
$names_other = "";
$nodes = $xpath->query("//h1[@itemprop='name']");
$names_cs = $firstValue($nodes);

// Alternative titles and the country each one belongs to (taken from the flag's alt text).
foreach ($xpath->query("//ul[@class='names']/li/h3") as $li) {
    $nazvy[] = $li->nodeValue;
}
foreach ($xpath->query("//ul[@class='names']/li/img") as $li) {
    $zeme[] = $li->getAttribute('alt');
}

// "country-title" pairs joined with ";" and no trailing separator.
$pairs = [];
foreach ($nazvy as $i => $nazev) {
    $country = isset($zeme[$i]) ? $zeme[$i] : '';
    $pairs[] = $country . "-" . $nazev;
}
$names_other = implode(";", $pairs);

$nodes = $xpath->query("//h2[@class='average']");
$hodnoceni = str_replace('%', '', $firstValue($nodes));

$nodes = $xpath->query("//p[@class='origin']");
$podrobnosti = explode(", ", $firstValue($nodes));

$nodes = $xpath->query("//p[@class='genre']");
$genre = str_replace(' / ', '@;@', $firstValue($nodes));

$nodes = $xpath->query("//span[@data-truncate='340']");
$hraji = $firstValue($nodes);

$nodes = $xpath->query("//div[@data-truncate='570']");
$popis = $firstValue($nodes);

$nodes = $xpath->query("//img[@class='film-poster']");
// The src value is prefixed with "http:" as in the original; empty when no poster is found.
$poster_url = $nodes->length > 0 ? "http:" . $nodes->item(0)->getAttribute('src') : '';