Gists - PHP-PHPDOM

On gists

Parser - PsiDetektiv

4.3.2025

PHP-PHPDOM

parser.php #

<?php

// Scrapes the "lost animals" listing of psidetektiv.cz and prints one
// associative array per catalogue item (link, labelled fields, image URL).

// Collect libxml parse problems internally instead of emitting PHP warnings.
libxml_use_internal_errors(true);

$data = file_get_contents('https://www.psidetektiv.cz/ztracena-zvirata/');

// Initialised up front so the final dump works even when nothing was parsed.
$savedItems = [];

// Output key => [class fragment of the wrapping div, label text of span.label].
// The value of a field is the first text node that follows its label span.
$labelledFields = [
    'jmeno'        => ['name', 'Jméno:'],
    'plemeno'      => ['line', 'Plemeno:'],
    'misto_ztraty' => ['line', 'Místo ztráty:'],
    'kraj'         => ['line', 'Kraj:'],
    'pohlavi'      => ['line', 'Pohlaví:'],
    'barva'        => ['line', 'Barva:'],
    'velikost'     => ['line', 'Velikost:'],
];
$fieldQuery = ".//div[contains(@class, '%s')]/span[contains(@class, 'label') and contains(text(), '%s')]/following::text()[1]";

if ($data === false || $data === '') {
    // Without this guard loadHTML() would be handed an empty document.
    echo "Failed to download the listing page\n";
} else {
    // Load HTML into DOMDocument
    $dom = new DOMDocument();
    $dom->loadHTML($data, LIBXML_NOERROR | LIBXML_NOWARNING);
    $finder = new DomXPath($dom);

    $wrapper = $finder->query("//div[@id='category-list']");

    if ($wrapper->length > 0) {
        $catalogItems = $finder->query(".//div[contains(@class, 'catalog-item')]", $wrapper->item(0));

        foreach ($catalogItems as $item) {
            // Detail link; empty string when the item has no matching anchor.
            $linkElement = $finder->query(".//a[contains(@href, '/zvire/')]", $item)->item(0);
            $link = $linkElement ? $linkElement->getAttribute('href') : '';

            // The picture is stored as a CSS background on span.bg-image.
            $bgImageElement = $finder->query(".//span[@class='bg-image']", $item)->item(0);
            $bgImageStyle = $bgImageElement ? $bgImageElement->getAttribute('style') : '';

            // Extract image URL from style
            $matches = [];
            preg_match('/background-image:url\((.*?)\)/', $bgImageStyle, $matches);
            $imageUrl = isset($matches[1]) ? $matches[1] : '';

            $animalData = ['odkaz' => $link];
            foreach ($labelledFields as $key => $spec) {
                $textNode = $finder->query(sprintf($fieldQuery, $spec[0], $spec[1]), $item)->item(0);
                // A missing label yields an empty string instead of a fatal
                // "property of null" error that would abort the whole run.
                $animalData[$key] = $textNode ? trim($textNode->nodeValue) : '';
            }
            $animalData['obrazek'] = $imageUrl;

            $savedItems[] = $animalData;
        }
    } else {
        echo "No elements found\n";
        foreach (libxml_get_errors() as $error) {
            echo "Line {$error->line}: {$error->message}\n";
        }
    }
}


echo "<pre>";
print_r($savedItems);
echo "</pre>";

On gists

PHP DOM - saveHtml (solution without entities and html,body tags)

8.6.2020

PHP PHP-PHPDOM

dom.php #

<?php
 
// Round-trips an HTML fragment through DOMDocument and prints it back
// without the <html>/<body> wrapper that loadHTML() adds.
$string = '<div>ěščřžýáíé</div><p>testik</p> <p>testik2</p>';

$dom = new \DOMDocument('1.0', 'UTF-8');
// Non-ASCII characters are passed to the parser as numeric entities.
// mb_convert_encoding(..., 'HTML-ENTITIES') did the same job but is
// deprecated as of PHP 8.2.
$dom->loadHTML('<body>' . mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8') . '</body>');

// modifications to $dom go here

// Serialise the children of <body> one by one instead of cutting the wrapper
// off with hard-coded substr() offsets, which silently break as soon as the
// wrapper markup differs (attributes on <body>, added whitespace, ...).
$fragment = '';
foreach ($dom->getElementsByTagName('body')->item(0)->childNodes as $child) {
    $fragment .= $dom->saveHTML($child);
}

echo $fragment;

On gists

XPath Cheatsheet --> https://github.com/LeCoupa/awesome-cheatsheets

29.11.2019

Cheatsheets PHP-PHPDOM

xpath-cheatsheet.js #

// XPath CheatSheet
// To test XPath in your Chrome Debugger: $x('/html/body')
// http://www.jittuu.com/2012/2/14/Testing-XPath-In-Chrome/


// 0. XPath Examples.
// More: http://xpath.alephzarro.com/content/cheatsheet.html


'//hr[@class="edge" and position()=1]'                // every first hr of 'edge' class
'//table[count(tr)=1 and count(tr/td)=2]'             // all tables with 1 row and 2 cols
'//div/form/parent::*'                                // all divs that have form
'./div/b'                                             // a relative path
'//table[parent::div[@class="pad"] and not(@id)]//a'  // any anchor in a table without id, contained in a div of "pad" class
'/html/body/div/*[preceding-sibling::h4]'             // give me whatever after h4
'//tr/td[font[@class="head" and text()="TRACK"]]'     // all td that has font of a "head" class and text "TRACK"
'./table/tr[last()]'                                  // the last row of a table
'//rdf:Seq/rdf:li/em:id'                              // using namespaces
'//a/@href'                                           // hrefs of all anchors
'//*[count(*)=3]'                                     // all nodes with 3 children
'//var|//acronym'                                     // all vars and acronyms


// 1. General.


'/html'                     // whole web page (css: html)
'/html/body'                // whole web page body (css: body)
'//text()'                  // all text nodes of web page
'/html/body/.../.../.../E'  // element <E> by absolute reference (css: body > … > … > … > E)


// 2. Tag.


'//E'                                        // element <E> by relative reference (css: E)
'(//E)[2]'                                   // second <E> element anywhere on page
'//img'                                      // image element (css: img)
'//E[@A]'                                    // element <E> with attribute A (css: E[A])
'//E[@A="t"]'                                // element <E> with attribute A containing text 't' exactly (css: E[A='t'])
'//E[contains(@A,"t")]'                      // element <E> with attribute A containing text 't' (css: E[A*='t'])
'//E[starts-with(@A, "t")]'                  // element <E> whose attribute A begins with 't' (css: E[A^='t'])
'//E[ends-with(@A, "t")]'                    // element <E> whose attribute A ends with 't' (css: E[A$='t']); XPath 2.0 only
'//E[contains(concat(" ", @A, " "), " w ")]' // element <E> with attribute A containing word 'w' (css: E[A~='w'])
'//E[matches(@A, "r")]'                      // element <E> with attribute A matching regex 'r'; XPath 2.0 only
'//E1[@id=I1] | //E2[@id=I2]'                // element <E1> with id I1 or element <E2> with id I2 (css: E1#I1, E2#I2)
'//E1[@id=I1 or @id=I2]'                     // element <E1> with id I1 or id I2 (css: E1#I1, E1#I2)


// 3. Attribute.


'//E/@A'                    // attribute A of element <E> (css: E@A)
'//*/@A'                    // attribute A of any element (css: *@A)
'//E[@A2="t"]/@A1'          // attribute A1 of element <E> where attribute A2 is 't' exactly (css: E[A2='t']@A1)
'//E[contains(@A,"t")]/@A'  // attribute A of element <E> where A contains 't' (css: E[A*='t']@A)


// 4. ID & Name.


'//*[@id="I"]'                // element with id I (css: #I)
'//E[@id="I"]'                // element <E> with id I (css: E#I)
'//*[@name="N"]'              // element with name (css: [name='N'])
'//E[@name="N"]'              // element <E> with name (css: E[name='N'])
'//*[@id="X" or @name="X"]'   // element with id X or, failing that, a name X
'//*[@name="N"][v+1]'         // element with name N & specified 0-based index ‘v’ (css: [name='N']:nth-child(v+1))
'//*[@name="N"][@value="v"]'  // element with name N & specified value 'v' (css: *[name='N'][value='v'])


// 5. Lang & Class.


'//E[@lang="L" or starts-with(@lang, concat("L", "-"))]'  // element <E> is explicitly in language L or subcode (css: E[lang|=L])
'//*[contains(concat(" ", @class, " "), " C ")]'          // element with a class C (css: .C)
'//E[contains(concat(" ", @class, " "), " C ")]'          // element <E> with a class C (css: E.C)


// 6. Text & Link.


'//*[.="t"]'                  // element containing text 't' exactly
'//E[contains(text(), "t")]'  // element <E> containing text 't' (css: E:contains('t'))
'//a'                         // link element (css: a)
'//a[.="t"]'                  // element <a> containing text 't' exactly
'//a[contains(text(), "t")]'  // element <a> containing text 't' (css: a:contains('t'))
'//a[@href="url"]'            // <a> with target link 'url' (css: a[href='url'])
'//a[.="t"]/@href'            // link URL labeled with text 't' exactly


// 7. Parent & Child.


'//E/*[1]'                                                        // first child of element <E> (css: E > *:first-child)
'//E[1]'                                                          // first <E> child (css: E:first-of-type)
'//E/*[last()]'                                                   // last child of element E (css: E > *:last-child)
'//E[last()]'                                                     // last <E> child (css: E:last-of-type)
'//E[2]'                                                          // second <E> child (css: E:nth-of-type(2))
'//*[2][name()="E"]'                                              // second child that is an <E> element (css: E:nth-child(2))
'//E[last()-1]'                                                   // second-to-last <E> child (css: E:nth-last-of-type(2))
'//*[last()-1][name()="E"]'                                       // second-to-last child that is an <E> element (css: E:nth-last-child(2))
'//E1[E2 and not(*[not(self::E2)])]'                              // element <E1> with only <E2> children
'//E/..'                                                          // parent of element <E>
'//*[@id="I"]/.../.../.../E'                                      // descendant <E> of element with id I using specific path (css: #I > … > … > … > E)
'//*[@id="I"]//E'                                                 // descendant <E> of element with id I using unspecified path (css: #I E)
'//E[count(*)=0]'                                                 // element <E> with no children (E:empty)
'//E[count(*)=1]'                                                 // element <E> with an only child
'//E[count(preceding-sibling::*)+count(following-sibling::*)=0]'  // element <E> that is an only child (css: E:only-child)
'//E[count(../E) = 1]'                                            // element <E> with no <E> siblings (css: E:only-of-type)
'//E[position() mod N = M + 1]'                                   // every Nth element starting with the (M+1)th (css: E:nth-child(Nn+M))


// 8. Sibling.


'//E2/following-sibling::E1'                 // element <E1> following some sibling <E2> (css: E2 ~ E1)
'//E2/following-sibling::*[1][name()="E1"]'  // element <E1> immediately following sibling <E2> (css: E2 + E1)
'//E2/following-sibling::*[2][name()="E1"]'  // element <E1> following sibling <E2> with one intermediary (css: E2 + * + E1)
'//E/following-sibling::*[1]'                // sibling element immediately following <E> (css: E + *)
'//E2/preceding-sibling::E1'                 // element <E1> preceding some sibling <E2>
'//E2/preceding-sibling::*[1][name()="E1"]'  // element <E1> immediately preceding sibling <E2>
'//E2/preceding-sibling::*[2][name()="E1"]'  // element <E1> preceding sibling <E2> with one intermediary
'//E/preceding-sibling::*[1]'                // sibling element immediately preceding <E>


// 9. Table Cell.


'//*[@id="TestTable"]//tr[3]//td[2]'          // cell by row and column (e.g. 3rd row, 2nd column) (css: #TestTable tr:nth-child(3) td:nth-child(2))
'//td[preceding-sibling::td="t"]'               // cell that comes after a sibling cell containing 't' exactly
'//td[preceding-sibling::td[contains(.,"t")]]'  // cell that comes after a sibling cell containing 't' (css: td:contains('t') ~ td)


// 10. Dynamic.


'//E[@disabled]'       // user interface element <E> that is disabled (css: E:disabled)
'//*[not(@disabled)]'  // user interface element that is enabled (css: E:enabled)
'//*[@checked]'        // checkbox (or radio button) that is checked (css: *:checked)


// 11. XPath Functions.
// https://developer.mozilla.org/en-US/docs/Web/XPath/Functions


// 11.1. Conversion.


boolean(expression)  // evaluates an expression and returns true or false.
string([object])     // converts the given argument to a string.
number([object])     // converts an object to a number and returns the number.


// 11.2. Math.


ceiling(number)  // evaluates a decimal number and returns the smallest integer greater than or equal to the decimal number.
floor(number)    // evaluates a decimal number and returns the largest integer less than or equal to the decimal number.
round(decimal)   // returns a number that is the nearest integer to the given number.
sum(node-set)    // returns a number that is the sum of the numeric values of each node in a given node-set.


// 11.3. Logic.


true()           // returns a boolean value of true.
false()          // returns boolean false.
not(expression)  // evaluates a boolean expression and returns the opposite value.


// 11.4. Node.


lang(string)               // determines whether the context node matches the given language and returns boolean true or false.
name([node-set])           // returns a string representing the QName of the first node in a given node-set.
namespace-uri([node-set])  // returns a string representing the namespace URI of the first node in a given node-set.


// 11.5. Context.


count(node-set)           // counts the number of nodes in a node-set and returns an integer.
function-available(name)  // determines if a given function is available and returns boolean true or false.
last()                    // returns a number equal to the context size from the expression evaluation context.
position()                // returns a number equal to the context position from the expression evaluation context.


// 11.6. String.


contains(haystack-string, needle-string)  // determines whether the first argument string contains the second argument string and returns boolean true or false.
concat(string1, string2 [stringn]*)       // concatenates two or more strings and returns the resulting string.
normalize-space(string)                   // strips leading and trailing white-space from a string, replaces sequences of whitespace characters by a single space, and returns the resulting string.
starts-with(haystack, needle)             // checks whether the first string starts with the second string and returns true or false.
string-length([string])                   // returns a number equal to the number of characters in a given string.
substring(string, start [length])         // returns a part of a given string.
substring-after(haystack, needle)         // returns a string that is the rest of a given string after a given substring.
substring-before(haystack, needle)        // returns the part of a given string that precedes a given substring.
translate(string, abc, XYZ)               // evaluates a string and a set of characters to translate and returns the translated string.


// 12. XPath Axes.

ancestor            // indicates all the ancestors of the context node beginning with the parent node and traveling through to the root node.
ancestor-or-self    // indicates the context node and all of its ancestors, including the root node.
attribute (@)       // indicates the attributes of the context node. Only elements have attributes. This axis can be abbreviated with the at sign (@).
child (/)           // indicates the children of the context node. If an XPath expression does not specify an axis, this is understood by default. Since only the root node or element nodes have children, any other use will select nothing.
descendant (//)     // indicates all of the children of the context node, and all of their children, and so forth. Attribute and namespace nodes are not included - the parent of an attribute node is an element node, but attribute nodes are not the children of their parents.
descendant-or-self  // indicates the context node and all of its descendants. Attribute and namespace nodes are not included - the parent of an attribute node is an element node, but attribute nodes are not the children of their parents.
following           // indicates all the nodes that appear after the context node, except any descendant, attribute, and namespace nodes.
following-sibling   // indicates all the nodes that have the same parent as the context node and appear after the context node in the source document.
parent(..)          // indicates the single node that is the parent of the context node. It can be abbreviated as two periods (..).
preceding           // indicates all the nodes that precede the context node in the document except any ancestor, attribute and namespace nodes.
preceding-sibling   // indicates all the nodes that have the same parent as the context node and appear before the context node in the source document.
self (.)            // indicates the context node itself. It can be abbreviated as a single period (.).

On gists

Xpath - ukázky

28.11.2019

PHP-PHPDOM

readme.txt #

@url: https://funkcionalne.k47.cz/2015/01/xpath-co-proc-a-hlavne-jak.html
@url: https://funkcionalne.k47.cz/2014/05/php-dom-simplexml-a-matcher.html

================================================================

/html – Vybere element html, který je kořenovým elementem stromu.

/html/body – Odpovídá všem elementům body, které jsou přímými potomky kořenového elementu html.

//h2 – Všechny elementy h2, které se v dokumentu vyskytují, včetně vlastních podstromů. Pokud jeden h2 kdekoli v sobě obsahuje jiný element h2, výsledkem budou všechny tyto elementy.

//div/span//a – Selektory je možno libovolně řetězit a tento najde všechny a jako potomky span, které jsou přímými potomky libovolného divu.

./div nebo div – Vybere všechny elementy div které jsou přímými potomky právě aktivního elementu. Jde o relativní polohu a ne o polohu fixovanou ke kořenu dokumentu.

.//div – To samé jako předchozí výraz, ale připouští i nepřímé potomky.

div[@class="asdf"] – div, jehož atribut class je „asdf“. XPath nepodporuje CSS třídy a k obsahu atributů přistupuje jako ke stringům a proto, tento výraz není stejný jako css selektor div.asdf. XPath podmínce @class="asdf" vyhoví jen ty elementy jejichž atribut class je jen a pouze string „asdf“. Na toto je si třeba dávat pozor, protože jde o největší odchylku od chování, které by čekal člověk odkojený CSS selektory.

div[@class != "asdf"] nebo div[not(@class = "asdf")] – Odpovídá divům jejichž třída není „asdf“.

div[contains(@class, "asdf")] – divy jejichž atribut class obsahuje string „asdf“. Například: Třída „xasdfy“ také vyhovuje tomuto predikátu.

div[starts-with(@class, "asdf")] – divy jejichž atribut class začíná na „asdf“.

div[contains(concat(" ", normalize-space(@class), " "), " asdf ")] – Tento výraz má stejný význam jako CSS selektor div.asdf (tedy kromě toho, že CSS nepřihlíží k velikosti písmen a XPath ano). Naštěstí takové konstrukce nejsou skoro nikdy potřeba, protože většinou stačí jednoduché @class="asdf" nebo funkce contains().

div[@class] – divy, které mají nějaký atribut class

div[span] – divy, které mají jako přímého potomka span

div[.//span] – divy, které obsahují span

div[span[@class]] – divy, které mají přímého potomka span, který má nastavenou libovolnou třídu

div[@class and span] – divy, které mají nastaven atribut class a zároveň obsahují span jako potomka. Stejně jako and se dá použít i logické or nebo funkce not().

div[1] – První div v pořadí v jakém se vyskytuje v dokumentu.

div[last()] – Poslední div.

div[position() mod 2 = 0] – Každý sudý div.

Na pořadí predikátů v hranatých závorkách záleží:

div[@class="asdf"][1] – Vybere všechny divy s třídou „asdf“ a z nich pak vybere ten první. Pokud existují nějaké divy s touto třídou, výsledkem bude vždycky první z nich.

div[1][@class="asdf"] – Vybere první div a ten ve výsledné množině ponechá jen pokud má třídu „asdf“. Když existují divy s touto třídou, ale první ji nemá, výsledkem je prázdná množina.

div/text()[1] – Z divu vybere první kus textu, který není obsažen v žádném jiném elementu.

preceding-sibling::h2 – Vybere elementy h2, které předcházejí aktivnímu elementu a zároveň jsou jeho sourozenci (tj. jsou přímými potomky stejného elementu jako aktivní element).

a[@class="active"]/following-sibling::a[1] – Vybere první odkaz, který následuje (a je sourozenec) po odkazu s třídou „active“.

On gists

Ukázka parsování stringu - PHP DOM

27.11.2019

PHP-PHPDOM

example.php #

<?php
// examples: https://github.com/kaja47/Matcher

// Parses a small HTML fragment and collects id, date and title of every
// div.post, plus the name attribute of the textarea.
$htmlString = '
    <div class="post" id="prvni">
        <div class="date">DNES</div>
        <h2>prvni nadpis</h2>
    </div>
    <div class="post" id="druhy">
        <div class="date">ZITRA</div>
        <h2>druhy nadpis</h2>
    </div>

    <textarea name="message"></textarea>
';

$dom = new DOMDocument();
$dom->loadHTML($htmlString);
$xpath = new \DOMXPath($dom);

$res = [];
$nodes = $xpath->query('//div[@class="post"]');
foreach ($nodes as $post) {
    // Every sub-query is evaluated relative to the current post element.
    $idAttribute = $xpath->query('@id', $post)->item(0);
    $dateElement = $xpath->query('div[@class="date"]', $post)->item(0);
    $headingElement = $xpath->query('h2', $post)->item(0);

    $res[] = [
        'id'    => $idAttribute->textContent,
        'date'  => $dateElement->textContent,
        'title' => $headingElement->textContent,
    ];
}

// The textarea lives outside the posts, so it is looked up document-wide.
$textareaName = $xpath->query('//textarea/@name')->item(0);
$res['textarea'] = $textareaName->textContent;


print_r($res);

/*

Array
(
    [0] => Array
        (
            [id] => prvni
            [date] => DNES
            [title] => prvni nadpis
        )

    [1] => Array
        (
            [id] => druhy
            [date] => ZITRA
            [title] => druhy nadpis
        )

    [textarea] => message
)


*/

On gists

Parsing og tags

1.11.2019

PHP PHP-PHPDOM

parser.php #

<?php 


/**
 * Returns $s as UTF-8, guessing between UTF-8, WINDOWS-1250 and ISO-8859-2.
 *
 * @param string $s Text in one of the three encodings.
 * @return string|false The converted text; iconv() reports failure as false.
 */
function autoUTF($s)
{
    // With the "u" modifier the pattern only matches input that is valid
    // UTF-8, so a hit means the text already is UTF-8.
    $alreadyUtf8 = preg_match('#[\x80-\x{1FF}\x{2000}-\x{3FFF}]#u', $s);
    if ($alreadyUtf8) {
        return $s;
    }

    // Bytes 0x7F-0x9F and 0xBC are taken as the WINDOWS-1250 signature;
    // everything else is assumed to be ISO-8859-2.
    $sourceCharset = preg_match('#[\x7F-\x9F\xBC]#', $s) ? 'WINDOWS-1250' : 'ISO-8859-2';

    return iconv($sourceCharset, 'UTF-8', $s);
}
 

// Downloads one article and dumps all of its Open Graph <meta> tags as a
// property => content map.
$url = 'https://navratdoreality.cz/uzdraveny-fotbalista-8853.html';

$html = file_get_contents($url);
if ($html === false || $html === '') {
    // loadHTML() cannot work with an empty document, so stop with a clear message.
    throw new RuntimeException("Unable to download {$url}");
}

// Collect libxml parse warnings internally instead of silencing them with @.
libxml_use_internal_errors(true);
$doc = new DomDocument();
$doc->loadHTML($html);
// The collected warnings are not used here; release them.
libxml_clear_errors();

$xpath = new DOMXPath($doc);
$query = '//*/meta[starts-with(@property, \'og:\')]';
$metas = $xpath->query($query);

$rmetas = [];
foreach ($metas as $meta) {
    $property = $meta->getAttribute('property');
    $content = $meta->getAttribute('content');
    // autoUTF() normalises the encoding; any entities that are still present
    // afterwards are decoded.
    $rmetas[$property] = html_entity_decode(autoUTF($content));
}
var_dump($rmetas);



// $pattern='/<\s*meta\s+property="og:([^"]+)"\s+content="([^"]*)/i';
// preg_match_all($pattern, $html, $out);
// var_dump(array_combine($out[1], $out[2]));

On gists

PHP DOM examples

25.10.2018

PHP-PHPDOM

example.php #

<?php


// Without this call, loading HTML5 markup raises warnings such as:
// Warning:  DOMDocument::loadHTML(): Tag header invalid in Entity
libxml_use_internal_errors(true);

$data = file_get_contents("test.html");

$dom = new DOMDocument();
// Non-ASCII characters are handed to the parser as numeric entities.
// mb_convert_encoding(..., 'HTML-ENTITIES') did the same but is deprecated
// as of PHP 8.2.
$dom->loadHtml(mb_encode_numericentity($data, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
$finder = new DomXPath($dom);


// Elements
$nodesByElement = $finder->query("//a");       # CSS: a
$nodesByElement = $finder->query("//a/span");  # CSS: a > span
$nodesByElement = $finder->query("//a//span"); # CSS: a span

// IDs and attributes
$nodeById = $finder->query("//*[@id='testId']");   # CSS: #testId
$nodeById = $finder->query("//div[@id='testId']"); # CSS: div#testId
$nodeByAttr = $finder->query("//*[@data-city]");   # CSS: [data-city]

// Classes - one attribute may hold several of them, which complicates things
// CSS: .inactive
$classToFind = "inactive";
$byClass = $finder->query("//*[contains(concat(' ', normalize-space(@class), ' '), ' ".$classToFind." ')]");

// In these cases XPath does not behave exactly like CSS
$byIndex = $finder->query("//div/a[2]");      # CSS: div > a:nth-child(2)
$lastNode = $finder->query("//div/a[last()]");# CSS: div > a:last-child

// Searching only inside a previously found element.
// The context must be a single node (not the node list), and the expression
// must be relative (".//"), otherwise "//a" still searches the whole document.
$finder->query(".//a", $nodeById->item(0));

// Parent, children, previous sibling, next sibling
// NOTE: text nodes are counted as well
$parentNode = $nodeById->item(0)->parentNode;
$previousSibling = $nodeById->item(0)->previousSibling;
$nextSibling = $nodeById->item(0)->nextSibling;
$children = $nodeById->item(0)->childNodes;


// Getting the HTML of the element found by a selector
$htmlPart = $dom->saveHtml($nodeById->item(0));

// Example node lists for the calls below; the selectors are placeholders,
// adjust them to the document at hand.
$linkNode = $finder->query("//a");
$nodeList = $finder->query("//div");

// Reading an attribute value
$linkNode->item(0)->getAttribute("href");

// Changing an attribute
$linkNode->item(0)->setAttribute("href", "/");

// Removing an img tag that is a direct child
$toRemove = $finder->query("img", $nodeList->item(2));
$removedItem = $nodeList->item(2)->removeChild($toRemove->item(0));

// Inserting a new (here: the just removed) element
$nodeList->item(0)->appendChild($removedItem);

On gists

PHPDOM - examples

22.10.2018

PHP PHP-PHPDOM

responsive-images.php #

  // Wraps every <img> of the markup in $html in <div class="responsive-img">
  // and returns the re-serialised document.

  // Create a DOMDocument
  $dom = new DOMDocument();

  // Load html including utf8, like Hebrew. Non-ASCII characters are passed
  // as numeric entities; mb_convert_encoding(..., 'HTML-ENTITIES') did the
  // same but is deprecated as of PHP 8.2.
  $dom->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));

  // Create the div wrapper that is cloned for each image
  $div = $dom->createElement('div');
  $div->setAttribute('class', 'responsive-img');

  // Get all the images. getElementsByTagName() returns a live list, so take
  // a snapshot before the loop starts moving nodes around in the tree.
  $images = iterator_to_array($dom->getElementsByTagName('img'));

  // Loop the images
  foreach ($images as $image) {
    // Clone our created div (shallow clone; the class attribute is kept)
    $new_div_clone = $div->cloneNode();

    // Replace image with wrapper div
    $image->parentNode->replaceChild($new_div_clone, $image);

    // Append image to wrapper div
    $new_div_clone->appendChild($image);
  }

  // Save the HTML
  $html = $dom->saveHTML();

  return $html;

On gists

From https://diskuse.jakpsatweb.cz/?action=vthread&forum=9&topic=109986#7 fotbal-dom-parsování

28.10.2016

PHP-PHPDOM

parsovani-dom.php #

$data = <<< DATA
  <tr>
    <td rowspan=6 align="center" valign="middle"><font size=-2>12. KOLO</font></td>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>17:00</font></td>
    <td align="left"><font size=-2>FC ŠEBÁNEK</font></td>
    <td align="left"><font size=-2>ROZJETEJ STROJ</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>17:53</font></td>
    <td align="left"><font size=-2>BOMBERS TEAM</font></td>
    <td align="left"><font size=-2>STATUS QUO</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>18:46</font></td>
    <td align="left"><font size=-2>ELITA CHASERS</font></td>
    <td align="left"><font size=-2>VIPER TEAM</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>19:39</font></td>
    <td align="left"><font size=-2>PROŠLÝ MLÍKA</font></td>
    <td align="left"><font size=-2>RVHP</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>20:32</font></td>
    <td align="left"><font size=-2>OSPLPPOTR.</font></td>
    <td align="left"><font size=-2>FC KEBOURY</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>10.9.</font></td>
    <td align="center"><font size=-2>21:25</font></td>
    <td align="left"><font size=-2>AC PELYŇKOVÝ DESTILÁT UNITED</font></td>
    <td align="left"><font size=-2>LOSERS OF UFTALAND</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td rowspan=6 align="center" valign="middle"><font size=-2>13. KOLO</font></td>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>17:00</font></td>
    <td align="left"><font size=-2>LOSERS OF UFTALAND</font></td>
    <td align="left"><font size=-2>OSPLPPOTR.</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>17:53</font></td>
    <td align="left"><font size=-2>FC KEBOURY</font></td>
    <td align="left"><font size=-2>PROŠLÝ MLÍKA</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>18:46</font></td>
    <td align="left"><font size=-2>AC PELYŇKOVÝ DESTILÁT UNITED</font></td>
    <td align="left"><font size=-2>FC ŠEBÁNEK</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>19:39</font></td>
    <td align="left"><font size=-2>RVHP</font></td>
    <td align="left"><font size=-2>ELITA CHASERS</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>20:32</font></td>
    <td align="left"><font size=-2>VIPER TEAM</font></td>
    <td align="left"><font size=-2>BOMBERS TEAM</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
  <tr>
    <td align="center"><font size=-2>17.9.</font></td>
    <td align="center"><font size=-2>21:25</font></td>
    <td align="left"><font size=-2>STATUS QUO</font></td>
    <td align="left"><font size=-2>ROZJETEJ STROJ</font></td>
    <td align="right"><font size=-2><B> </B></font></td>
    <td align="right"><font size=-2> </font></td>
    <td align="left"><font size=-2> </font></td>
  </tr>
DATA;
 
// Turns the table rows held in $data into a two-dimensional array:
// $ary[row index][cell index] = text of the first <font> inside the cell.
$ary = [];

$dom = new DOMDocument();
// The pseudo XML declaration makes the parser read the markup as UTF-8.
$dom->loadHTML('<?xml encoding="UTF-8">' . $data);
$trs = $dom->getElementsByTagName('tr');

foreach ($trs as $i => $row) {
    foreach ($row->getElementsByTagName('td') as $j => $cell) {
        $ary[$i][$j] = $cell->getElementsByTagName('font')->item(0)->nodeValue;
    }
}

var_dump($ary);

On gists

From http://programujte.com/forum/vlakno/30409-csfd-api/

16.10.2016

PHP-PHPDOM

csfd-dom-xpath.php #

<?php

// Downloads a film page from csfd.cz (the id comes from $csfd_id, which is
// defined outside this snippet) and extracts names, rating, origin, genre,
// cast, description and poster URL.
$dom = new domDocument;
$csfd = file_get_contents("http://www.csfd.cz/film/$csfd_id");
if ($csfd === false || $csfd === '') {
    throw new RuntimeException("Unable to download film page for id {$csfd_id}");
}

// The response may arrive gzip-compressed; check both gzip magic bytes.
$html = (strncmp($csfd, "\x1f\x8b", 2) === 0) ? gzdecode($csfd) : $csfd;
if ($html === false || $html === '') {
    throw new RuntimeException("Unable to decode film page for id {$csfd_id}");
}

// Collect markup warnings internally instead of hiding them with @.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);
// NOTE(review): set after parsing, as in the original; it probably has no
// effect on the already loaded document - confirm.
$dom->preserveWhiteSpace = false;

$xpath = new DOMXPath($dom);

// Text of the first node matching $expression, or '' when nothing matches,
// so that a changed page layout does not end in a fatal error.
$firstText = function ($expression) use ($xpath) {
    $node = $xpath->query($expression)->item(0);
    return $node ? $node->nodeValue : '';
};

$names_cs = $firstText("//h1[@itemprop='name']");

// Title and country flag are read from the same <li>, which keeps the two
// arrays aligned even when one list item lacks its flag image.
$nazvy = array();
$zeme = array();
$pairs = array();
foreach ($xpath->query("//ul[@class='names']/li[h3]") as $li) {
    $title = $xpath->query("h3", $li)->item(0)->nodeValue;
    $flag = $xpath->query("img", $li)->item(0);
    $country = $flag ? $flag->getAttribute('alt') : '';

    $nazvy[] = $title;
    $zeme[] = $country;
    $pairs[] = $country."-".$title;
}
// "country-title" pairs separated by ";" without a trailing separator.
$names_other = implode(";", $pairs);

$hodnoceni = str_replace('%', '', $firstText("//h2[@class='average']"));

$podrobnosti = explode(", ", $firstText("//p[@class='origin']"));

$genre = str_replace(' / ', '@;@', $firstText("//p[@class='genre']"));

$hraji = $firstText("//span[@data-truncate='340']");

$popis = $firstText("//div[@data-truncate='570']");

// The src attribute is prefixed with "http:" as in the original
// (presumably a protocol-relative URL - verify); empty when no poster exists.
$nodes = $xpath->query("//img[@class='film-poster']");
$poster = $nodes->item(0);
$poster_url = $poster ? "http:".$poster->getAttribute('src') : '';