File: /home/eliteafr/pmb/classes/docwatch/datasources/docwatch_datasource_monitoring_website.class.php
<?php
// +-------------------------------------------------+
// © 2002-2014 PMB Services / www.sigb.net pmb@sigb.net et contributeurs (voir www.sigb.net)
// +-------------------------------------------------+
// $Id: docwatch_datasource_monitoring_website.class.php,v 1.1.2.5 2016-12-05 13:25:38 dgoron Exp $
// Refuse direct HTTP requests to this class file. REQUEST_URI is not set when
// PHP runs from the command line, so its presence is checked before reading it.
if (isset($_SERVER['REQUEST_URI']) && stristr($_SERVER['REQUEST_URI'], ".class.php")) die("no access");
require_once($class_path."/docwatch/datasources/docwatch_datasource.class.php");
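/**
* class docwatch_datasource_monitoring_website
*
* Docwatch datasource that monitors a web page: it downloads the page, stores
* the cleaned content and its hash in the docwatch_datasource_monitoring_website
* table, and produces watch items when the content differs from the stored copy.
*/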
class docwatch_datasource_monitoring_website extends docwatch_datasource{
/** Aggregations: */
/** Compositions: */
/*** Attributes: ***/
protected $upload_date;
protected $content;
protected $content_hash;
protected $content_headers;
/**
 * Builds the datasource; all the work is delegated to the parent constructor.
 *
 * @param int $id Identifier forwarded unchanged to the parent constructor (defaults to 0).
 * @access public
 */
public function __construct($id=0) {
    parent::__construct($id);
}
/**
 * Reduces a fetched HTML document (or fragment) to the text that is stored and compared.
 *
 * Steps:
 *  - when no XPath expression is configured, keep only what is inside <body>…</body>
 *    (the whole input is kept when no body element is found);
 *  - remove <script> elements;
 *  - depending on the 'mode_creation_items' parameter, either strip every tag
 *    ('all_change', 'by_change') or keep only the <a>…</a> elements, one per line
 *    ('all_links'). Any other mode leaves the markup as it is.
 *
 * @param string $html Raw HTML.
 * @return string Cleaned content.
 */
protected function clean_html($html){
    // set_from_form() always stores an array here (possibly empty), and an array
    // never compares equal to '' — empty() covers unset, '' and array() alike.
    if(empty($this->parameters['xpath_expressions'])) {
        if(preg_match("/\<body.*\>(.*)\<\/body\>/isU", $html, $matches)) {
            $html = $matches[1];
        }
    }
    $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);

    $mode = isset($this->parameters['mode_creation_items']) ? $this->parameters['mode_creation_items'] : '';
    switch ($mode) {
        case 'all_change':
        case 'by_change':
            $html = strip_tags($html);
            break;
        case 'all_links':
            // Drop everything but anchors, then list each complete <a>…</a> on its own line.
            preg_match_all("/\<a.*\>(.*)\<\/a\>/isU", strip_tags($html, '<a>'), $link_matches);
            $html = implode(PHP_EOL, $link_matches[0]);
            break;
    }
    return $html;
}
/**
 * Persists $this->content (serialized) and $this->content_hash for this datasource
 * in the docwatch_datasource_monitoring_website table, stamping the upload date
 * with NOW().
 *
 * @param bool $first true to insert a new row, false to update the row of this datasource.
 * @return void
 */
protected function save_content($first=true) {
    $datasource_id = (int) $this->id;
    // Only the update needs a where clause; keep it defined (and space-separated) in both cases.
    $where = '';
    if($first) {
        $query = "insert into docwatch_datasource_monitoring_website set
datasource_monitoring_website_num_datasource = '".$datasource_id."', ";
    } else {
        $query = "update docwatch_datasource_monitoring_website set ";
        $where = " where datasource_monitoring_website_num_datasource = '".$datasource_id."' ";
    }
    $query .= "
datasource_monitoring_website_upload_date = NOW(),
datasource_monitoring_website_content = '".addslashes(serialize($this->content))."',
datasource_monitoring_website_content_hash = '".addslashes($this->content_hash)."'";
    $query .= $where;
    pmb_mysql_query($query);
}
/**
 * Turns a unified diff string into a list of changes, one entry per hunk.
 *
 * The diff is split on its "@@ … @@" hunk headers. Inside each hunk:
 *  - added lines ("+") are kept, wrapped in <p>…</p>;
 *  - removed lines ("-") are not kept but still mark the hunk as changed;
 *  - context lines are kept, wrapped in <p>…</p>;
 *  - lines that are empty once tags and surrounding whitespace are removed are ignored,
 *    as are "\ …" marker lines (e.g. "\ No newline at end of file").
 * A hunk is returned only if it holds at least one non-empty added or removed line.
 *
 * @param string $xdiff_string Unified diff text.
 * @return array List of hunks, each a list of "<p>…</p>" strings.
 */
protected function get_xdiff_change($xdiff_string) {
    $xdiff_change = array();
    // Splitting on the headers avoids locating each header again by its text.
    $hunks = preg_split("/@@(.*)@@/isU", (string) $xdiff_string);
    if(!is_array($hunks)) {
        return $xdiff_change;
    }
    // The first piece is whatever precedes the first hunk header.
    array_shift($hunks);

    foreach ($hunks as $hunk) {
        $has_change = false;
        $change = array();
        foreach (preg_split("/\r?\n/", $hunk) as $raw_line) {
            if(trim($raw_line) === '') {
                continue;
            }
            // The marker is the first character of the untrimmed line, so that a
            // context line whose text starts with "+" or "-" is not misread.
            $marker = substr($raw_line, 0, 1);
            if($marker === '\\') {
                continue;
            }
            if($marker === '-' || $marker === '+') {
                $text = trim(substr($raw_line, 1));
                if(strip_tags($text) === '') {
                    continue;
                }
                $has_change = true;
                if($marker === '+') {
                    $change[] = "<p>".$text."</p>";
                }
            } else {
                $text = trim($raw_line);
                if(strip_tags($text) !== '') {
                    $change[] = "<p>".$text."</p>";
                }
            }
        }
        if($has_change) {
            $xdiff_change[] = $change;
        }
    }
    return $xdiff_change;
}
/**
 * Downloads $link and builds the list of content entries to monitor.
 *
 * The response headers are kept in $this->content_headers. Without configured
 * XPath expressions the result holds a single entry made of the cleaned page.
 * Otherwise there is one entry per content expression that matches a node,
 * completed with the title and link found through the expressions stored at
 * the same index in 'xpath_expressions_for_title' / 'xpath_expressions_for_link'.
 *
 * @param string $link URL of the page to fetch.
 * @return array List of array('content' => string, 'title' => string, 'link' => string);
 *               empty when $link is empty, the body is empty, the status code is
 *               not 200, the HTML cannot be loaded or no content expression matches.
 */
protected function get_content_from_link($link) {
    global $charset;

    $content_from_link = array();
    if(!$link) {
        return $content_from_link;
    }
    @ini_set("zend.ze1_compatibility_mode", "0");
    // NOTE(review): Curl is a project class not visible here; timeout is presumably in seconds — confirm.
    $aCurl = new Curl();
    $aCurl->timeout=2;
    $content = $aCurl->get($link);
    $html=$content->body;
    if(!$html || $content->headers['Status-Code'] != 200) {
        return $content_from_link;
    }
    $this->content_headers = $content->headers;
    if($charset != 'utf-8') {
        // NOTE(review): assumes the remote page is UTF-8 encoded — confirm.
        $html = utf8_decode($html);
    }

    $expressions = isset($this->parameters['xpath_expressions']) ? $this->parameters['xpath_expressions'] : null;
    if(!is_array($expressions) || !count($expressions)) {
        $content_from_link[] = array(
            'content' => $this->clean_html($html),
            'title' => '',
            'link' => ''
        );
        return $content_from_link;
    }
    $title_expressions = (isset($this->parameters['xpath_expressions_for_title']) && is_array($this->parameters['xpath_expressions_for_title'])) ? $this->parameters['xpath_expressions_for_title'] : array();
    $link_expressions = (isset($this->parameters['xpath_expressions_for_link']) && is_array($this->parameters['xpath_expressions_for_link'])) ? $this->parameters['xpath_expressions_for_link'] : array();

    $dom = new DOMDocument();
    $dom->encoding = $charset;
    // Silence libxml parse errors while loading; the previous setting is restored below.
    $old_errors_value = libxml_use_internal_errors(true);
    if($dom->loadHTML($html)) {
        $xpath = new DOMXPath($dom);
        foreach ($expressions as $i=>$xpath_expression) {
            $node_html = $this->get_first_node_html($dom, $xpath, $xpath_expression);
            if($node_html === null) {
                // Nothing to monitor for this expression.
                continue;
            }
            $html_content = $this->clean_html($node_html);

            $html_title = '';
            if(!empty($title_expressions[$i])) {
                $title_html = $this->get_first_node_html($dom, $xpath, $title_expressions[$i]);
                if($title_html !== null) {
                    $html_title = $this->clean_html($title_html);
                }
            }

            $html_link = '';
            if(!empty($link_expressions[$i])) {
                $link_html = $this->get_first_node_html($dom, $xpath, $link_expressions[$i]);
                // Keep the first complete <a>…</a> element of the matched node.
                if($link_html !== null && preg_match("/\<a.*\>(.*)\<\/a\>/isU", strip_tags($link_html, '<a>'), $link_matches)) {
                    $html_link = $link_matches[0];
                }
            }

            $content_from_link[] = array(
                'content' => $html_content,
                'title' => $html_title,
                'link' => $html_link
            );
        }
    }
    libxml_use_internal_errors($old_errors_value);
    return $content_from_link;
}

/**
 * Serializes the first node matched by an XPath expression.
 *
 * @param DOMDocument $dom Loaded document.
 * @param DOMXPath $xpath XPath evaluator bound to $dom.
 * @param string $expression XPath expression.
 * @return string|null HTML of the first matched node, or null when the expression
 *                     is invalid or matches nothing. Passing a null node to
 *                     saveHTML() would serialize the whole document instead.
 */
protected function get_first_node_html($dom, $xpath, $expression) {
    $entries = $xpath->query($expression);
    if(!$entries || !$entries->length) {
        return null;
    }
    return $dom->saveHTML($entries->item(0));
}
/**
 * Builds the URL attached to an item.
 *
 * - Without $link_for_construct, the monitored URL is returned with a fragment
 *   made of the timestamp of the 'Date' response header and a random number.
 * - An http(s) URL passed as $link_for_construct is returned as is.
 * - Otherwise $link_for_construct is searched for an href attribute: an absolute
 *   http(s) href is returned as is, a protocol-relative href ("//host/…") gets the
 *   scheme of $link, and any other href is appended to $link without its leading
 *   slash. NOTE(review): this is a plain concatenation, not a full relative-URL
 *   resolution — $link is expected to end with "/".
 * - When no href is found, $link_for_construct is appended to $link unchanged.
 *
 * @param string $link Monitored URL.
 * @param string $link_for_construct URL, or HTML anchor, found in the page.
 * @return string
 */
protected function get_constructed_remote_link($link, $link_for_construct = '') {
    if(!$link_for_construct) {
        $timestamp = isset($this->content_headers['Date']) ? strtotime($this->content_headers['Date']) : false;
        return $link.'#'.$timestamp.rand(0,1000);
    }
    if(substr($link_for_construct, 0, 7) == 'http://' || substr($link_for_construct, 0, 8) == 'https://') {
        return $link_for_construct;
    }
    if(!preg_match("/\s+(?:[^\"'>]+|\"[^\"]*\"|'[^']*')*href=(\"[^\"]+\"|'[^']+'|[^<>\s]+)/i", ' '.$link_for_construct.' ', $matches)) {
        return $link.$link_for_construct;
    }
    // Group 1 is the href value alone (still quoted); $matches[0] would also
    // carry the tag name and every attribute placed before href.
    $href = trim($matches[1], "\"'");
    if(substr($href, 0, 7) == 'http://' || substr($href, 0, 8) == 'https://') {
        return $href;
    }
    if(substr($href, 0, 2) == '//') {
        return (substr($link, 0, 8) == 'https://' ? 'https:' : 'http:').$href;
    }
    return $link.ltrim($href, '/');
}
/**
 * Fetches the monitored page, compares it with the copy stored for this
 * datasource and returns the data of the items to create.
 *
 * - First fetch (no stored row): the content is stored, no item is returned.
 * - Same hash as the stored one: nothing is done, no item is returned.
 * - With XPath expressions: one item per fetched entry whose content is not
 *   among the stored entries.
 * - Without XPath expressions: the stored and fetched contents are diffed with
 *   xdiff; mode 'by_change' yields one item per changed hunk, any other mode
 *   yields a single item holding all the hunks.
 * After a comparison the stored copy is replaced by the fetched one.
 *
 * @param string $link URL of the page to monitor.
 * @return array|false List of item data arrays (possibly empty); false when the
 *                     page yields no content or when a diff is needed and the
 *                     xdiff extension is not loaded.
 */
protected function get_items_datas($link){
    $datas = array();
    $items = array();

    $content_from_link = $this->get_content_from_link($link);
    if(!is_array($content_from_link) || !count($content_from_link)) {
        return false;
    }
    $content_hash_from_link = md5(serialize($content_from_link));

    $query = "select datasource_monitoring_website_upload_date, datasource_monitoring_website_content, datasource_monitoring_website_content_hash
from docwatch_datasource_monitoring_website where datasource_monitoring_website_num_datasource = ".((int) $this->id);
    $result = pmb_mysql_query($query);
    if(!$result || !pmb_mysql_num_rows($result)) {
        // Nothing stored yet: this fetch becomes the reference copy.
        $this->content = $content_from_link;
        $this->content_hash = $content_hash_from_link;
        $this->save_content(true);
        return $datas;
    }

    $row = pmb_mysql_fetch_object($result);
    // Strict comparison: two different all-numeric-looking hashes ("0e…") are equal with ==.
    if($content_hash_from_link === (string) $row->datasource_monitoring_website_content_hash) {
        return $datas;
    }

    $content_from_base = unserialize($row->datasource_monitoring_website_content);
    if(!is_array($content_from_base)) {
        $content_from_base = array();
    }

    $use_xpath = isset($this->parameters['xpath_expressions'])
        && is_array($this->parameters['xpath_expressions'])
        && count($this->parameters['xpath_expressions']);
    if($use_xpath) {
        $known_hashes = array();
        foreach ($content_from_base as $content) {
            $known_hashes[md5(isset($content['content']) ? $content['content'] : '')] = true;
        }
        foreach ($content_from_link as $content) {
            if(!isset($known_hashes[md5($content['content'])])) {
                $items[] = array(
                    'content' => $content['content'],
                    'title' => (!empty($content['title']) ? $content['title'] : $this->get_title()),
                    'link' => $this->get_constructed_remote_link($link, $content['link'])
                );
            }
        }
    } else {
        if(!extension_loaded('xdiff')) {
            return false;
        }
        $old_text = isset($content_from_base[0]['content']) ? (string) $content_from_base[0]['content'] : '';
        $new_text = (string) $content_from_link[0]['content'];
        if($old_text !== $new_text) {
            $xdiff_string = xdiff_string_diff($old_text, $new_text);
            $xdiff_change = $this->get_xdiff_change($xdiff_string);
            if($this->parameters['mode_creation_items'] == 'by_change') {
                foreach ($xdiff_change as $change) {
                    $items[] = array(
                        'content' => implode('', $change),
                        'title' => $this->get_title(),
                        'link' => $this->get_constructed_remote_link($link)
                    );
                }
            } else {
                $item_content = '';
                foreach ($xdiff_change as $change) {
                    $item_content .= implode('', $change);
                }
                $items[] = array(
                    'content' => $item_content,
                    'title' => $this->get_title(),
                    'link' => $this->get_constructed_remote_link($link)
                );
            }
        }
    }

    // Use the 'Date' response header as publication date, or the current time
    // when the header is missing or cannot be parsed.
    $timestamp = isset($this->content_headers['Date']) ? strtotime($this->content_headers['Date']) : false;
    if($timestamp === false) {
        $timestamp = time();
    }
    $publication_date = date('Y-m-d H:i:s', $timestamp);

    foreach ($items as $item) {
        $data = array();
        $data["type"] = "monitoring_website";
        $data["title"] = $item['title'];
        $data["summary"] = $item['content'];
        $data["content"] = '';
        $data["url"] = $item['link'];
        $data["publication_date"] = $publication_date;
        $data["logo_url"] = '';
        $data["descriptors"] = "";
        $data["tags"] = '';
        $datas[] = $data;
    }

    $this->content = $content_from_link;
    $this->content_hash = $content_hash_from_link;
    $this->save_content(false);
    return $datas;
}
/**
 * Lists the selectors usable with this datasource.
 *
 * @return array Selector class name => label taken from the global $msg table.
 */
public function get_available_selectors(){
    global $msg;

    $selectors = array();
    $selectors["docwatch_selector_monitoring_website"] = $msg['dsi_docwatch_selector_monitoring_website'];
    return $selectors;
}
/**
 * Builds the form row of one XPath expression: a text input, a reset button
 * and, for the first row only ($i == 0), a "+" button that appends a new empty
 * input with its own reset button inside the 'add_<parameter_name>' element.
 *
 * @param string $parameter_name Parameter the row belongs to; used in ids and field names.
 * @param int $i Position of the row.
 * @param string $expression Current expression; escaped before being written in the value attribute.
 * @return string HTML of the row.
 */
protected function get_expression_xpath_content($parameter_name, $i, $expression) {
    global $msg,$charset;

    $field = "docwatch_datasource_monitoring_website_".$parameter_name;
    // XPath expressions usually hold quotes ([@class="x"]); written raw they
    // would close the value attribute and truncate the expression.
    $escaped_expression = htmlentities($expression, ENT_QUOTES, $charset);

    $form = "<input type='text' data-dojo-type='dijit/form/TextBox' id='".$field."_".$i."' name='".$field."[]' value=\"".$escaped_expression."\" style='width: 50em;'/>
<button data-dojo-type='dijit/form/Button' type='button'>".$msg['raz']."
<script type='dojo/on' data-dojo-event='click' data-dojo-args='evt'>
require(['dojo/dom'], function(dom){
dom.byId('".$field."_".$i."').value = '';
});
</script>
</button>";
    if($i == 0) {
        // The counter is read with parseInt: the field value is a string, and
        // string+1 would concatenate ("1" -> "11") instead of incrementing.
        $form .= "<button data-dojo-type='dijit/form/Button' type='button'>+
<script type='dojo/on' data-dojo-event='click' data-dojo-args='evt'>
require(['dojo/dom', 'dojo/dom-construct', 'dojo/dom-attr', 'dojo/on', 'dojo/parser'], function(dom, domConstruct, domAttr, on, parser){
var count = parseInt(dom.byId('".$parameter_name."_count').value, 10);
var div = domConstruct.create('div', {id : 'monitoring_website_".$parameter_name."_'+count});
var input = domConstruct.create('input', {type : 'text', 'data-dojo-type' : 'dijit/form/TextBox', id : '".$field."_'+count, name : '".$field."[]', style : 'width: 50em;'});
domConstruct.place(input, div);
var button = domConstruct.create('button', {id : 'monitoring_website_".$parameter_name."_'+count+'_button', 'data-dojo-type' : 'dijit/form/Button', type : 'button', innerHTML : ' X '});
on(button, 'click', function(){
dom.byId('".$field."_'+count).value = '';
});
domConstruct.place(button, div);
domConstruct.place(div, 'add_".$parameter_name."');
parser.parse('monitoring_website_".$parameter_name."_'+count);
dom.byId('".$parameter_name."_count').value = count+1;
});
</script>
</button>";
    }
    return $form;
}
/**
 * Builds the form section of one list of XPath expressions: a label, one row
 * per stored expression (a single blank row when none is stored), a hidden
 * '<parameter_name>_count' field and the empty 'add_<parameter_name>' container.
 *
 * @param string $parameter_name Key of the expression list in $this->parameters.
 * @return string HTML of the section.
 */
protected function get_xpath_expressions_form($parameter_name) {
    global $msg,$charset;

    $label = htmlentities($msg['dsi_docwatch_datasource_monitoring_website_'.$parameter_name],ENT_QUOTES,$charset);
    $form = "
<div class='row'>
<label>".$label."</label>
</div>";

    $has_expressions = is_array($this->parameters[$parameter_name]) && count($this->parameters[$parameter_name]);
    if(!$has_expressions) {
        // Nothing stored: a single blank row, counter set to 1.
        $form .= "
<div class='row'>
".$this->get_expression_xpath_content($parameter_name, 0, '')."
</div>
<input type='hidden' id='".$parameter_name."_count' name='".$parameter_name."_count' value= '1' />";
    } else {
        $expressions = $this->parameters[$parameter_name];
        foreach ($expressions as $position=>$expression) {
            $form .= "
<div class='row'>
".$this->get_expression_xpath_content($parameter_name, $position, $expression)."
</div>";
        }
        $form .= "<input type='hidden' id='".$parameter_name."_count' name='".$parameter_name."_count' value= '".count($expressions)."' />";
    }

    return $form."<div id='add_".$parameter_name."'></div>";
}
/**
 * Completes the parent form with the fields of this datasource: the item
 * creation mode selector followed by the three XPath expression sections
 * (content, title, link).
 *
 * @return string HTML of the form content.
 */
public function get_form_content(){
    global $msg,$charset;

    $form = parent::get_form_content();
    $label = htmlentities($msg['dsi_docwatch_datasource_monitoring_website_mode_creation_items'],ENT_QUOTES,$charset);

    // One <option> per creation mode, the stored one being preselected.
    $options = array();
    foreach (array('all_change', 'by_change', 'all_links') as $mode) {
        $selected = "";
        if($mode == $this->parameters['mode_creation_items']) {
            $selected = "selected='selected'";
        }
        $options[$mode] = "<option value='".$mode."' ".$selected.">".htmlentities($msg['dsi_docwatch_datasource_monitoring_website_mode_creation_items_'.$mode], ENT_QUOTES, $charset)."</option>";
    }

    $sections = array();
    foreach (array('xpath_expressions', 'xpath_expressions_for_title', 'xpath_expressions_for_link') as $parameter_name) {
        $sections[$parameter_name] = $this->get_xpath_expressions_form($parameter_name);
    }

    $form .= "<div class='row'> </div>
<div class='row'>
<label>".$label."</label>
</div>
<div class='row'>
<select id='docwatch_datasource_monitoring_website_mode_creation_items' name='docwatch_datasource_monitoring_website_mode_creation_items'>
".$options['all_change']."
".$options['by_change']."
".$options['all_links']."
</select>
</div>
<div class='row'> </div>
".$sections['xpath_expressions']."
<div class='row'> </div>
".$sections['xpath_expressions_for_title']."
<div class='row'> </div>
".$sections['xpath_expressions_for_link']."
";
    return $form;
}
/**
 * Reads the submitted form values (global variables) into $this->parameters,
 * then lets the parent class handle its own fields.
 *
 * The content, title and link expressions are paired by row position, and
 * get_content_from_link() reads title and link expressions at the index of the
 * content expression. Rows without a content expression are therefore dropped
 * as a whole, and a missing title or link expression is stored as '' so that
 * the three lists stay aligned.
 *
 * @return void
 */
public function set_from_form() {
    global $docwatch_datasource_monitoring_website_mode_creation_items;
    global $docwatch_datasource_monitoring_website_xpath_expressions;
    global $docwatch_datasource_monitoring_website_xpath_expressions_for_title;
    global $docwatch_datasource_monitoring_website_xpath_expressions_for_link;

    $this->parameters['mode_creation_items'] = stripslashes((string) $docwatch_datasource_monitoring_website_mode_creation_items);

    // A list may be missing from the request; treat anything that is not an array as empty.
    $contents = is_array($docwatch_datasource_monitoring_website_xpath_expressions) ? array_values($docwatch_datasource_monitoring_website_xpath_expressions) : array();
    $titles = is_array($docwatch_datasource_monitoring_website_xpath_expressions_for_title) ? array_values($docwatch_datasource_monitoring_website_xpath_expressions_for_title) : array();
    $links = is_array($docwatch_datasource_monitoring_website_xpath_expressions_for_link) ? array_values($docwatch_datasource_monitoring_website_xpath_expressions_for_link) : array();

    $this->parameters['xpath_expressions'] = array();
    $this->parameters['xpath_expressions_for_title'] = array();
    $this->parameters['xpath_expressions_for_link'] = array();
    foreach ($contents as $i=>$xpath_expression) {
        if(!$xpath_expression) {
            continue;
        }
        $this->parameters['xpath_expressions'][] = stripslashes($xpath_expression);
        $this->parameters['xpath_expressions_for_title'][] = (!empty($titles[$i]) ? stripslashes($titles[$i]) : '');
        $this->parameters['xpath_expressions_for_link'][] = (!empty($links[$i]) ? stripslashes($links[$i]) : '');
    }

    parent::set_from_form();
}
} // end of docwatch_datasource_monitoring_website