scraping images and videos with regex, similar to the way wp core does

This commit is contained in:
Paul Kirby 2018-10-27 14:28:20 -05:00
parent 65c404fa3e
commit 34620a4edb
2 changed files with 104 additions and 91 deletions

View file

@ -60,30 +60,30 @@ class LBRY_Speech
if ($all_media) { if ($all_media) {
foreach ($all_media as $media) { foreach ($all_media as $media) {
//// TODO: Check if media type is accepted //// TODO: Check if media type is accepted
$meta = get_post_meta($media->id, '_wp_attachment_metadata', true); // $meta = get_post_meta($media->id, '_wp_attachment_metadata', true);
error_log(print_r($meta, true)); error_log(print_r($media, true));
if (! get_post_meta($media->id, 'lbry_speech_uploaded')) { // if (! get_post_meta($media->id, 'lbry_speech_uploaded')) {
$params = array( // $params = array(
'name' => $media->name, // 'name' => $media->name,
'file' => $media->file, // 'file' => $media->file,
'title' => $media->title, // 'title' => $media->title,
'type' => $media->type // 'type' => $media->type
); // );
//
if (LBRY_SPEECH_CHANNEL && LBRY_SPEECH_CHANNEL_PASSWORD) { // if (LBRY_SPEECH_CHANNEL && LBRY_SPEECH_CHANNEL_PASSWORD) {
$params['channelName'] = LBRY_SPEECH_CHANNEL; // $params['channelName'] = LBRY_SPEECH_CHANNEL;
$params['channelPassword'] = LBRY_SPEECH_CHANNEL_PASSWORD; // $params['channelPassword'] = LBRY_SPEECH_CHANNEL_PASSWORD;
} // }
//
$result = $this->request('publish', $params); // $result = $this->request('publish', $params);
error_log(print_r($result, true)); // error_log(print_r($result, true));
//
// TODO: Make sure to warn if image name is already taken on channel // // TODO: Make sure to warn if image name is already taken on channel
if ($result->success) { // if ($result->success) {
update_post_meta($media->id, 'lbry_speech_uploaded', true); // update_post_meta($media->id, 'lbry_speech_uploaded', true);
update_post_meta($media->id, 'lbry_speech_url', $result->data->serveUrl); // update_post_meta($media->id, 'lbry_speech_url', $result->data->serveUrl);
} // }
} // }
} }
} }
} }
@ -95,57 +95,79 @@ class LBRY_Speech
*/ */
protected function find_media($post_id) protected function find_media($post_id)
{ {
// TODO: Check wp_make_content_images_responsive for cannon way to scrub images & attachments
// https://developer.wordpress.org/reference/functions/wp_make_content_images_responsive/
$all_media = array(); $all_media = array();
// Get content and put into a DOMDocument // Get content and put into a DOMDocument
$content = apply_filters('the_content', get_post_field('post_content', $post_id)); $content = get_post_field('post_content', $post_id);
if (!$content) { if (!$content) {
return $all_media; return $all_media;
} }
$DOM = new DOMDocument();
// Hide HTML5 Tag warnings
libxml_use_internal_errors(true);
$DOM->loadHTML($content);
$images = $DOM->getElementsByTagName('img'); preg_match_all('/<img [^>]+>/', $content, $images);
$videos = $DOM->getElementsByTagName('video');
// Get each image attribute // Only MP4 videos for now
preg_match_all('/\[video.*mp4=".*".*\]/', $content, $videos);
error_log(print_r($images, true));
error_log(print_r($videos, true));
// Throw each image into a media object
foreach ($images as $image) { foreach ($images as $image) {
// error_log(print_r(get_intermediate_image_sizes(), true));
$src = $image->getAttribute('src'); // Looks for wp image class first, if not, pull id from source
if ($this->is_local($src)) { if (preg_match('/wp-image-([0-9]+)/i', $image[0], $class_id)) {
$all_media[] = new LBRY_Speech_Media($src); $attachment_id = absint($class_id[1]);
} elseif (preg_match('/src="((?:https?:)?\/\/[^"]+)"/', $image[0], $src)) {
$attachment_id = $this->rigid_attachment_url_to_postid($src[1]);
}
if ($attachment_id) {
$all_media[] = new LBRY_Speech_Media($attachment_id, array(), true);
} }
} }
// Parse video tags based on wordpress output for local embedds // Parse video tags based on wordpress shortcode for local embedds
// Because video tag is HTML5, treat it like an XML node
foreach ($videos as $video) { foreach ($videos as $video) {
$source = $video->getElementsByTagName('source'); if (preg_match('/mp4="((?:https?:)?\/\/[^"]+)"/', $video[0], $src)) {
$src = $source[0]->attributes->getNamedItem('src')->value; $attachment_id = $this->rigid_attachment_url_to_postid($src[1]);
if ($this->is_local($src)) {
$all_media[] = new LBRY_Speech_Media($src); if ($attachment_id) {
$all_media[] = new LBRY_Speech_Media($attachment_id);
}
} }
} }
return $all_media; return $all_media;
} }
/** /**
* Checks to see if a url is local to this installation * Checks for image crop sizes and filters out query params
* @param string $url * Courtesy of this post: http://bordoni.me/get-attachment-id-by-image-url/
* @return boolean * @param string $url The url of the attachment you want an ID for
* @return int The found post_id
*/ */
private function is_local($url) private function rigid_attachment_url_to_postid($url)
{ {
if (strpos($url, home_url()) !== false) { $scrubbed_url = strtok($url, '?'); // Clean up query params first
return true; $post_id = attachment_url_to_postid($scrubbed_url);
if (! $post_id) {
$dir = wp_upload_dir();
$path = $scrubbed_url;
if (0 === strpos($path, $dir['baseurl'] . '/')) {
$path = substr($path, strlen($dir['baseurl'] . '/'));
}
if (preg_match('/^(.*)(\-\d*x\d*)(\.\w{1,})/i', $path, $matches)) {
$url = $dir['baseurl'] . '/' . $matches[1] . $matches[3];
$post_id = attachment_url_to_postid($url);
}
} }
return false; return (int) $post_id;
} }
/** /**

View file

@ -27,7 +27,9 @@ class LBRY_Speech_Media
public $thumbnail; public $thumbnail;
public function __construct($url, $args = array()) private $is_image = false;
public function __construct(int $attachment_id, $args = array(), bool $is_image = false)
{ {
// Set supplied arguments // Set supplied arguments
@ -44,46 +46,35 @@ class LBRY_Speech_Media
$this->{$key} = $value; $this->{$key} = $value;
} }
// Get attachment ID, name, file, and type from the URL // Flag as image if it is one
$url = strtok($url, '?'); // Clean up query params first if ($is_image) {
$id = $this->rigid_attachment_url_to_postid($url); $this->is_image = true;
$attachment = get_post($id);
$path = get_attached_file($id);
$type = $attachment->post_mime_type;
$filename = wp_basename($path);
$this->id = $id;
// COMBAK: Probably wont need this underscore check with Daemon V3
$this->name = str_replace('_', '-', $attachment->post_name);
$this->file = new CURLFile($path, $type, $filename);
$this->type = $type;
$this->title = $attachment->post_title;
}
/**
* Checks for image crop sizes and filters out query params
* Courtesy of this post: http://bordoni.me/get-attachment-id-by-image-url/
* @param string $url The url of the attachment you want an ID for
* @return int The found post_id
*/
private function rigid_attachment_url_to_postid($url)
{
$post_id = attachment_url_to_postid($url);
if (! $post_id) {
$dir = wp_upload_dir();
$path = $url;
if (0 === strpos($path, $dir['baseurl'] . '/')) {
$path = substr($path, strlen($dir['baseurl'] . '/'));
}
if (preg_match('/^(.*)(\-\d*x\d*)(\.\w{1,})/i', $path, $matches)) {
$url = $dir['baseurl'] . '/' . $matches[1] . $matches[3];
$post_id = attachment_url_to_postid($url);
}
} }
return (int) $post_id;
// // Get attachment ID, name, file, and type from the URL
// $url = strtok($url, '?'); // Clean up query params first
// $id = $this->rigid_attachment_url_to_postid($url);
$meta = wp_get_attachment_metadata($attachment_id);
error_log(print_r($meta, true));
$attachment = get_post($id);
$path = get_attached_file($id);
// $type = $attachment->post_mime_type;
// $filename = wp_basename($path);
//
// $this->id = $id;
// // COMBAK: Probably wont need this underscore check with Daemon V3
// $this->name = str_replace('_', '-', $attachment->post_name);
// $this->file = new CURLFile($path, $type, $filename);
// $this->type = $type;
// $this->title = $attachment->post_title;
}
public function is_image()
{
return $this->is_image;
} }
} }