scraping images and videos with regex, similar to the way wp core does

This commit is contained in:
Paul Kirby 2018-10-27 14:28:20 -05:00
parent 65c404fa3e
commit 34620a4edb
2 changed files with 104 additions and 91 deletions

View file

@ -60,30 +60,30 @@ class LBRY_Speech
if ($all_media) {
foreach ($all_media as $media) {
//// TODO: Check if media type is accepted
$meta = get_post_meta($media->id, '_wp_attachment_metadata', true);
error_log(print_r($meta, true));
if (! get_post_meta($media->id, 'lbry_speech_uploaded')) {
$params = array(
'name' => $media->name,
'file' => $media->file,
'title' => $media->title,
'type' => $media->type
);
if (LBRY_SPEECH_CHANNEL && LBRY_SPEECH_CHANNEL_PASSWORD) {
$params['channelName'] = LBRY_SPEECH_CHANNEL;
$params['channelPassword'] = LBRY_SPEECH_CHANNEL_PASSWORD;
}
$result = $this->request('publish', $params);
error_log(print_r($result, true));
// TODO: Make sure to warn if image name is already taken on channel
if ($result->success) {
update_post_meta($media->id, 'lbry_speech_uploaded', true);
update_post_meta($media->id, 'lbry_speech_url', $result->data->serveUrl);
}
}
// $meta = get_post_meta($media->id, '_wp_attachment_metadata', true);
error_log(print_r($media, true));
// if (! get_post_meta($media->id, 'lbry_speech_uploaded')) {
// $params = array(
// 'name' => $media->name,
// 'file' => $media->file,
// 'title' => $media->title,
// 'type' => $media->type
// );
//
// if (LBRY_SPEECH_CHANNEL && LBRY_SPEECH_CHANNEL_PASSWORD) {
// $params['channelName'] = LBRY_SPEECH_CHANNEL;
// $params['channelPassword'] = LBRY_SPEECH_CHANNEL_PASSWORD;
// }
//
// $result = $this->request('publish', $params);
// error_log(print_r($result, true));
//
// // TODO: Make sure to warn if image name is already taken on channel
// if ($result->success) {
// update_post_meta($media->id, 'lbry_speech_uploaded', true);
// update_post_meta($media->id, 'lbry_speech_url', $result->data->serveUrl);
// }
// }
}
}
}
@ -95,57 +95,79 @@ class LBRY_Speech
*/
protected function find_media($post_id)
{
// TODO: Check wp_make_content_images_responsive for the canonical way to scrub images & attachments
// https://developer.wordpress.org/reference/functions/wp_make_content_images_responsive/
$all_media = array();
// Get content and put into a DOMDocument
$content = apply_filters('the_content', get_post_field('post_content', $post_id));
$content = get_post_field('post_content', $post_id);
if (!$content) {
return $all_media;
}
$DOM = new DOMDocument();
// Hide HTML5 Tag warnings
libxml_use_internal_errors(true);
$DOM->loadHTML($content);
$images = $DOM->getElementsByTagName('img');
$videos = $DOM->getElementsByTagName('video');
preg_match_all('/<img [^>]+>/', $content, $images);
// Get each image attribute
// Only MP4 videos for now
preg_match_all('/\[video.*mp4=".*".*\]/', $content, $videos);
error_log(print_r($images, true));
error_log(print_r($videos, true));
// Throw each image into a media object
foreach ($images as $image) {
// error_log(print_r(get_intermediate_image_sizes(), true));
$src = $image->getAttribute('src');
if ($this->is_local($src)) {
$all_media[] = new LBRY_Speech_Media($src);
// Look for the wp-image-{id} class first; if it is absent, resolve the ID from the src URL
if (preg_match('/wp-image-([0-9]+)/i', $image[0], $class_id)) {
$attachment_id = absint($class_id[1]);
} elseif (preg_match('/src="((?:https?:)?\/\/[^"]+)"/', $image[0], $src)) {
$attachment_id = $this->rigid_attachment_url_to_postid($src[1]);
}
if ($attachment_id) {
$all_media[] = new LBRY_Speech_Media($attachment_id, array(), true);
}
}
// Parse video tags based on WordPress output for local embeds
// Because video tag is HTML5, treat it like an XML node
// Parse video tags based on the WordPress shortcode for local embeds
foreach ($videos as $video) {
$source = $video->getElementsByTagName('source');
$src = $source[0]->attributes->getNamedItem('src')->value;
if ($this->is_local($src)) {
$all_media[] = new LBRY_Speech_Media($src);
if (preg_match('/mp4="((?:https?:)?\/\/[^"]+)"/', $video[0], $src)) {
$attachment_id = $this->rigid_attachment_url_to_postid($src[1]);
if ($attachment_id) {
$all_media[] = new LBRY_Speech_Media($attachment_id);
}
}
}
return $all_media;
}
/**
* Checks to see if a url is local to this installation
* @param string $url
* @return boolean
* Checks for image crop sizes and filters out query params
* Courtesy of this post: http://bordoni.me/get-attachment-id-by-image-url/
* @param string $url The url of the attachment you want an ID for
* @return int The found post_id
*/
private function is_local($url)
private function rigid_attachment_url_to_postid($url)
{
if (strpos($url, home_url()) !== false) {
return true;
$scrubbed_url = strtok($url, '?'); // Clean up query params first
$post_id = attachment_url_to_postid($scrubbed_url);
if (! $post_id) {
$dir = wp_upload_dir();
$path = $scrubbed_url;
if (0 === strpos($path, $dir['baseurl'] . '/')) {
$path = substr($path, strlen($dir['baseurl'] . '/'));
}
return false;
if (preg_match('/^(.*)(\-\d*x\d*)(\.\w{1,})/i', $path, $matches)) {
$url = $dir['baseurl'] . '/' . $matches[1] . $matches[3];
$post_id = attachment_url_to_postid($url);
}
}
return (int) $post_id;
}
/**

View file

@ -27,7 +27,9 @@ class LBRY_Speech_Media
public $thumbnail;
public function __construct($url, $args = array())
private $is_image = false;
public function __construct(int $attachment_id, $args = array(), bool $is_image = false)
{
// Set supplied arguments
@ -44,46 +46,35 @@ class LBRY_Speech_Media
$this->{$key} = $value;
}
// Get attachment ID, name, file, and type from the URL
$url = strtok($url, '?'); // Clean up query params first
$id = $this->rigid_attachment_url_to_postid($url);
// Flag as image if it is one
if ($is_image) {
$this->is_image = true;
}
// // Get attachment ID, name, file, and type from the URL
// $url = strtok($url, '?'); // Clean up query params first
// $id = $this->rigid_attachment_url_to_postid($url);
$meta = wp_get_attachment_metadata($attachment_id);
error_log(print_r($meta, true));
$attachment = get_post($id);
$path = get_attached_file($id);
$type = $attachment->post_mime_type;
$filename = wp_basename($path);
$this->id = $id;
// COMBAK: Probably won't need this underscore check with Daemon V3
$this->name = str_replace('_', '-', $attachment->post_name);
$this->file = new CURLFile($path, $type, $filename);
$this->type = $type;
$this->title = $attachment->post_title;
// $type = $attachment->post_mime_type;
// $filename = wp_basename($path);
//
// $this->id = $id;
// // COMBAK: Probably won't need this underscore check with Daemon V3
// $this->name = str_replace('_', '-', $attachment->post_name);
// $this->file = new CURLFile($path, $type, $filename);
// $this->type = $type;
// $this->title = $attachment->post_title;
}
/**
* Checks for image crop sizes and filters out query params
* Courtesy of this post: http://bordoni.me/get-attachment-id-by-image-url/
* @param string $url The url of the attachment you want an ID for
* @return int The found post_id
*/
private function rigid_attachment_url_to_postid($url)
public function is_image()
{
$post_id = attachment_url_to_postid($url);
if (! $post_id) {
$dir = wp_upload_dir();
$path = $url;
if (0 === strpos($path, $dir['baseurl'] . '/')) {
$path = substr($path, strlen($dir['baseurl'] . '/'));
}
if (preg_match('/^(.*)(\-\d*x\d*)(\.\w{1,})/i', $path, $matches)) {
$url = $dir['baseurl'] . '/' . $matches[1] . $matches[3];
$post_id = attachment_url_to_postid($url);
}
}
return (int) $post_id;
return $this->is_image;
}
}