scraping images and videos with regex, similar to the way wp core does
This commit is contained in:
parent
65c404fa3e
commit
34620a4edb
2 changed files with 104 additions and 91 deletions
|
@ -60,30 +60,30 @@ class LBRY_Speech
|
||||||
if ($all_media) {
|
if ($all_media) {
|
||||||
foreach ($all_media as $media) {
|
foreach ($all_media as $media) {
|
||||||
//// TODO: Check if media type is accepted
|
//// TODO: Check if media type is accepted
|
||||||
$meta = get_post_meta($media->id, '_wp_attachment_metadata', true);
|
// $meta = get_post_meta($media->id, '_wp_attachment_metadata', true);
|
||||||
error_log(print_r($meta, true));
|
error_log(print_r($media, true));
|
||||||
if (! get_post_meta($media->id, 'lbry_speech_uploaded')) {
|
// if (! get_post_meta($media->id, 'lbry_speech_uploaded')) {
|
||||||
$params = array(
|
// $params = array(
|
||||||
'name' => $media->name,
|
// 'name' => $media->name,
|
||||||
'file' => $media->file,
|
// 'file' => $media->file,
|
||||||
'title' => $media->title,
|
// 'title' => $media->title,
|
||||||
'type' => $media->type
|
// 'type' => $media->type
|
||||||
);
|
// );
|
||||||
|
//
|
||||||
if (LBRY_SPEECH_CHANNEL && LBRY_SPEECH_CHANNEL_PASSWORD) {
|
// if (LBRY_SPEECH_CHANNEL && LBRY_SPEECH_CHANNEL_PASSWORD) {
|
||||||
$params['channelName'] = LBRY_SPEECH_CHANNEL;
|
// $params['channelName'] = LBRY_SPEECH_CHANNEL;
|
||||||
$params['channelPassword'] = LBRY_SPEECH_CHANNEL_PASSWORD;
|
// $params['channelPassword'] = LBRY_SPEECH_CHANNEL_PASSWORD;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
$result = $this->request('publish', $params);
|
// $result = $this->request('publish', $params);
|
||||||
error_log(print_r($result, true));
|
// error_log(print_r($result, true));
|
||||||
|
//
|
||||||
// TODO: Make sure to warn if image name is already taken on channel
|
// // TODO: Make sure to warn if image name is already taken on channel
|
||||||
if ($result->success) {
|
// if ($result->success) {
|
||||||
update_post_meta($media->id, 'lbry_speech_uploaded', true);
|
// update_post_meta($media->id, 'lbry_speech_uploaded', true);
|
||||||
update_post_meta($media->id, 'lbry_speech_url', $result->data->serveUrl);
|
// update_post_meta($media->id, 'lbry_speech_url', $result->data->serveUrl);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -95,57 +95,79 @@ class LBRY_Speech
|
||||||
*/
|
*/
|
||||||
protected function find_media($post_id)
|
protected function find_media($post_id)
|
||||||
{
|
{
|
||||||
|
// TODO: Check wp_make_content_images_responsive for the canonical way to scrub images & attachments
|
||||||
|
// https://developer.wordpress.org/reference/functions/wp_make_content_images_responsive/
|
||||||
$all_media = array();
|
$all_media = array();
|
||||||
|
|
||||||
// Get content and put into a DOMDocument
|
// Get the raw post content (scanned with regex below)
|
||||||
$content = apply_filters('the_content', get_post_field('post_content', $post_id));
|
$content = get_post_field('post_content', $post_id);
|
||||||
if (!$content) {
|
if (!$content) {
|
||||||
return $all_media;
|
return $all_media;
|
||||||
}
|
}
|
||||||
$DOM = new DOMDocument();
|
|
||||||
// Hide HTML5 Tag warnings
|
|
||||||
libxml_use_internal_errors(true);
|
|
||||||
$DOM->loadHTML($content);
|
|
||||||
|
|
||||||
$images = $DOM->getElementsByTagName('img');
|
preg_match_all('/<img [^>]+>/', $content, $images);
|
||||||
$videos = $DOM->getElementsByTagName('video');
|
|
||||||
|
|
||||||
// Get each image attribute
|
// Only MP4 videos for now
|
||||||
|
preg_match_all('/\[video.*mp4=".*".*\]/', $content, $videos);
|
||||||
|
|
||||||
|
error_log(print_r($images, true));
|
||||||
|
error_log(print_r($videos, true));
|
||||||
|
|
||||||
|
// Throw each image into a media object
|
||||||
foreach ($images as $image) {
|
foreach ($images as $image) {
|
||||||
// error_log(print_r(get_intermediate_image_sizes(), true));
|
|
||||||
$src = $image->getAttribute('src');
|
// Look for the wp-image-{id} class first; if it is absent, resolve the attachment ID from the src URL
|
||||||
if ($this->is_local($src)) {
|
if (preg_match('/wp-image-([0-9]+)/i', $image[0], $class_id)) {
|
||||||
$all_media[] = new LBRY_Speech_Media($src);
|
$attachment_id = absint($class_id[1]);
|
||||||
|
} elseif (preg_match('/src="((?:https?:)?\/\/[^"]+)"/', $image[0], $src)) {
|
||||||
|
$attachment_id = $this->rigid_attachment_url_to_postid($src[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($attachment_id) {
|
||||||
|
$all_media[] = new LBRY_Speech_Media($attachment_id, array(), true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse video tags based on wordpress output for local embedds
|
// Parse video tags based on the WordPress shortcode for local embeds
|
||||||
// Because video tag is HTML5, treat it like an XML node
|
|
||||||
foreach ($videos as $video) {
|
foreach ($videos as $video) {
|
||||||
$source = $video->getElementsByTagName('source');
|
if (preg_match('/mp4="((?:https?:)?\/\/[^"]+)"/', $video[0], $src)) {
|
||||||
$src = $source[0]->attributes->getNamedItem('src')->value;
|
$attachment_id = $this->rigid_attachment_url_to_postid($src[1]);
|
||||||
if ($this->is_local($src)) {
|
|
||||||
$all_media[] = new LBRY_Speech_Media($src);
|
if ($attachment_id) {
|
||||||
|
$all_media[] = new LBRY_Speech_Media($attachment_id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return $all_media;
|
return $all_media;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks to see if a url is local to this installation
|
* Checks for image crop sizes and filters out query params
|
||||||
* @param string $url
|
* Courtesy of this post: http://bordoni.me/get-attachment-id-by-image-url/
|
||||||
* @return boolean
|
* @param string $url The url of the attachment you want an ID for
|
||||||
|
* @return int The found post_id
|
||||||
*/
|
*/
|
||||||
private function is_local($url)
|
private function rigid_attachment_url_to_postid($url)
|
||||||
{
|
{
|
||||||
if (strpos($url, home_url()) !== false) {
|
$scrubbed_url = strtok($url, '?'); // Clean up query params first
|
||||||
return true;
|
$post_id = attachment_url_to_postid($scrubbed_url);
|
||||||
|
|
||||||
|
if (! $post_id) {
|
||||||
|
$dir = wp_upload_dir();
|
||||||
|
$path = $scrubbed_url;
|
||||||
|
|
||||||
|
if (0 === strpos($path, $dir['baseurl'] . '/')) {
|
||||||
|
$path = substr($path, strlen($dir['baseurl'] . '/'));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (preg_match('/^(.*)(\-\d*x\d*)(\.\w{1,})/i', $path, $matches)) {
|
||||||
|
$url = $dir['baseurl'] . '/' . $matches[1] . $matches[3];
|
||||||
|
$post_id = attachment_url_to_postid($url);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return (int) $post_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -27,7 +27,9 @@ class LBRY_Speech_Media
|
||||||
|
|
||||||
public $thumbnail;
|
public $thumbnail;
|
||||||
|
|
||||||
public function __construct($url, $args = array())
|
private $is_image = false;
|
||||||
|
|
||||||
|
public function __construct(int $attachment_id, $args = array(), bool $is_image = false)
|
||||||
{
|
{
|
||||||
|
|
||||||
// Set supplied arguments
|
// Set supplied arguments
|
||||||
|
@ -44,46 +46,35 @@ class LBRY_Speech_Media
|
||||||
$this->{$key} = $value;
|
$this->{$key} = $value;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get attachment ID, name, file, and type from the URL
|
// Flag as image if it is one
|
||||||
$url = strtok($url, '?'); // Clean up query params first
|
if ($is_image) {
|
||||||
$id = $this->rigid_attachment_url_to_postid($url);
|
$this->is_image = true;
|
||||||
$attachment = get_post($id);
|
|
||||||
$path = get_attached_file($id);
|
|
||||||
$type = $attachment->post_mime_type;
|
|
||||||
$filename = wp_basename($path);
|
|
||||||
|
|
||||||
$this->id = $id;
|
|
||||||
// COMBAK: Probably wont need this underscore check with Daemon V3
|
|
||||||
$this->name = str_replace('_', '-', $attachment->post_name);
|
|
||||||
$this->file = new CURLFile($path, $type, $filename);
|
|
||||||
$this->type = $type;
|
|
||||||
$this->title = $attachment->post_title;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks for image crop sizes and filters out query params
|
|
||||||
* Courtesy of this post: http://bordoni.me/get-attachment-id-by-image-url/
|
|
||||||
* @param string $url The url of the attachment you want an ID for
|
|
||||||
* @return int The found post_id
|
|
||||||
*/
|
|
||||||
private function rigid_attachment_url_to_postid($url)
|
|
||||||
{
|
|
||||||
$post_id = attachment_url_to_postid($url);
|
|
||||||
|
|
||||||
if (! $post_id) {
|
|
||||||
$dir = wp_upload_dir();
|
|
||||||
$path = $url;
|
|
||||||
|
|
||||||
if (0 === strpos($path, $dir['baseurl'] . '/')) {
|
|
||||||
$path = substr($path, strlen($dir['baseurl'] . '/'));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (preg_match('/^(.*)(\-\d*x\d*)(\.\w{1,})/i', $path, $matches)) {
|
|
||||||
$url = $dir['baseurl'] . '/' . $matches[1] . $matches[3];
|
|
||||||
$post_id = attachment_url_to_postid($url);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return (int) $post_id;
|
|
||||||
|
|
||||||
|
// // Get attachment ID, name, file, and type from the URL
|
||||||
|
// $url = strtok($url, '?'); // Clean up query params first
|
||||||
|
// $id = $this->rigid_attachment_url_to_postid($url);
|
||||||
|
$meta = wp_get_attachment_metadata($attachment_id);
|
||||||
|
error_log(print_r($meta, true));
|
||||||
|
$attachment = get_post($id);
|
||||||
|
$path = get_attached_file($id);
|
||||||
|
// $type = $attachment->post_mime_type;
|
||||||
|
// $filename = wp_basename($path);
|
||||||
|
//
|
||||||
|
// $this->id = $id;
|
||||||
|
// // COMBAK: Probably wont need this underscore check with Daemon V3
|
||||||
|
// $this->name = str_replace('_', '-', $attachment->post_name);
|
||||||
|
// $this->file = new CURLFile($path, $type, $filename);
|
||||||
|
// $this->type = $type;
|
||||||
|
// $this->title = $attachment->post_title;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public function is_image()
|
||||||
|
{
|
||||||
|
return $this->is_image;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue