FREE THOUGHT · FREE SOFTWARE · FREE WORLD

Download Multiple URLs FAST with cURL

Benchmark: 10,000 GET requests for 1,000 files from 500 different hosts (the top 500 Alexa sites) completed in 1 minute 44 seconds, saving 4,500 files with 500 file handles open at one time.

<?php

// Report every error level: 2147483647 (2^31 - 1) sets all 31 low bits of the error_reporting mask.
ini_set( 'error_reporting', '2147483647' );

/**
 * Fetch a list of URLs concurrently with the curl_multi API, one batch at a time.
 *
 * Each batch gets its own multi handle. When saving is enabled every response body is
 * streamed to "<savedir><index>_<md5(url)>.file" and the filename/url pair is appended to
 * the save log; files belonging to failed transfers are deleted again. When saving is
 * disabled the bodies are buffered by cURL and discarded. Progress and errors are reported
 * through the project's ISCLOG class, and the elapsed time is echoed at the end.
 *
 * Recognised keys of $args (anything omitted falls back to the default shown below):
 *  - urls             array  URLs to fetch; the array keys are used in the saved filenames.
 *  - batch            int    Number of URLs fetched concurrently (values below 1 are treated as 1).
 *  - max_time         int    Overall time budget in seconds, checked before each batch starts.
 *  - max_request_time int    CURLOPT_TIMEOUT for each request, in seconds.
 *  - max_connect_time int    CURLOPT_CONNECTTIMEOUT for each request, in seconds (0 = wait indefinitely).
 *  - max_redirs       int    Redirects allowed; forced to 0 when open_basedir is set.
 *  - user_agent       string User-Agent header value.
 *  - headers          array  Extra HTTP request headers.
 *  - logfile          string File receiving cURL's verbose output when debug is on.
 *  - debug            bool   Enable verbose cURL output to logfile.
 *  - save             bool   Save response bodies into savedir.
 *  - savedir          string Existing directory to save into.
 *  - savelog          string File receiving the "filename url" map; defaults to a dated file in savedir.
 *
 * Terminates the script with die() when the time budget is already exceeded at the start of a batch.
 *
 * @param array $args Option overrides, see above.
 * @return void
 */
function askapache_curl_multi( $args = array() ) {
	ISCLOG::ti();

	// save the start time
	$started = time();

	$defaults = array(
		'urls'             => array(), // array containing all the urls to fetch
		'batch'            => 1000, // fetch this many urls concurrently (don't do more than 200 if using savedir)
		'max_time'         => ( 60 * 6 ), // maximum time allowed to complete all requests: 6 minutes
		'max_request_time' => 10, // maximum time an individual request may last before being closed: 10 seconds
		'max_connect_time' => 0, // The number of seconds to wait while trying to connect. Use 0 to wait indefinitely.
		'max_redirs'       => 2, // Number of redirects allowed
		'user_agent'       => 'AskApache;', // user-agent
		'headers'          => array( 'Accept-Encoding: none' ), // array of http headers, such as array( 'Cookie: thiscookie', 'Accept-Encoding: none' )
		'logfile'          => '',
		'debug'            => false,
		'save'             => false,
		'savedir'          => '',
		'savelog'          => '',
	);
	$args = array_merge( $defaults, (array) $args );

	$urls             = (array) $args['urls'];
	$batch            = max( 1, (int) $args['batch'] ); // array_chunk() rejects sizes below 1
	$max_time         = $args['max_time'];
	$max_request_time = $args['max_request_time'];
	$max_connect_time = $args['max_connect_time'];
	$max_redirs       = $args['max_redirs'];
	$user_agent       = $args['user_agent'];
	$headers          = $args['headers'];
	$logfile          = $args['logfile'];
	$debug            = $args['debug'];
	$save             = $args['save'];
	$savedir          = $args['savedir'];
	$savelog          = $args['savelog'];

	$fplog = $fpsavelog = null;

	// fopen() rejects an empty path, so verbose logging is only set up when a logfile was given
	if ( $debug && '' !== (string) $logfile ) $fplog = fopen( $logfile, 'a' );

	// setup saving; any problem with the target directory or the save log turns saving off
	if ( $save ) {
		if ( empty( $savedir ) ) {
			$save = false;
		} else {
			$savedir = rtrim( $savedir, '/' ) . '/';

			if ( ! is_dir( $savedir ) ) {
				$save = false;
			} else {
				// set savelog containing the mapping of urls to files
				if ( empty( $savelog ) ) $savelog = $savedir . '__' . date( 'Y-m-d' ) . '_urls-to-files-map.log';

				// open save log
				$fpsavelog = fopen( $savelog, 'a' );

				if ( ! is_resource( $fpsavelog ) ) $save = false;
			}
		}
	}

	// can't follow redirects when open_basedir is in effect
	if ( strlen( (string) ini_get( 'open_basedir' ) ) > 0 ) $max_redirs = 0;

	$verbose    = ( $debug && is_resource( $fplog ) );
	$total_urls = count( $urls );

	foreach ( array_chunk( $urls, $batch, true ) as $the_urls ) {
		$con = $fps = $files = $chinfo = array();
		$url_count = count( $the_urls );
		$runtime   = ( time() - $started );

		ISCLOG::ti( "BATCH: {$batch} total_urls: {$total_urls}" );

		if ( $runtime > $max_time ) {
			ISCLOG::ti( ' !!' . " ({$runtime} > {$max_time}) runtime: {$runtime} batch: {$batch} url_count: {$url_count}" );
			die( 'CRITICAL!! RUNTIME > MAX_TIME' );
		}

		$mh = curl_multi_init(); // create a 'multi handle'

		curl_multi_setopt( $mh, CURLMOPT_MAXCONNECTS, 20 ); // maximum amount of simultaneously open connections that libcurl may cache
		// NOTE(review): every request below forces HTTP/1.0, so pipelining presumably never applies,
		// and newer PHP/libcurl builds may warn about the value 1 - confirm and consider removing.
		curl_multi_setopt( $mh, CURLMOPT_PIPELINING, 1 );

		foreach ( $the_urls as $i => $url ) {
			$ch        = curl_init( $url );
			$con[ $i ] = $ch;

			// curl_init() yields a resource on older PHP and a CurlHandle object on PHP 8,
			// so failure is detected by comparing against false rather than with is_resource()
			if ( false === $ch ) {
				ISCLOG::ti( "ERROR!! SKIPPED: {$url}" );
				continue;
			}

			// buffer the body inside cURL unless it is going to be streamed to a file
			curl_setopt( $ch, CURLOPT_RETURNTRANSFER, $save ? 0 : 1 );

			if ( $save ) {
				$filename    = $i . '_' . md5( $url ) . '.file';
				$files[ $i ] = $savedir . $filename;
				$fps[ $i ]   = fopen( $files[ $i ], 'wb' );

				// skip error opening handler to file; the handle is dropped here so the
				// result loop below never touches a transfer that was not started
				if ( ! is_resource( $fps[ $i ] ) ) {
					ISCLOG::ti( 'ERROR!! SAVING FILE TO: ' . $savedir . $filename . " !! SKIPPED: {$url}" );
					curl_close( $ch );
					unset( $con[ $i ], $fps[ $i ], $files[ $i ] );
					continue;
				}

				// save the filename mapping
				fwrite( $fpsavelog, $filename . ' ' . trim( $url ) . "\n" );

				// have curl save the file
				curl_setopt( $ch, CURLOPT_FILE, $fps[ $i ] );
			}

			// The number of seconds to wait while trying to connect. Use 0 to wait indefinitely.
			curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $max_connect_time );

			// maximum time in seconds that you allow the libcurl transfer operation to take
			curl_setopt( $ch, CURLOPT_TIMEOUT, $max_request_time );

			// allow following redirects
			if ( $max_redirs > 0 ) curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, 1 );

			// Number of redirects allowed
			curl_setopt( $ch, CURLOPT_MAXREDIRS, $max_redirs );

			// return the page regardless of the HTTP status code (no failure on codes >= 400)
			curl_setopt( $ch, CURLOPT_FAILONERROR, 0 );

			// verbose output only in debug mode, redirected from STDERR into the logfile
			curl_setopt( $ch, CURLOPT_VERBOSE, $verbose ? 1 : 0 );
			if ( $verbose ) curl_setopt( $ch, CURLOPT_STDERR, $fplog );

			// do not include the response headers in the body output
			curl_setopt( $ch, CURLOPT_HEADER, 0 );

			curl_setopt( $ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0 );

			// The User-Agent header
			if ( ! empty( $user_agent ) ) curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );

			// Additional headers to send
			if ( is_array( $headers ) && count( $headers ) > 0 ) curl_setopt( $ch, CURLOPT_HTTPHEADER, $headers );

			curl_multi_add_handle( $mh, $ch ); // add the easy handle to the multi handle 'multi stack' $mh
		}

		// Drive all transfers to completion, sleeping in curl_multi_select() between
		// rounds instead of spinning on curl_multi_exec().
		$still_running = 0;
		do {
			$status = curl_multi_exec( $mh, $still_running );

			// older libcurl asks to be called again immediately
			if ( CURLM_CALL_MULTI_PERFORM === $status ) continue;

			if ( CURLM_OK !== $status ) {
				ISCLOG::ti( "ERROR!! curl_multi_exec status: {$status}" );
				break;
			}

			// -1 means select() could not wait on anything; back off briefly to avoid a busy loop
			if ( $still_running > 0 && -1 === curl_multi_select( $mh, 1.0 ) ) usleep( 50000 );
		} while ( $still_running > 0 );

		// Collect the per-transfer result codes: with the multi interface a handle's
		// error state is only available after its completion message has been read.
		$results = array();
		while ( false !== ( $msg = curl_multi_info_read( $mh ) ) ) {
			if ( CURLMSG_DONE !== $msg['msg'] ) continue;

			$key = array_search( $msg['handle'], $con, true );
			if ( false !== $key ) $results[ $key ] = $msg['result'];
		}

		foreach ( $the_urls as $i => $url ) {
			// skipped because its output file could not be opened; already logged above
			if ( ! array_key_exists( $i, $con ) ) continue;

			$ch = $con[ $i ];

			// curl_init() failed for this url: there is no handle to query for error details
			if ( false === $ch ) {
				ISCLOG::epx( array( 'url' => $url, 'chinfo' => $chinfo, 'curl_errno' => null, 'curl_error' => null ) );
				continue;
			}

			$rcount = curl_getinfo( $ch, CURLINFO_REDIRECT_COUNT );
			$size   = curl_getinfo( $ch, CURLINFO_SIZE_DOWNLOAD );
			$errno  = isset( $results[ $i ] ) ? $results[ $i ] : curl_errno( $ch );

			// a transfer failed when it redirected too often, reported a cURL error or delivered no data
			if ( $rcount > $max_redirs || $errno || $size <= 0 ) {
				$chinfo = curl_getinfo( $ch );
				ISCLOG::l( curl_error( $ch ) );

				// close and delete the output file of the failed transfer
				if ( $save ) {
					if ( isset( $fps[ $i ] ) && is_resource( $fps[ $i ] ) ) fclose( $fps[ $i ] );
					if ( isset( $files[ $i ] ) && is_file( $files[ $i ] ) ) unlink( $files[ $i ] );
				}
			}

			curl_multi_remove_handle( $mh, $ch ); // remove handle from 'multi stack' $mh
			curl_close( $ch ); // close the individual handle
		}

		curl_multi_close( $mh ); // close the multi stack

		// close the save file handlers that are still open (successful transfers)
		if ( $save ) {
			foreach ( $fps as $fp ) {
				if ( is_resource( $fp ) ) fclose( $fp );
			}
		}
	} // end foreach ( array_chunk( $urls, $batch, true ) as $the_urls )

	if ( is_resource( $fplog ) ) fclose( $fplog ); // close the logfile
	if ( is_resource( $fpsavelog ) ) fclose( $fpsavelog ); // close the save log

	echo "\nCOMPLETED IN: " . ( time() - $started ) . " SECONDS\n";
	ISCLOG::ti();
	ISCLOG::pls( $savedir );
}

// Define the project root once: DOCUMENT_ROOT with the '/htdocs' segment removed.
if ( ! defined( 'ISC_ROOT' ) ) {
	define( 'ISC_ROOT', str_replace( '/htdocs', '', $_SERVER['DOCUMENT_ROOT'] ) );
}

// ISCLOG, used by askapache_curl_multi(), is loaded from here.
require_once ISC_ROOT . '/inc/isclog.inc.php';

// Read the URL list one line at a time. Each line is trimmed so CRLF line endings do not
// leave a stray "\r" on the URL, and blank lines are dropped so an empty or unreadable
// file produces no requests instead of a single request for an empty URL.
$urls  = array();
$lines = file( '/web/askapach/sites/askapache.com/urls.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
if ( false !== $lines ) {
	foreach ( $lines as $line ) {
		$line = trim( $line );
		if ( '' !== $line ) $urls[] = $line;
	}
}

header( 'Content-Type: text/plain' );

// Buffer everything the run prints and emit it in one piece once the run has finished.
ob_start();
askapache_curl_multi( array(
	'savedir' => '/web/askapach/sites/askapache.com/savedir/',
	'save'    => false,
	'urls'    => $urls,
	'logfile' => '/web/askapach/sites/askapache.com/logs/multi-curl.log',
	'debug'   => false,
) );

echo ob_get_clean();

?>

PHP cURL download PHP

 

 

Comments