// CkSpider.h: interface for the CkSpider class.
//
//////////////////////////////////////////////////////////////////////

// This header is generated for Chilkat 11.3.0

#define _CkVersion 11.3.0

#ifndef _CkSpider_H
#define _CkSpider_H

#include "chilkatDefs.h"

#include "CkString.h"
#include "CkClassWithCallbacks.h"

class CkTask;
class CkBaseProgress;


#if !defined(__sun__) && !defined(__sun)
#pragma pack (push, 8)
#endif

#undef Copy

// CLASS: CkSpider
// Declaration-only interface for the Chilkat website spider/crawler.
class CK_VISIBLE_PUBLIC CkSpider : public CkClassWithCallbacks
{
private:

	// Don't allow assignment or copying these objects.
	CkSpider(const CkSpider &);
	CkSpider &operator=(const CkSpider &);

public:
	CkSpider(void);
	virtual ~CkSpider(void);

	static CkSpider *createNew(void);
	static CkSpider *createNew2(int progLang);
	void CK_VISIBLE_PRIVATE inject(void *impl);

	// May be called when finished with the object to free/dispose of any
	// internal resources held by the object.
	void dispose(void);

	CkBaseProgress *get_EventCallbackObject(void) const;
	void put_EventCallbackObject(CkBaseProgress *progress);

	// BEGIN PUBLIC INTERFACE

	// ----------------------
	// Properties
	// ----------------------

	// When set to true, causes the currently running method to abort. Methods that
	// always finish quickly (i.e. have no lengthy file operations or network
	// communications) are not affected. If no method is running, then this property is
	// automatically reset to false when the next method is called. When the abort
	// occurs, this property is reset to false. Both synchronous and asynchronous
	// method calls can be aborted. (A synchronous method call could be aborted by
	// setting this property from a separate thread.)
	bool get_AbortCurrent(void);
	void put_AbortCurrent(bool newVal);

	// If set to true, the spider will avoid all HTTPS URLs. The default is
	// false.
	bool get_AvoidHttps(void);
	void put_AvoidHttps(bool newVal);

	// Specifies a cache directory to use for spidering. If either of the
	// FetchFromCache or UpdateCache properties are true, this is the location of the
	// cache to be used. Note: the Internet Explorer, Netscape, and FireFox caches are
	// completely separate from the Chilkat Spider cache directory. You should specify
	// a new and empty directory.
	void get_CacheDir(CkString &str);
	const char *cacheDir(void);
	void put_CacheDir(const char *newVal);

	// If true, then the query portion of all URLs is automatically
	// removed when adding to the unspidered list. The default value is false.
	bool get_ChopAtQuery(void);
	void put_ChopAtQuery(bool newVal);

	// The maximum number of seconds to wait while connecting to an HTTP server.
	int get_ConnectTimeout(void);
	void put_ConnectTimeout(int newVal);

	// The domain name that is being spidered. This is the domain previously set in the
	// Initialize method. (Read-only.)
	void get_Domain(CkString &str);
	const char *domain(void);

	// If true, then pages are fetched from cache when possible. If false, the
	// cache is ignored. The default value is true. Regardless, if no CacheDir is set then
	// the cache is not used.
	bool get_FetchFromCache(void);
	void put_FetchFromCache(bool newVal);

	// If the last URL crawled was redirected (as indicated by the WasRedirected
	// property), this property will contain the final redirect URL.
	void get_FinalRedirectUrl(CkString &str);
	const char *finalRedirectUrl(void);

	// The interval in milliseconds between each AbortCheck event callback, which
	// enables an application to abort certain method calls before they complete. By
	// default, HeartbeatMs is set to 0, meaning no AbortCheck event callbacks will
	// trigger.
	//
	// References:
	// 1: #AbortCheck
	int get_HeartbeatMs(void);
	void put_HeartbeatMs(int newVal);

	// true if the last page spidered was fetched from the cache; otherwise
	// false.
	bool get_LastFromCache(void);

	// The HTML text of the last page fetched by the spider.
	void get_LastHtml(CkString &str);
	const char *lastHtml(void);

	// The HTML META description from the last page fetched by the spider.
	void get_LastHtmlDescription(CkString &str);
	const char *lastHtmlDescription(void);

	// The HTML META keywords from the last page fetched by the spider.
	void get_LastHtmlKeywords(CkString &str);
	const char *lastHtmlKeywords(void);

	// The HTML title from the last page fetched by the spider.
	void get_LastHtmlTitle(CkString &str);
	const char *lastHtmlTitle(void);

	// The last modification date/time (RFC822 format) from the last page fetched by
	// the spider.
	void get_LastModDateStr(CkString &str);
	const char *lastModDateStr(void);

	// The URL of the last page spidered.
	void get_LastUrl(CkString &str);
	const char *lastUrl(void);

	// The maximum HTTP response size allowed. The spider will automatically fail any
	// pages larger than this size. The default value is 250,000 bytes.
	int get_MaxResponseSize(void);
	void put_MaxResponseSize(int newVal);

	// The maximum URL length allowed. URLs longer than this are not added to the
	// unspidered list. The default value is 200.
	int get_MaxUrlLen(void);
	void put_MaxUrlLen(int newVal);

	// The number of avoid patterns previously set by calling AddAvoidPattern.
	int get_NumAvoidPatterns(void);

	// The number of URLs in the object's failed URL list.
	int get_NumFailed(void);

	// The number of URLs in the object's outbound links URL list.
	int get_NumOutboundLinks(void);

	// The number of URLs in the object's already-spidered URL list.
	int get_NumSpidered(void);

	// The number of URLs in the object's unspidered URL list.
	int get_NumUnspidered(void);

	// If true, then use IPv6 over IPv4 when both are supported for a particular
	// domain. The default value of this property is false, which will choose IPv4
	// over IPv6.
	bool get_PreferIpv6(void);
	void put_PreferIpv6(bool newVal);

	// The domain name of a proxy host if an HTTP proxy is used.
	void get_ProxyDomain(CkString &str);
	const char *proxyDomain(void);
	void put_ProxyDomain(const char *newVal);

	// If an HTTP proxy is used and it requires authentication, this property specifies
	// the HTTP proxy login.
	void get_ProxyLogin(CkString &str);
	const char *proxyLogin(void);
	void put_ProxyLogin(const char *newVal);

	// If an HTTP proxy is used and it requires authentication, this property specifies
	// the HTTP proxy password.
	void get_ProxyPassword(CkString &str);
	const char *proxyPassword(void);
	void put_ProxyPassword(const char *newVal);

	// The port number of a proxy server if an HTTP proxy is used.
	int get_ProxyPort(void);
	void put_ProxyPort(int newVal);

	// The maximum number of seconds to wait when reading from an HTTP server.
	int get_ReadTimeout(void);
	void put_ReadTimeout(int newVal);

	// If true, then pages are saved to the cache. If false, the cache is ignored.
	// The default value is true. Regardless, if no CacheDir is set then the cache is not
	// used.
	bool get_UpdateCache(void);
	void put_UpdateCache(bool newVal);

	// The value of the HTTP user-agent header field to be sent with HTTP requests.
	// This can be set to any desired value, but be aware that some websites may reject
	// requests from unknown user agents.
	void get_UserAgent(CkString &str);
	const char *userAgent(void);
	void put_UserAgent(const char *newVal);

	// Indicates whether the last URL crawled was redirected. (true = yes, false =
	// no)
	bool get_WasRedirected(void);

	// The wind-down phase begins when this number of URLs has been spidered. When in
	// the wind-down phase, no new URLs are added to the unspidered list. The default
	// value is 0 which means that there is NO wind-down phase.
	int get_WindDownCount(void);
	void put_WindDownCount(int newVal);


	// ----------------------
	// Methods
	// ----------------------

	// Adds a wildcarded pattern to prevent collecting matching outbound link URLs. For
	// example, if *google* is added, then any outbound links containing the word
	// google will be ignored. The * character matches zero or more of any character.
	void AddAvoidOutboundLinkPattern(const char *pattern);

	// Adds a wildcarded pattern to prevent spidering matching URLs. For example, if
	// *register* is added, then any url containing the word register is not spidered.
	// The * character matches zero or more of any character.
	void AddAvoidPattern(const char *pattern);

	// Adds a wildcarded pattern to limit spidering to only URLs that match the
	// pattern. For example, if */products/* is added, then only URLs containing
	// /products/ are spidered. This is helpful for only spidering a portion of a
	// website. The * character matches zero or more of any character.
	void AddMustMatchPattern(const char *pattern);

	// To begin spidering you must call this method one or more times to provide
	// starting points. It adds a single URL to the object's internal queue of URLs to
	// be spidered.
	void AddUnspidered(const char *url);

	// Canonicalizes a URL by doing the following:
	// * Drops username/password if present.
	// * Drops fragment if present.
	// * Converts domain to lowercase.
	// * Removes port 80 or 443
	// * Remove default.asp, index.html, index.htm, default.html, index.htm,
	// default.htm, index.php, index.asp, default.php, .cfm, .aspx, ,php3, .pl, .cgi,
	// .txt, .shtml, .phtml
	// * Remove www. from the domain if present.
	bool CanonicalizeUrl(const char *url, CkString &outStr);
	const char *canonicalizeUrl(const char *url);

	// Clears the object's internal list of URLs that could not be downloaded.
	void ClearFailedUrls(void);

	// Clears the object's internal list of outbound URLs that will automatically
	// accumulate while spidering.
	void ClearOutboundLinks(void);

	// Clears the object's internal list of already-spidered URLs that will
	// automatically accumulate while spidering.
	void ClearSpideredUrls(void);

	// Crawls the next URL in the internal list of unspidered URLs. The URL is moved
	// from the unspidered list to the spidered list. Any new links within the same
	// domain and not yet spidered are added to the unspidered list. (providing that
	// they do not match avoid patterns, etc.) Any new outbound links are added to the
	// outbound URL list. If successful, the HTML of the downloaded page is available
	// in the LastHtml property. If there are no more URLs left unspidered, the method
	// returns false. Information about the URL crawled is available in the
	// properties LastUrl, LastFromCache, and LastModDate.
	bool CrawlNext(void);

	// Asynchronous version of CrawlNext. Returns NULL on failure; the caller is
	// responsible for deleting the returned CkTask object.
	CkTask *CrawlNextAsync(void);

	// Returns the contents of the robots.txt file from the domain being crawled. This
	// spider object will not crawl URLs excluded by robots.txt. If you believe the
	// spider is not behaving correctly, please notify us at support@chilkatsoft.com
	// and provide information detailing a case that allows us to reproduce the
	// problem.
	bool FetchRobotsText(CkString &outStr);
	const char *fetchRobotsText(void);

	// Asynchronous version of FetchRobotsText. Returns NULL on failure; the caller is
	// responsible for deleting the returned CkTask object.
	CkTask *FetchRobotsTextAsync(void);

	// Returns the Nth avoid pattern previously added by calling AddAvoidPattern.
	// Indexing begins at 0.
	bool GetAvoidPattern(int index, CkString &outStr);
	const char *getAvoidPattern(int index);
	const char *avoidPattern(int index);

	// Returns the second-level + top-level domain of the domain. For example, if domain is
	// xyz.example.com , this returns example.com . For some domains, such as
	// xyz.example.co.uk , the top 3 levels are returned, such as example.co.uk .
	bool GetBaseDomain(const char *domain, CkString &outStr);
	const char *getBaseDomain(const char *domain);
	const char *baseDomain(const char *domain);

	// Returns the Nth URL in the failed URL list. Indexing begins at 0.
	bool GetFailedUrl(int index, CkString &outStr);
	const char *getFailedUrl(int index);
	const char *failedUrl(int index);

	// Returns the Nth URL in the outbound link URL list. Indexing begins at 0.
	bool GetOutboundLink(int index, CkString &outStr);
	const char *getOutboundLink(int index);
	const char *outboundLink(int index);

	// Returns the Nth URL in the already-spidered URL list. Indexing begins at 0.
	bool GetSpideredUrl(int index, CkString &outStr);
	const char *getSpideredUrl(int index);
	const char *spideredUrl(int index);

	// Returns the Nth URL in the unspidered URL list. Indexing begins at 0.
	bool GetUnspideredUrl(int index, CkString &outStr);
	const char *getUnspideredUrl(int index);
	const char *unspideredUrl(int index);

	// Returns the domain name part of a URL. For example, if the URL is
	// https://www.chilkatsoft.com/test.asp , then www.chilkatsoft.com is returned.
	bool GetUrlDomain(const char *url, CkString &outStr);
	const char *getUrlDomain(const char *url);
	const char *urlDomain(const char *url);

	// Initializes the object to begin spidering a domain. Calling Initialize clears
	// any patterns added via the AddAvoidOutboundLinkPattern, AddAvoidPattern, and
	// AddMustMatchPattern methods. The domain name passed to this method is what is
	// returned by the Domain property. The spider only crawls URLs within the same
	// domain.
	void Initialize(const char *domain);

	// Loads the caller of the task's async method.
	bool LoadTaskCaller(CkTask &task);

	// Re-crawls the last URL spidered. This is helpful when cookies set in a previous
	// page load cause the page to be loaded differently the next time.
	bool RecrawlLast(void);

	// Asynchronous version of RecrawlLast. Returns NULL on failure; the caller is
	// responsible for deleting the returned CkTask object.
	CkTask *RecrawlLastAsync(void);

	// Moves a URL from the unspidered list to the spidered list. This allows an
	// application to skip a specific URL.
	void SkipUnspidered(int index);

	// Suspends the execution of the current thread until the time-out interval
	// elapses.
	void SleepMs(int numMilliseconds);

	// END PUBLIC INTERFACE

};

#if !defined(__sun__) && !defined(__sun)
#pragma pack (pop)
#endif

#endif