Starts a crawl job for a given URL.
Method: client.crawl.start(params: StartCrawlJobParams): StartCrawlJobResponse
Endpoint: POST /api/crawl
Parameters:
StartCrawlJobParams:
- url: string - URL to crawl
- max_pages?: number - Max number of pages to crawl
- follow_links?: boolean - Follow links found on each page
- ignore_sitemap?: boolean - Ignore the sitemap when finding links to crawl
- exclude_patterns?: string[] - Patterns for paths to exclude from the crawl
- include_patterns?: string[] - Patterns for paths to include in the crawl
Response: StartCrawlJobResponse
Example:
response = client.crawl.start(StartCrawlJobParams(url="https://example.com"))
print(response.job_id)
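The optional parameters can be combined to scope the crawl. A short sketch (the parameter names come from StartCrawlJobParams above; the glob-style pattern values are illustrative):
response = client.crawl.start(
    StartCrawlJobParams(
        url="https://example.com",
        max_pages=50,  # stop after 50 pages
        follow_links=True,  # crawl links found on each page
        include_patterns=["/blog/*"],  # only crawl paths under /blog/
        exclude_patterns=["/blog/tag/*"],  # ...but skip tag listing pages
    )
)
print(response.job_id)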
Retrieves details of a specific crawl job.
Method: client.crawl.get(id: str): CrawlJobResponse
Endpoint: GET /api/crawl/{id}
Parameters:
- id: string - Crawl job ID
Response: CrawlJobResponse
Example:
response = client.crawl.get(
    "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e"
)
print(response.status)
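Because a crawl job runs asynchronously, start and get are typically combined into a poll loop; the start_and_wait helper below wraps exactly this pattern. A minimal sketch, assuming client is an already-initialized SDK client and using the job statuses from CrawlJobStatus below:
import time

job = client.crawl.start(StartCrawlJobParams(url="https://example.com"))
while True:
    response = client.crawl.get(job.job_id)
    if response.status in ("completed", "failed"):  # terminal statuses
        break
    time.sleep(2)  # back off between polls
print(response.status)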
Starts a crawl job and waits for it to complete.
Method: client.crawl.start_and_wait(params: StartCrawlJobParams): CrawlJobResponse
Parameters:
StartCrawlJobParams:
- url: string - URL to crawl
- max_pages?: number - Max number of pages to crawl
- follow_links?: boolean - Follow links found on each page
- ignore_sitemap?: boolean - Ignore the sitemap when finding links to crawl
- exclude_patterns?: string[] - Patterns for paths to exclude from the crawl
- include_patterns?: string[] - Patterns for paths to include in the crawl
Response: CrawlJobResponse
Example:
response = client.crawl.start_and_wait(StartCrawlJobParams(url="https://example.com"))
print(response.status)
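Once the job completes, each crawled page is available on response.data. A short sketch using the CrawledPage fields defined below:
response = client.crawl.start_and_wait(StartCrawlJobParams(url="https://example.com"))
for page in response.data:
    if page.status == "completed":
        print(page.url, len(page.markdown or ""))  # markdown may be None
    else:
        print(page.url, "failed:", page.error)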
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field

CrawlPageStatus = Literal["completed", "failed"]
CrawlJobStatus = Literal["pending", "running", "completed", "failed"]


class StartCrawlJobResponse(BaseModel):
    job_id: str = Field(alias="jobId")


class CrawledPage(BaseModel):
    metadata: Optional[dict[str, Union[str, list[str]]]] = None
    html: Optional[str] = None
    markdown: Optional[str] = None
    links: Optional[List[str]] = None
    url: str
    status: CrawlPageStatus
    error: Optional[str] = None


class CrawlJobResponse(BaseModel):
    job_id: str = Field(alias="jobId")
    status: CrawlJobStatus
    error: Optional[str] = None
    data: List[CrawledPage] = Field(alias="data")
    total_crawled_pages: int = Field(alias="totalCrawledPages")
    total_page_batches: int = Field(alias="totalPageBatches")
    current_page_batch: int = Field(alias="currentPageBatch")
    batch_size: int = Field(alias="batchSize")
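The Field aliases map the API's camelCase payload onto the snake_case model fields. A sketch of parsing a hypothetical response payload (assumes pydantic v2):
payload = {
    "jobId": "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
    "status": "completed",
    "data": [],
    "totalCrawledPages": 0,
    "totalPageBatches": 1,
    "currentPageBatch": 1,
    "batchSize": 100,
}
job = CrawlJobResponse.model_validate(payload)
print(job.total_crawled_pages)  # -> 0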