o
    Dh2N                     @   s  d Z ddlZddlZddlZddlZddlZzddlZdZW n ey)   dZY nw zddl	Z	dZ
W n ey=   dZ
Y nw ddlmZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- e.e/Z0G dd dZ1e1 Z2G dd deZ3G dd dZ4G dd dZ5G dd dZ6de7de6fddZ8dS ) zd
Defines a URL store which holds URLs along with relevant information and entails crawling helpers.
    NTF)defaultdictdeque)datetime	timedelta)Enum)
itemgetter)Lock)AnyDefaultDictDequeDictListOptionalTupleUnion)RobotFileParser   )normalize_url)filter_links)lang_filtervalidate_url)clear_caches)get_base_urlget_host_and_pathis_known_linkc                   @   sb   e Zd ZdZdZddeddfddZed	edefd
dZ	d	edefddZ
d	edefddZdS )
CompressorzYUse system information on available compression modules and define corresponding methods.)
compressordecompressorTcompressionreturnNc                 C   sX   |rt rtjn	|rtrtjn| j| _|rt rtj| _d S |r&tr&tj| _d S | j| _d S N)	HAS_BZ2bz2compressHAS_ZLIBzlib
_identicalr   
decompressr   )selfr    r)   Q/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/courlan/urlstore.py__init__:   s   zCompressor.__init__datac                 C   s   | S )zReturn unchanged data.r)   )r,   r)   r)   r*   r&   F   s   zCompressor._identicalc                 C   s   |  tj|ddS )z9Pickle the data and compress it if a method is available.   )protocol)r   pickledumpsr(   r,   r)   r)   r*   r#   K      zCompressor.compressc                 C   s   t | |S )zADecompress the data if a method is available and load the object.)r/   loadsr   r1   r)   r)   r*   r'   O   s   zCompressor.decompressT)__name__
__module____qualname____doc__	__slots__boolr+   staticmethodr	   r&   r#   bytesr'   r)   r)   r)   r*   r   6   s    r   c                   @   s   e Zd ZdZdZdZdZdS )Statez0Record state information about a domain or host.r         N)r5   r6   r7   r8   OPENALL_VISITEDBUSTEDr)   r)   r)   r*   r=   W   s
    r=   c                   @   s,   e Zd ZdZdZejfdeddfddZdS )DomainEntryz7Class to record host-related information and URL paths.)countrulesstate	timestamptotaltuplesrF   r   Nc                 C   s*   d| _ d | _|| _d | _d| _t | _d S )Nr   )rD   rE   rF   rG   rH   r   rI   )r(   rF   r)   r)   r*   r+   b   s   zDomainEntry.__init__)r5   r6   r7   r8   r9   r=   r@   r+   r)   r)   r)   r*   rC   ^   s    rC   c                   @   s8   e Zd ZdZdZdededdfddZdefd	d
ZdS )UrlPathTuplezBClass storing information for URL paths relative to a domain/host.)urlpathvisitedrK   rL   r   Nc                 C   s   | d| _|| _d S )Nutf-8)encoderK   rL   )r(   rK   rL   r)   r)   r*   r+   o   s   
zUrlPathTuple.__init__c                 C   s   | j dS )zGet the URL path as string.rM   )rK   decoder(   r)   r)   r*   paths      zUrlPathTuple.path)	r5   r6   r7   r8   r9   strr:   r+   rQ   r)   r)   r)   r*   rJ   k   s
    rJ   c                   @   sT  e Zd ZdZdZ					djdedee ded	ed
eddfddZ	dkde	e dede
eee f fddZdedee fddZdlddZ			dmdedeee  dee deee  ddf
ddZ	dnde	e dee de	eeef  fddZ			dodee	e  d ee	e  deddfd!d"Z			dpd#ed$ed%ed&ee d'eddfd(d)Zd*e	e ddfd+d,Zdld-d.Zde	e fd/d0Zde	e fd1d2Zdedefd3d4Zdefd5d6Zdede	e fd7d8Zdede	e fd9d:Zde	e de	e fd;d<Z de	e de	eeef  fd=d>Z!d$edefd?d@Z"d$edefdAdBZ#dqdedCedee fdDdEZ$	F	GdrdHe%dIede	e fdJdKZ&	MdsdIedHede	e fdNdOZ'dPedQee( ddfdRdSZ)dPedee( fdTdUZ*dtdPedWe%de%fdXdYZ+de	e fdZd[Z,defd\d]Z-d^e%defd_d`Z.de	e fdadbZ/dldcddZ0dldedfZ1dgeddfdhdiZ2dS )uUrlStorezNDefines a class to store domain-classified URLs and perform checks against it.)
compresseddonelanguagestricttrailing_slashurldict_lockFNTrU   rW   rX   trailingverboser   c                    s   | _ d _| _| _| _tt _t  _	dt
dt
dd f fdd}|r=tjds?ttj| ttj| d S d S d S )NFnumframer   c                    s(   t dt j    td d S )Nz<Processing interrupted, dumping unvisited URLs from %s hostsr   )LOGGERdebuglenrZ   print_unvisited_urlssysexit)r^   r_   rP   r)   r*   dump_unvisited_urls   s   z.UrlStore.__init__.<locals>.dump_unvisited_urlswin)rU   rV   rW   rX   rY   r   rC   rZ   r   r[   r	   rd   platform
startswithsignalSIGINTSIGTERM)r(   rU   rW   rX   r\   r]   rf   r)   rP   r*   r+      s   
	zUrlStore.__init__r,   rL   c           	   
   C   s   t t}t|D ][}zHt|\}}|du rtd| t| jd ur7t	|| j| j
| jdu r7td| tt|| j
| j| jd}t|\}}|| t|| W q	 ttfyd   td| Y q	w |S )NFzInvalid URL: %szWrong language: %s)rX   rW   rY   zDiscarding URL: %s)r   r   dictfromkeysr   r`   ra   
ValueErrorrW   r   rX   rY   r   r   appendrJ   	TypeErrorwarning)	r(   r,   rL   	inputdicturlvalidation_result
parsed_urlhostinforK   r)   r)   r*   _buffer_urls   s6   
zUrlStore._buffer_urlsdomainc                 C   s4   || j v r| jrt| j | jS | j | jS t S r    )rZ   rU   
COMPRESSORr'   rI   r   r(   ry   r)   r)   r*   
_load_urls   s
   
zUrlStore._load_urlsc                 C   sX   | j s(tdd | j D r*| j d| _ W d    d S 1 s!w   Y  d S d S d S )Nc                 s   s    | ]	}|j tjkV  qd S r    rF   r=   r@   .0vr)   r)   r*   	<genexpr>   s    z%UrlStore._set_done.<locals>.<genexpr>T)rV   allrZ   valuesr[   rP   r)   r)   r*   	_set_done   s
   "zUrlStore._set_doneto_rightrG   to_leftc                    s  | drd|dd   }|| jv r|}n| dr3d|dd   }|| jv r3| j| | j|< | j|= || jv rP| j| jtju rCd S | |}dd |D  nt }t  |d urf| fd	d
|D  |d urv|	 fdd
|D  | j
\ | jrt|| j| _n|| j| _t|| j| _|d ur|| j| _tdd
 |D rtj| j| _ntj| j| _| jrd| _W d    d S W d    d S W d    d S 1 sw   Y  d S )Nzhttp://https   zhttps://httpr-   c                 S      h | ]}|  qS r)   rQ   r   ur)   r)   r*   	<setcomp>       z'UrlStore._store_urls.<locals>.<setcomp>c                 3   "    | ]}t |  s|V  qd S r    r   rQ   r   tknownr)   r*   r           z'UrlStore._store_urls.<locals>.<genexpr>c                 3   r   r    r   r   r   r)   r*   r      r   c                 s       | ]}|j V  qd S r    )rL   r   r)   r)   r*   r          F)ri   rZ   rF   r=   rB   r|   r   setextend
extendleftr[   rU   rz   r#   rI   rb   rH   rG   r   rA   r@   rV   )r(   ry   r   rG   r   	candidateurlsr)   r   r*   _store_urls   sL   





"zUrlStore._store_urlsr   switchc           	      C   sz   d }i }t |}t|D ]+}t|\}}||kr%|}dd | |D }||v r8|dks5|dkr8|| r8||= qt|S )Nc                 S   s   i | ]}|  |jqS r)   )rQ   rL   r   r)   r)   r*   
<dictcomp>      z)UrlStore._search_urls.<locals>.<dictcomp>r   r>   )rm   rn   sortedr   r|   list)	r(   r   r   last_domainknown_pathsremaining_urlsrt   rw   rK   r)   r)   r*   _search_urls  s   
zUrlStore._search_urls
appendleftc                 C   s`   |r|  || D ]\}}| j||d q
|r,|  || D ]\}}| j||d q dS dS )zAdd a list of URLs to the (possibly) existing one.
        Optional: append certain URLs to the left,
        specify if the URLs have already been visited.)r   )r   N)rx   itemsr   )r(   r   r   rL   host	urltuplesr)   r)   r*   add_urls  s   	zUrlStore.add_urls
htmlstringrt   externallangwith_navc           
   	   C   sF   t |}| |}t||||p| j|| j|d\}}	| j||	d dS )zJFind links in a HTML document, filter them and add them to the data store.)r   rt   r   r   rE   rX   r   )r   r   N)r   	get_rulesr   rW   rX   r   )
r(   r   rt   r   r   r   base_urlrE   linkslinks_priorityr)   r)   r*   add_from_html-  s   


	zUrlStore.add_from_htmldomainsc                 C   sb   | j  |D ]}ttjd| j|< qW d   n1 sw   Y  |   t }t	d| dS )z)Declare domains void and prune the store.)rF   Nz'%s objects in GC after UrlStore.discard)
r[   rC   r=   rB   rZ   r   gccollectr`   ra   )r(   r   dr^   r)   r)   r*   discardD  s   zUrlStore.discardc                 C   sN   | j  tt| _W d   n1 sw   Y  t  t }td| dS )zRe-initialize the URL store.Nz UrlStore reset, %s objects in GC)	r[   r   rC   rZ   r   r   r   r`   ra   )r(   r^   r)   r)   r*   resetM  s   zUrlStore.resetc                 C   s   t | j S )z#Return all known domains as a list.)r   rZ   keysrP   r)   r)   r*   get_known_domainsW     zUrlStore.get_known_domainsc                 C      dd | j  D S )ziFind all domains for which there are unvisited URLs
        and potentially adjust done meta-information.c                 S   s    g | ]\}}|j tjkr|qS r)   r}   )r   r   r   r)   r)   r*   
<listcomp>^  s     z2UrlStore.get_unvisited_domains.<locals>.<listcomp>)rZ   r   rP   r)   r)   r*   get_unvisited_domains[  s   zUrlStore.get_unvisited_domainsc                 C   s    || j v r| j | jtjkS dS )z9Tell if all known URLs for the website have been visited.F)rZ   rF   r=   r@   r{   r)   r)   r*   is_exhausted_domain`  s   
zUrlStore.is_exhausted_domainc                 C   s   t |  S )zFReturn the number of websites for which there are still URLs to visit.)rb   r   rP   r)   r)   r*   unvisited_websites_numberg  rR   z"UrlStore.unvisited_websites_numberc                    s    fdd|   D S )zLGet all already known URLs for the given domain (ex. "https://example.org").c                    s   g | ]} |   qS r)   r   r   ry   r)   r*   r   o  r   z,UrlStore.find_known_urls.<locals>.<listcomp>)r|   r{   r)   r   r*   find_known_urlsm     zUrlStore.find_known_urlsc                    s&   |   s fdd|  D S g S )z,Get all unvisited URLs for the given domain.c                    s   g | ]}|j s |  qS r)   )rL   rQ   r   r   r)   r*   r   t  s    z0UrlStore.find_unvisited_urls.<locals>.<listcomp>)r   r|   r{   r)   r   r*   find_unvisited_urlsq  s   
zUrlStore.find_unvisited_urlsc                 C      | j |ddS )z:Take a list of URLs and return the currently unknown ones.r   r   r   r(   r   r)   r)   r*   filter_unknown_urlsw  r   zUrlStore.filter_unknown_urlsc                 C   r   )z<Take a list of URLs and return the currently unvisited ones.r>   r   r   r   r)   r)   r*   filter_unvisited_urls{  r   zUrlStore.filter_unvisited_urlsc                 C   s   t | |g S )z0Check if the given URL has already been visited.)r:   r   )r(   rt   r)   r)   r*   has_been_visited  s   zUrlStore.has_been_visitedc                 C   s$   t |\}}|dd | |D v S )z/Check if the given URL has already been stored.c                 S   r   r)   r   r   r)   r)   r*   r     r   z$UrlStore.is_known.<locals>.<setcomp>)r   r|   )r(   rt   rw   rK   r)   r)   r*   is_known  s   zUrlStore.is_known
as_visitedc              	   C   s   |  |sF| |}|D ]9}|jsE|r=d|_| j | j|  jd7  _W d   n1 s.w   Y  | j||t d ||	    S q| j t
j| j| _W d   n1 s[w   Y  |   dS )zSRetrieve a single URL and consider it to be visited (with corresponding timestamp).Tr   NrG   )r   r|   rL   r[   rZ   rD   r   r   nowrQ   r=   rA   rF   r   )r(   ry   r   
url_tuplesrt   r)   r)   r*   get_url  s"   

	zUrlStore.get_url      $@'  
time_limitmax_urlsc                 C   sz   g }| j  D ]/\}}|jtjkrq|jr t |j  |kr6| 	|}|dur6|
| t||kr6 nq|   |S )zaGet a list of immediately downloadable URLs according to the given
        time limit per domain.N)rZ   r   rF   r=   r@   rG   r   r   total_secondsr   rp   rb   r   )r(   r   r   r   websiteentryrt   r)   r)   r*   get_download_urls  s   

zUrlStore.get_download_urlsd   
   c              
   C   sb  |   }|sg S |t| pd}g }|D ]}| |}g }|D ]>}	t||ks1t|t| |kr3 n+|	js]||	  d|	_| j | j|  jd7  _W d   n1 sXw   Y  qt	
 }
| j| j}|rr|
|  |krud}n|t|
|  d }|D ]}|||| f ||7 }q|
td||  }| j|||d q|   t|tddS )	zcGet up to the specified number of URLs along with a suitable
        backoff schedule (in seconds).r   TNg        z.2fr   r   )key)r   rb   r|   rL   rp   rQ   r[   rZ   rD   r   r   rG   r   floatr   r   r   r   r   )r(   r   r   	potential
per_domaintargetsry   r   urlpathsrt   r   original_timestampschedule_secsrK   
total_diffr)   r)   r*   establish_download_schedule  sF   

z$UrlStore.establish_download_scheduler   rE   c                 C   s    | j rt|}|| j| _dS )z)Store crawling rules for a given website.N)rU   rz   r#   rZ   rE   )r(   r   rE   r)   r)   r*   store_rules  s   
zUrlStore.store_rulesc                 C   s2   || j v r| jrt| j | jS | j | jS dS )z7Return the stored crawling rules for the given website.N)rZ   rU   rz   r'   rE   )r(   r   r)   r)   r*   r     s
   
zUrlStore.get_rulesr-   defaultc                 C   s8   d}|  |}z|d}W n	 ty   Y nw |p|S )zBReturn the delay as extracted from robots.txt, or a given default.N*)r   crawl_delayAttributeError)r(   r   r   delayrE   r)   r)   r*   get_crawl_delay  s   
zUrlStore.get_crawl_delayc                 C   r   )z2Return all download counts for the hosts in store.c                 S   s   g | ]}|j qS r)   rD   r~   r)   r)   r*   r   
  s    z+UrlStore.get_all_counts.<locals>.<listcomp>)rZ   r   rP   r)   r)   r*   get_all_counts  r2   zUrlStore.get_all_countsc                 C   s   t dd | j D S )z!Find number of all URLs in store.c                 s   r   r    )rH   r~   r)   r)   r*   r     r   z,UrlStore.total_url_number.<locals>.<genexpr>)sumrZ   r   rP   r)   r)   r*   total_url_number  r   zUrlStore.total_url_number	thresholdc                    s   t  fdd| j D S )z^Find out if the download limit (in seconds) has been reached for one of the websites in store.c                 3   s    | ]}|j  kV  qd S r    r   r~   r   r)   r*   r     s    z6UrlStore.download_threshold_reached.<locals>.<genexpr>)anyrZ   r   )r(   r   r)   r   r*   download_threshold_reached  s   z#UrlStore.download_threshold_reachedc                 C   s$   g }| j D ]
}|| | q|S )z Return a list of all known URLs.)rZ   r   r   )r(   r   ry   r)   r)   r*   	dump_urls  s   
zUrlStore.dump_urlsc                 C   s(   | j D ]}td| |dd qdS )z"Print all unvisited URLs in store.
TflushN)rZ   printjoinr   r{   r)   r)   r*   rc     s   
zUrlStore.print_unvisited_urlsc                    s6   | j D ] td fdd|  D dd qdS )z5Print all URLs in store (URL + TAB + visited or not).r   c                    s(   g | ]}  |   d t|j qS )	)rQ   rS   rL   r   r   r)   r*   r   %  s    z'UrlStore.print_urls.<locals>.<listcomp>Tr   N)rZ   r   r   r|   rP   r)   r   r*   
print_urls   s   

zUrlStore.print_urlsfilenamec                 C   s@   | ` t|d}t| | W d   dS 1 sw   Y  dS )zWrite the URL store to disk.wbN)r[   openr/   dump)r(   r   outputr)   r)   r*   write/  s   "zUrlStore.write)FNFTF)F)r   N)NNNr    )NNF)FNTr4   )r   r   )r   r   )r-   )3r5   r6   r7   r8   r9   r:   r   rS   r+   r   r
   r   rJ   rx   r|   r   r   r   intr   r	   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rc   r   r  r)   r)   r)   r*   rT   x   s    

"



7





	
"

6

rT   r   r   c                 C   s@   t | d}t|}W d   n1 sw   Y  t |_|S )zLoad a URL store from disk.rbN)r  r/   loadr   r[   )r   r  	url_storer)   r)   r*   
load_store6  s
   r
  )9r8   r   loggingr/   rj   rd   r"   r!   ImportErrorr%   r$   collectionsr   r   r   r   enumr   operatorr   	threadingr   typingr	   r
   r   r   r   r   r   r   urllib.robotparserr   cleanr   corer   filtersr   r   metar   urlutilsr   r   r   	getLoggerr5   r`   r   rz   r=   rC   rJ   rT   rS   r
  r)   r)   r)   r*   <module>   sR    (
   A