U
    Nf                     @   s0  d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZ ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZmZm Z m!Z!m"Z" dd
l#m$Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z: G dd deZ;dS )    N)BytesIOUnsupportedOperation)Path)	AnyCallableDictIterableListOptionalTupleUnioncast   )PdfDocCommonconvert_to_int)
EncryptionPasswordType)
PageObject)	StrByteType
StreamTypeb_logger_warningread_non_whitespaceread_previous_lineread_until_whitespaceskip_over_commentskip_over_whitespace)TrailerKeys)EmptyFileErrorFileNotDecryptedErrorPdfReadErrorPdfStreamErrorWrongPasswordError)ArrayObjectContentStreamDecodedStreamObjectDictionaryObjectEncodedStreamObjectIndirectObject
NameObject
NullObjectNumberObject	PdfObjectTextStringObjectread_object)XmpInformationc                	   @   s&  e Zd ZdZdTeeef eedee	f ddddZ
eeddd	Zeee dd
dZeee dddZdUedee f edee f eeef dddZeedddZeee dddZeedddZedeeef ee dddZeeeeef dddZ eeef ee dddZ!e"e#eef dd d!Z$eeee d"d#d$Z%eeee ee d%d&d'Z&eeed(d)d*Z'e"ddd+d,Z(e"ddd-d.Z)e"ddd/d0Z*e"edd1d2Z+e"ddd3d4Z,e"ee edd5d6d7Z-e"ee dd8d9Z.e"eee d:d;d<Z/e"ee0e1e2f dd=d>Z3e4e"eed:d?d@Z5e"dddAdBZ6e7e e8egeee#edCf f f e8eeee#edCf f gef ddDdEdFZ9e7e ee#eef  dGdHdIZ:eee	f e;dJdKdLZ<eeddMdNZ=eee dOdPdQZ>eee dOdRdSZ?dS )V	PdfReadera  
    Initialize a PdfReader object.

    This operation can take some time, as the PDF stream's cross-reference
    tables are read into memory.

    Args:
        stream: A File object or an object that supports the standard read
            and seek methods similar to a File object. Could also be a
            string representing a path to a PDF file.
        strict: Determines whether user should be warned of all
            problems and also causes some correctable problems to be fatal.
            Defaults to ``False``.
        password: Decrypt PDF file at initialization. If the
            password is None, the file will not be decrypted.
            Defaults to ``None``.
    FN)streamstrictpasswordreturnc           	   	   C   s2  || _ d | _i | _d| _d | _t|dr<d|jkr<tdt t	|t
tfrlt|d}t| }W 5 Q R X | | || _d| _d | _| jrd| _| jtj}|r|d  jnd}tt| jtj  }t||| _|d k	r|nd}| j|tjkr|d k	rt d	d| _n|d k	r.t!d
d S )Nr   modebzQPdfReader stream/file object is not in binary mode. It may not be read correctly.rbFT    zWrong passwordNot encrypted file)"r2   flattened_pagesresolved_objects
xref_index_page_id2numhasattrr5   r   __name__
isinstancestrr   openr   readr1   _override_encryption_encryptionis_encryptedtrailergetTKID
get_objectoriginal_bytesr   r&   ENCRYPTr   verifyr   ZNOT_DECRYPTEDr"   r    )	selfr1   r2   r3   fhZid_entryZ	id1_entryZencrypt_entrypwd rR   1/tmp/pip-unpacked-wheel-zevpxvmc/pypdf/_reader.py__init__h   sH    
 
zPdfReader.__init__)r4   c                 C   s   t t| jtj  S )z7Provide access to "/Root". standardized with PdfWriter.)r   r&   rG   rI   ROOTrK   rO   rR   rR   rS   root_object   s    zPdfReader.root_objectc                 C   sB   | j tjd}|dkrdS | }|dkr4tdtt|S dS )z
        Provide access to "/Info". standardized with PdfWriter.

        Returns:
            /Info Dictionary ; None if the entry does not exists
        NzETrailer not found or does not point to document information directory)rG   rH   rI   INFOrK   r    r   r&   )rO   inforR   rR   rS   _info   s    zPdfReader._infoc                 C   s*   | j tjd}|dkrdS tt| S )z
        Provide access to "/ID". standardized with PdfWriter.

        Returns:
            /ID array ; None if the entry does not exists
        N)rG   rH   rI   rJ   r   r#   rK   )rO   idrR   rR   rS   _ID   s    zPdfReader._ID)includeexcluder4   c                    s^   | j d | j  }d|i}dk	r<fdd| D } dk	rZ fdd| D }|S )z
        Integration into Jupyter Notebooks.

        This method returns a dictionary that maps a mime-type to it's
        representation.

        See https://ipython.readthedocs.io/en/stable/config/integrating.html
        r   zapplication/pdfNc                    s   i | ]\}}| kr||qS rR   rR   .0kv)r]   rR   rS   
<dictcomp>   s       z/PdfReader._repr_mimebundle_.<locals>.<dictcomp>c                    s   i | ]\}}| kr||qS rR   rR   r_   )r^   rR   rS   rc      s       )r1   seekrC   items)rO   r]   r^   Zpdf_datadatarR   )r^   r]   rS   _repr_mimebundle_   s    
 zPdfReader._repr_mimebundle_c                 C   s>   | j  }| j dd | j ddd}| j |d |S )z
        The first 8 bytes of the file.

        This is typically something like ``'%PDF-1.6'`` and can be used to
        detect if the file is actually a PDF file and which version it is.
        r      zutf-8backslashreplace)r1   tellrd   rC   decode)rO   locZpdf_file_versionrR   rR   rS   
pdf_header   s
    

zPdfReader.pdf_headerc                 C   s&   zd| _ tt| jjW S d| _ X dS )z(XMP (Extensible Metadata Platform) data.FTN)rD   r   r/   rW   xmp_metadatarV   rR   rR   rS   rn      s    zPdfReader.xmp_metadata)page_numberr4   c                 C   s.   | j dkr|   | j dk	s$td| j | S )z
        Retrieve a page by number from this PDF file.

        Args:
            page_number: The page number to retrieve
                (pages begin at zero)

        Returns:
            A :class:`PageObject<pypdf._page.PageObject>` instance.
        Nhint for mypy)r:   _flattenAssertionError)rO   ro   rR   rR   rS   	_get_page   s    
zPdfReader._get_page)indirect_referencer4   c                 C   sp   | j dkr dd t| jD | _ |dks2t|tr6dS t|trF|}n|j}| j dk	s^td| j |d}|S )z
        Generate _page_id2num.

        Args:
            indirect_reference:

        Returns:
            The page number or None
        Nc                 S   s   i | ]\}}|j j|qS rR   )rt   idnum)r`   ixrR   rR   rS   rc     s     z:PdfReader._get_page_number_by_indirect.<locals>.<dictcomp>rp   )	r=   	enumerateZpagesr@   r*   intru   rr   rH   )rO   rt   ru   retrR   rR   rS   _get_page_number_by_indirect  s    

z&PdfReader._get_page_number_by_indirectc                 C   s  | j |j \}}t|d|  }tt|d dks6t||d k sFttt|	 }t
|d D ]}t| |dd t|}t| |dd t|}t| |dd ||jkrqb| jr||krtd|t|d | d t| |dd zt|| }	W nf tk
rz }
 zFtd	| d
|j d|j d|
 t | jrdtd|
 t }	W 5 d }
~
X Y nX |	  S | jrtdt S )Nr   /Typez/ObjStmz/Nr   zObject is in wrong index.z/FirstzInvalid stream (index z) within object  z: zCannot read object stream: z%This is a fatal error in strict mode.)xref_objStmru   r(   rK   r   rA   rr   r   r   get_dataranger   rd   r+   Zread_from_streamr2   r    ry   r.   r!   r   
generationr?   r*   )rO   rt   ZstmnumidxZobj_stmstream_datarv   ZobjnumoffsetobjexcrR   rR   rS   _get_object_from_stream   sN    


z!PdfReader._get_object_from_streamc           	   
   C   s^  t |trt|d| }| |j|j}|d k	r2|S |jdkrV|j| jkrV| |}n|j| jkr|j| j|j kr| j	
|ji 
|jdrt S | j|j |j }| j|d z0| | j\}}||jks||jkrtdW n tk
r   t| jdrt| j }n2| j }| jdd | jd}| j|d td|j d|j d |}|d k	rtd	|j d
|j dt |dd | j|j |j< | j|dd  | | j\}}nd}Y nX ||jkr$| jr$| jr^td|j d|j d| d| d	n:||jkr^| jr^td|j d|j d| d| d	| jrv||jksvtt| j| }| j sH| j!d k	rH| j!" st#dt$t%|}| j!&||j|j}n|t| jdrt| j }n2| j }| jdd | jd}| j|d td|j d|j d |}|d k	rtd|j d|j dt |j| jkr~i | j|j< |dd | j|j |j< | j|'dd  t(| j | jdd t| j| }| j sH| j!d k	rH| j!" st#dt$t%|}| j!&||j|j}n,td|j d|j dt | jrHtd| )|j|j| |S )Nr   Fz&not matching, we parse the file for it	getbufferr}   z\sz\s+z\s+objz
Object ID ,z ref repairedr   zExpected object ID (r~   z) does not match actual (z); xref table not zero-indexed.z).zFile has not been decryptedzObject z foundz not defined.zCould not find object.)*r@   ry   r(   cache_get_indirect_objectr   ru   r   r   xrefxref_free_entryrH   r*   r1   rd   read_object_headerr    	Exceptionr>   bytesr   rj   rC   researchencoder   r?   startr<   r2   rr   r.   rD   rE   Zis_decryptedr   r   r,   Zdecrypt_objectendr   cache_indirect_object)	rO   rt   retvalr   ru   r   bufpmrR   rR   rS   rK   Z  s    
 

 



  
  



    zPdfReader.get_object)r1   r4   c                 C   s   d}t | |t|O }|dd t|}|t|O }|dd t|}|t|O }|dd |d}t| |dd |r| jrtd| d| t t	|t	|fS )NFr}   r      z.Superfluous whitespace found in object header r~   )
r   r   rd   r   rC   r   r2   r   r?   ry   )rO   r1   extraru   r   _objrR   rR   rS   r     s&    

zPdfReader.read_object_header)r   ru   r4   c                 C   s   | j ||fS N)r;   rH   )rO   r   ru   rR   rR   rS   r     s    z#PdfReader.cache_get_indirect_object)r   ru   r   r4   c                 C   s^   ||f| j kr6d| d| }| jr,t|t|t || j ||f< |d k	rZt||| |_|S )NzOverwriting cache for r~   )r;   r2   r    r   r?   r(   rt   )rO   r   ru   r   msgrR   rR   rS   r     s    
zPdfReader.cache_indirect_object)indirectr   r4   c                 C   sH   |j | krtd|j|jf| jkr,td|| j|j|jf< ||_|S )Nz,Cannot update PdfReader with external objectzCannot find referenced object)Zpdf
ValueErrorr   ru   r;   rt   )rO   r   r   rR   rR   rS   _replace_object  s    
zPdfReader._replace_objectc                 C   s  |  | | | | |}| ||}|dkrV| jrD|rDtdtd| dt | ||| | j	r6| js6|
 }| j D ]\}}|dkrqt| }|D ]~}||| d z| |\}	}
W n$ tk
r   | | Y  qY nX |	|| j	 kr| j| | | j| |	< | j| |= qq||d | js|
 }| j D ]\}}|dkrfqPt| }|D ]d}||| d z| | W n> tk
r   td| d| d||  dt ||= Y nX qvqP||d d S )	Nr   zBroken xref tablezincorrect startxref pointer()  zIgnoring wrong pointing object r~   z	 (offset )_basic_validation_find_eof_marker_find_startxref_pos_get_xref_issuesr2   r    r   r?   _read_xref_tables_and_trailersr<   rj   r   re   sortedkeysrd   r   r   _rebuild_xref_tablelist)rO   r1   	startxrefxref_issue_nrrl   genZ
xref_entryZxref_kr[   pidZ_pgenidsrR   rR   rS   rC     sZ    





zPdfReader.readc                 C   s   | dtj z|d}W n tk
r8   tdY nX |dkrLtdn6|dkr| jrrtd|	d d	nt
d
| t | dtj dS )z/Ensure file is not empty. Read at most 5 bytes.r      zcannot read headerr8   zCannot read an empty files   %PDF-zPDF starts with 'utf8z', but '%PDF-' expectedzinvalid pdf header: N)rd   osSEEK_SETrC   UnicodeDecodeErrorr   r   r2   r    rk   r   r?   SEEK_END)rO   r1   Zheader_byterR   rR   rS   r   S  s    
zPdfReader._basic_validationc                 C   sL   d}d}|dd dkrH|  |k r>| jr4tdn
tdt t|}qdS )a  
        Jump to the %%EOF marker.

        According to the specs, the %%EOF marker should be at the very end of
        the file. Hence for standard-compliant PDF documents this function will
        read only the last part (DEFAULT_BUFFER_SIZE).
        rh   r8   Nr   s   %%EOFzEOF marker not found)rj   r2   r    r   r?   r   )rO   r1   ZHEADER_SIZElinerR   rR   rS   r   f  s    

zPdfReader._find_eof_markerc                 C   s~   t |}zt|}W nD tk
rX   |ds6tdt|dd  }tdt Y n"X t |}|dd dkrztd|S )z
        Find startxref entry - the location of the xref table.

        Args:
            stream:

        Returns:
            The bytes offset
        s	   startxrefzstartxref not found	   Nz startxref on same line as offset)r   ry   r   
startswithr    stripr   r?   )rO   r1   r   r   rR   rR   rS   r   x  s    

zPdfReader._find_startxref_posc                 C   s  | d}|dkrtdt| |dd d}ttt|| }|rd|dkrd|| _| jrdt	dt
 d	}t| |dd ttt|| }t|tst	d
t
 | | |   d S t| |dd d}||k r| d}|d dkr|dd | d}q|d dkr"|dd z8|d d d\}}	|dd }
t|t|	 }}W n tk
r   t|drt| }n(| }|dd | d}|| t| d |}|d krt	d| dt
 d}d}n(t	d| dt
 t|d}| }Y nX || jkr8i | j|< i | j|< || j| krJnt|
dkrb|| j| |< z|
dk| j| |< W n tk
r   Y nX z|
dk| jd |< W n tk
r   Y nX |d7 }|d7 }qt| |dd | d}|dkr|dd q2qq2d S )Nr   s   refzxref table read errorr}   r   Tr   zFXref table not zero-indexed. ID numbers for objects will be corrected.Fz,Invalid/Truncated xref table. Rebuilding it.      
is   0123456789t             r   z\s+(\d+)\s+objzentry z( in Xref table invalid; object not foundr   z' in Xref table invalid but object found   n   f   s   traileri)rC   r    r   rd   r   ry   r.   r<   r2   r   r?   r@   r   splitr   r>   r   r   rj   r   r   r   groupr   r   r   )rO   r1   refZ
first_timenumsizeZcntr   Zoffset_bZgeneration_bZentry_type_br   r   r   r   fZtrailer_tagrR   rR   rS   _read_standard_xref_table  s    















z#PdfReader._read_standard_xref_table)r1   r   r   r4   c           
   
   C   s  i | _ i | _i | _t | _|d k	r||d |d}|dkrL|d}|dkr`| |}q|rz| | W qW n t	k
r   d}Y nX q|
 rz| |}W n^ t	k
r
 } z>tj| jkrtd|j t W Y qntd|j W 5 d }~X Y nX tjtjtjtjtjf}|D ]0}||kr(|| jkr(||| jt|< q(d|kr| }	|tt|d d d | | ||	d d|krtt|d }nqq| ||}qd S )	Nr   r   r      xz!Previous trailer can not be read ztrailer can not be read /XRefStm/Prev)r   r   r   r&   rG   rd   rC   
_read_xrefr   r   isdigit_read_pdf15_xref_streamrI   rU   r   argsr?   r    rM   rX   rJ   ZSIZEZraw_getr)   rj   r   ry   _read_xref_other_error)
rO   r1   r   r   rw   
xrefstreameZtrailer_keyskeyr   rR   rR   rS   r     sR    






"


z(PdfReader._read_xref_tables_and_trailersc                 C   s  |  | |ddkrd S |dd t| |dd ttttf t|| }|	 D ]\}}|| j
kr\|| j
|< q\d|kr| }|tt|d d d z| | W n* tk
r   td|d  dt Y nX ||d d|kr |d }|S d S d S )	Nr   r8   r}   r   r   zXRef object at z, can not be read, some object may be missingr   )r   rC   rd   r   r   r   rA   r   r.   re   rG   rj   ry   r   r   r   r?   )rO   r1   new_trailerr   valuer   r   rR   rR   rS   r   8  s2    



zPdfReader._read_xref)r1   r   r4   c                 C   s   |dkr$| j rtdtdt d S |dd |d}|d}|dkr\|d	| 8 }|S ||d td
D ]"}|d rp||7 }|  S qpd| j	kr| j stdt z| 
| W d S  tk
r   tdY nX tdd S )Nr   z6/Prev=0 in the trailer (try opening with strict=False)zA/Prev=0 in the trailer - assuming there is no previous xref tableir   r      xrefr}   
      z/Rootz"Invalid parent xref., rebuild xrefzcan not rebuild xrefz/Could not find xref table at specified location)r2   r    r   r?   rd   rC   findr   r   rG   r   r   )rO   r1   r   tmpZxref_locZlookrR   rR   rS   r   T  s:    




z PdfReader._read_xref_other_errorc                    s  | dd |\}}ttt|}tt|d dks@t||| tt	|
 |dd|dg}ttttf |d t d	kstjrt d	krtd
  tttttdf f d fdd}tttttdf f tdfdd}||| |S )Nr}   r   r|   z/XRefz/Indexr   z/Sizez/Wr   zToo many entry sizes: .)rv   r4   c                    s<    |  dkr(  |  }t| |  S | dkr4dS dS d S )Nr   r   )rC   r   )rv   d)entry_sizesr   rR   rS   	get_entry  s    z4PdfReader._read_pdf15_xref_stream.<locals>.get_entry)r   r   r4   c                    s   |  j |g kp|  jkS r   )r   rH   r   )r   r   rV   rR   rS   used_before  s    z6PdfReader._read_pdf15_xref_stream.<locals>.used_before)rd   r   r   r$   r.   rA   rr   r   r   r   r   rH   r   r   lenr2   r    ry   r   r   bool_read_xref_subsections)rO   r1   ru   r   r   	idx_pairsr   r   rR   )r   rO   r   rS   r   }  s    &&z!PdfReader._read_pdf15_xref_streamc                 C   s   |  |d d | d}|dkr,| d}|dkr8dS | d}|dkrd}|dkrn| d}|dkrNd	S qN|| d	7 }| d
krdS dS )z
        Return an int which indicates an issue. 0 means there is no issue.

        Args:
            stream:
            startxref:

        Returns:
            0 means no issue, other values represent specific issues.
        r   r      js   
 	   r   r8   s   0123456789 	   s   objr   )rd   rC   lower)r1   r   r   rR   rR   rS   r     s"    



zPdfReader._get_xref_issuesc           	      C   s   i | _ |dd |d}td|D ]H}t|d}t|d}|| j kr\i | j |< |d| j | |< q(|dd td|D ]N}||dd tt	t
t
f t|| }t| D ]\}}|| j|< qqd S )Nr   r}   s(   [\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+objr   r   s$   [\r\n \t][ \t]*trailer[\r\n \t]*(<<))r   rd   rC   r   finditerry   r   r   r   r   r   r.   r   re   rG   )	rO   r1   Zf_r   ru   r   r   r   r   rR   rR   rS   r     s    


zPdfReader._rebuild_xref_table.)r   r   r   r4   c                 C   s   |  |D ]\}}t||| D ]}|d}|dkrF|d}|d}	q |dkr|d}
|d}|| jkrri | j|< |||s|
| j| |< q |dkr|d}|d}d}|||s||f| j|< q | jr td| q q
d S )Nr   r   r   zUnknown xref type: )_pairsr   r   r   r2   r    )rO   r   r   r   r   r   r   Z	xref_typeZnext_free_objectZnext_generationZbyte_offsetr   Z
objstr_numZ	obstr_idxrR   rR   rS   r     s*    




z PdfReader._read_xref_subsections)arrayr4   c                 c   s:   d}|| ||d  fV  |d7 }|d t |krq6qd S )Nr   r   r   )r   )rO   r   rv   rR   rR   rS   r     s
    zPdfReader._pairs)r3   r4   c                 C   s   | j std| j |S )a  
        When using an encrypted / secured PDF file with the PDF Standard
        encryption handler, this function will allow the file to be decrypted.
        It checks the given password against the document's user password and
        owner password, and then stores the resulting decryption key if either
        password is correct.

        It does not matter which password was matched.  Both passwords provide
        the correct decryption key that will allow the document to be used with
        this library.

        Args:
            password: The password to match.

        Returns:
            An indicator if the document was decrypted and whether it was the
            owner password or the user password.
        r9   )rE   r    rN   )rO   r3   rR   rR   rS   decrypt  s    zPdfReader.decryptc                 C   s   t j| jkS )z
        Read-only boolean property showing whether this PDF file is encrypted.

        Note that this property, if true, will remain true even after the
        :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
        )rI   rM   rG   rV   rR   rR   rS   rF     s    zPdfReader.is_encrypted)namer4   c                 C   s   | j }d|kst|d ts dS tt|td }d|kr>dS t }t||td< |td |td< | dtdd | jD d	 | t	 }|
|j ||td< tt	|d D ]6}| }d
|krtd|j dt |j|td
< q|S )z
        Add a top level form that groups all form fields below it.

        Args:
            name: text string of the "/T" Attribute of the created object

        Returns:
            The created object. ``None`` means no object was created.
        	/AcroFormN/Fields/Tz/Kidsr   c                 S   s   g | ]\}}|d kr|qS )r   rR   )r`   grv   rR   rR   rS   
<listcomp>=  s      z.PdfReader.add_form_topname.<locals>.<listcomp>r   z/ParentzTop Level Form Field z have a non-expected parent)rW   r@   r&   r   r)   r-   r   maxr;   r#   appendrt   rK   r   r?   )rO   r   catalogacroforminterimZarror   rR   rR   rS   add_form_topname#  s:    

 zPdfReader.add_form_topnamec                 C   sr   | j }d|kst|d ts dS tt|td }d|kr>dS tttt|td d  }t||td< |S )z
        Rename top level form field that all form fields below it.

        Args:
            name: text string of the "/T" field of the created object

        Returns:
            The modified object. ``None`` means no object was modified.
        r   Nr   r   r   )rW   r@   r&   r   r)   r#   rK   r-   )rO   r   r   r   r   rR   rR   rS   rename_form_topnameM  s    

 zPdfReader.rename_form_topname)FN)NN)@r?   
__module____qualname____doc__r   r   r   r   rA   r   rT   propertyr&   rW   r
   rZ   r#   r\   r   r   r   rg   rm   r/   rn   ry   r   rs   r*   r(   r{   r,   r   rK   r   r   r   r   r   r   rC   r   r   r   r   r   r   r   r$   r'   r%   r   staticmethodr   r   r	   r   r   r   r   r   rF   r   r   rR   rR   rR   rS   r0   U   s     
4  
;
    
:w  1 *(" 	*r0   )<r   r   ior   r   pathlibr   typingr   r   r   r   r	   r
   r   r   r   Z_doc_commonr   r   rE   r   r   Z_pager   _utilsr   r   r   r   r   r   r   r   r   	constantsr   rI   errorsr   r   r    r!   r"   Zgenericr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   xmpr/   r0   rR   rR   rR   rS   <module>   s   ,,8