Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Major improvements: new functionalities, better code readability, fixed bugs, improved interface, state-of-the-art free open-source model as default #45

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
OPENAI_API_KEY=
HUGGINGFACEHUB_API_TOKEN=
HUGGINGFACEHUB_API_TOKEN=
135 changes: 77 additions & 58 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""
For PyCharm users:
The "configuration" in Pycharm that allows for both Run & Debug:
use as the module: streamlit (instead of script -> python.exe)
use as the script parameters: run app.py
"""

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import pypdfium2 as pdfium # Check leaderboard here: https://github.com/py-pdf/benchmarks # yiwei-ang:feature/pdfium
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
Expand All @@ -10,94 +17,106 @@
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub


def get_pdf_text(pdf_docs):
    """Extract plain text from a list of uploaded PDF files.

    Args:
        pdf_docs: iterable of file-like objects (e.g. Streamlit UploadedFile).

    Returns:
        str: the concatenated text of every page of every document,
        with a newline appended after each page.
    """
    # pdfium chosen for speed/quality; see https://github.com/py-pdf/benchmarks
    text = ""
    for pdf in pdf_docs:
        pdf_reader = pdfium.PdfDocument(pdf)
        for page_index in range(len(pdf_reader)):
            page = pdf_reader.get_page(page_index)
            textpage = page.get_textpage()
            text += textpage.get_text_range() + "\n"
    return text


def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: the full extracted text of one or more documents.

    Returns:
        list[str]: newline-separated chunks of at most 5000 characters
        with 500 characters of overlap between consecutive chunks.
    """
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=5000,
                                          chunk_overlap=500, length_function=len)
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed text chunks and index them in an in-memory FAISS vector store.

    Args:
        text_chunks: list of text chunks to embed.

    Returns:
        A FAISS vector store built from the embedded chunks.
    """
    # Embedding model leaderboard: https://huggingface.co/spaces/mteb/leaderboard
    # Alternative (paid, needs OPENAI_API_KEY): embeddings = OpenAIEmbeddings()
    embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-large-en-v1.5")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        A ConversationalRetrievalChain combining the LLM, a retriever over
        ``vectorstore``, and a buffer memory keyed as ``chat_history``.
    """
    # LLM leaderboard: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
    # Alternatives: ChatOpenAI() (paid) or
    # HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
    llm = HuggingFaceHub(repo_id="HuggingFaceH4/zephyr-7b-alpha",
                         model_kwargs={"temperature": 0.1,
                                       "max_new_tokens": 1000})
    memory = ConversationBufferMemory(memory_key='chat_history',
                                      return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm=llm,
                                                 retriever=vectorstore.as_retriever(),
                                                 memory=memory)


def save_question_and_clear_prompt(ss):
    """on_change callback for the prompt text_input.

    Copies the typed question from the prompt bar into ``ss.user_question``
    and empties the prompt bar, so a Streamlit rerun does not automatically
    re-submit the same question.

    Args:
        ss: the Streamlit session state (attribute-style mapping holding
            ``prompt_bar`` and ``user_question``).
    """
    ss.user_question = ss.prompt_bar
    ss.prompt_bar = ""  # clearing the prompt bar prevents automatic re-submissions

def write_chat(msgs):
    """Render an alternating question/answer history as styled chat bubbles.

    Even-indexed messages are treated as user questions, odd-indexed ones as
    bot answers; each is injected into the matching HTML template.
    """
    for idx, message in enumerate(msgs):
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)

def main():
    """Streamlit entry point: page setup, PDF ingestion sidebar, chat loop."""
    load_dotenv()  # loads API keys from .env
    ss = st.session_state  # https://docs.streamlit.io/library/api-reference/session-state

    # Page design
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    st.header("Chat with multiple PDFs :books:")

    # Initializing session state variables on first run
    if "conversation_chain" not in ss:
        ss.conversation_chain = None  # the main variable storing the llm, retriever and memory
    if "prompt_bar" not in ss:
        ss.prompt_bar = ""
    if "user_question" not in ss:
        ss.user_question = ""
    if "docs_are_processed" not in ss:
        ss.docs_are_processed = False

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your PDFs here and click 'Process'",
                                    accept_multiple_files=True, type="pdf")
        if st.button("Process") and pdf_docs:
            with st.spinner("Processing"):
                raw_text = get_pdf_text(pdf_docs)  # get pdf text
                text_chunks = get_text_chunks(raw_text)  # get the text chunks
                vectorstore = get_vectorstore(text_chunks)  # create vector store
                ss.conversation_chain = get_conversation_chain(vectorstore)  # create conversation chain
                ss.docs_are_processed = True
        if ss.docs_are_processed:
            st.text('Documents processed')

    # BUG FIX: pass the callback itself plus its args; calling
    # save_question_and_clear_prompt(ss) here would run it on every rerun
    # and register its return value (None) as the callback.
    st.text_input("Ask a question here:", key='prompt_bar',
                  on_change=save_question_and_clear_prompt, args=(ss,))

    # Guard on conversation_chain: a question typed before any document is
    # processed would otherwise call None.
    if ss.user_question and ss.conversation_chain is not None:
        ss.conversation_chain({'question': ss.user_question})  # This is what gets the response from the LLM!
        if hasattr(ss.conversation_chain.memory, 'chat_memory'):
            write_chat(ss.conversation_chain.memory.chat_memory.messages)

    if hasattr(ss.conversation_chain, 'memory'):  # There is memory if the documents have been processed
        if hasattr(ss.conversation_chain.memory, 'chat_memory'):  # There is chat_memory if questions have been asked
            if st.button("Forget conversation"):  # adding a button
                ss.conversation_chain.memory.chat_memory.clear()  # clears the ConversationBufferMemory

    # st.write(ss)  # use this when debugging for visualizing the session_state variables


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions htmlTemplates.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
object-fit: cover;
}
.chat-message .message {
width: 80%;
width: 100%;
padding: 0 1.5rem;
color: #fff;
}
Expand All @@ -37,7 +37,7 @@
user_template = '''
<div class="chat-message user">
<div class="avatar">
<img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
<img src="https://www.freeiconspng.com/uploads/grab-vector-graphic-person-icon--imagebasket-13.png" width="350" alt="Grab Vector Graphic Person Icon | imagebasket" />
</div>
<div class="message">{{MSG}}</div>
</div>
Expand Down
16 changes: 3 additions & 13 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,4 @@
langchain==0.0.184
PyPDF2==3.0.1
langchain==0.0.329
pypdfium2==4.23.1
python-dotenv==1.0.0
streamlit==1.18.1
openai==0.27.6
faiss-cpu==1.7.4
altair==4
tiktoken==0.4.0
# uncomment to use huggingface llms
# huggingface-hub==0.14.1

# uncomment to use instructor embeddings
# InstructorEmbedding==1.0.1
# sentence-transformers==2.2.2
streamlit==1.27.2