Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Major improvements: new functionalities, better code readability, fixed bugs, improved interface, state-of-the-art free open-source model as default #45

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
OPENAI_API_KEY=
HUGGINGFACEHUB_API_TOKEN=
HUGGINGFACEHUB_API_TOKEN=
135 changes: 77 additions & 58 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""
For PyCharm users:
The "configuration" in Pycharm that allows for both Run & Debug:
use as the module: streamlit (instead of script -> python.exe)
use as the script parameters: run app.py
"""

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import pypdfium2 as pdfium # Check leaderboard here: https://github.com/py-pdf/benchmarks # yiwei-ang:feature/pdfium
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
Expand All @@ -10,94 +17,106 @@
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub


def get_pdf_text(pdf_docs):
    """Extract plain text from a list of uploaded PDF files.

    Args:
        pdf_docs: iterable of file-like objects (e.g. Streamlit UploadedFile).

    Returns:
        str: the concatenated text of every page of every document,
        with a newline appended after each page.
    """
    # pdfium chosen for speed/quality; see https://github.com/py-pdf/benchmarks
    text = ""
    for pdf in pdf_docs:
        pdf_reader = pdfium.PdfDocument(pdf)
        for page_index in range(len(pdf_reader)):
            page = pdf_reader.get_page(page_index)
            textpage = page.get_textpage()
            text += textpage.get_text_range() + "\n"
    return text


def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: the full extracted text of one or more documents.

    Returns:
        list[str]: newline-separated chunks of at most 5000 characters
        with 500 characters of overlap between consecutive chunks.
    """
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=5000,
                                          chunk_overlap=500, length_function=len)
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed text chunks and index them in an in-memory FAISS vector store.

    Args:
        text_chunks: list of text chunks to embed.

    Returns:
        A FAISS vector store built from the embedded chunks.
    """
    # Embedding model leaderboard: https://huggingface.co/spaces/mteb/leaderboard
    # Alternative (paid, needs OPENAI_API_KEY): embeddings = OpenAIEmbeddings()
    embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-large-en-v1.5")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        A ConversationalRetrievalChain combining the LLM, a retriever over
        ``vectorstore``, and a buffer memory keyed as ``chat_history``.
    """
    # LLM leaderboard: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
    # Alternatives: ChatOpenAI() (paid) or
    # HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
    llm = HuggingFaceHub(repo_id="HuggingFaceH4/zephyr-7b-alpha",
                         model_kwargs={"temperature": 0.1,
                                       "max_new_tokens": 1000})
    memory = ConversationBufferMemory(memory_key='chat_history',
                                      return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm=llm,
                                                 retriever=vectorstore.as_retriever(),
                                                 memory=memory)


def save_question_and_clear_prompt(ss):
    """on_change callback for the prompt text_input.

    Copies the typed question from the prompt bar into ``ss.user_question``
    and empties the prompt bar, so a Streamlit rerun does not automatically
    re-submit the same question.

    Args:
        ss: the Streamlit session state (attribute-style mapping holding
            ``prompt_bar`` and ``user_question``).
    """
    ss.user_question = ss.prompt_bar
    ss.prompt_bar = ""  # clearing the prompt bar prevents automatic re-submissions

def write_chat(msgs):
    """Render an alternating question/answer history as styled chat bubbles.

    Even-indexed messages are treated as user questions, odd-indexed ones as
    bot answers; each is injected into the matching HTML template.
    """
    for idx, message in enumerate(msgs):
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)

def main():
    """Streamlit entry point: page setup, PDF ingestion sidebar, chat loop."""
    load_dotenv()  # loads API keys from .env
    ss = st.session_state  # https://docs.streamlit.io/library/api-reference/session-state

    # Page design
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    st.header("Chat with multiple PDFs :books:")

    # Initializing session state variables on first run
    if "conversation_chain" not in ss:
        ss.conversation_chain = None  # the main variable storing the llm, retriever and memory
    if "prompt_bar" not in ss:
        ss.prompt_bar = ""
    if "user_question" not in ss:
        ss.user_question = ""
    if "docs_are_processed" not in ss:
        ss.docs_are_processed = False

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your PDFs here and click 'Process'",
                                    accept_multiple_files=True, type="pdf")
        if st.button("Process") and pdf_docs:
            with st.spinner("Processing"):
                raw_text = get_pdf_text(pdf_docs)  # get pdf text
                text_chunks = get_text_chunks(raw_text)  # get the text chunks
                vectorstore = get_vectorstore(text_chunks)  # create vector store
                ss.conversation_chain = get_conversation_chain(vectorstore)  # create conversation chain
                ss.docs_are_processed = True
        if ss.docs_are_processed:
            st.text('Documents processed')

    # BUG FIX: pass the callback itself plus its args; calling
    # save_question_and_clear_prompt(ss) here would run it on every rerun
    # and register its return value (None) as the callback.
    st.text_input("Ask a question here:", key='prompt_bar',
                  on_change=save_question_and_clear_prompt, args=(ss,))

    # Guard on conversation_chain: a question typed before any document is
    # processed would otherwise call None.
    if ss.user_question and ss.conversation_chain is not None:
        ss.conversation_chain({'question': ss.user_question})  # This is what gets the response from the LLM!
        if hasattr(ss.conversation_chain.memory, 'chat_memory'):
            write_chat(ss.conversation_chain.memory.chat_memory.messages)

    if hasattr(ss.conversation_chain, 'memory'):  # There is memory if the documents have been processed
        if hasattr(ss.conversation_chain.memory, 'chat_memory'):  # There is chat_memory if questions have been asked
            if st.button("Forget conversation"):  # adding a button
                ss.conversation_chain.memory.chat_memory.clear()  # clears the ConversationBufferMemory

    # st.write(ss)  # use this when debugging for visualizing the session_state variables


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions htmlTemplates.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
object-fit: cover;
}
.chat-message .message {
width: 80%;
width: 100%;
padding: 0 1.5rem;
color: #fff;
}
Expand All @@ -37,7 +37,7 @@
user_template = '''
<div class="chat-message user">
<div class="avatar">
<img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
<img src="https://www.freeiconspng.com/uploads/grab-vector-graphic-person-icon--imagebasket-13.png" width="350" alt="Grab Vector Graphic Person Icon | imagebasket" />
</div>
<div class="message">{{MSG}}</div>
</div>
Expand Down
16 changes: 3 additions & 13 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,4 @@
langchain==0.0.184
PyPDF2==3.0.1
langchain==0.0.329
pypdfium2==4.23.1
python-dotenv==1.0.0
streamlit==1.18.1
openai==0.27.6
faiss-cpu==1.7.4
altair==4
tiktoken==0.4.0
# uncomment to use huggingface llms
# huggingface-hub==0.14.1

# uncomment to use instructor embeddings
# InstructorEmbedding==1.0.1
# sentence-transformers==2.2.2
streamlit==1.27.2