Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

verbosity and CharacterTextSplitter #35

Merged
merged 8 commits into from
Oct 18, 2023
Merged
146 changes: 146 additions & 0 deletions src/libs/LangChain.Core/Base/TextSplitter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
using LangChain.Docstore;

namespace LangChain.Base;

/// <summary>
/// Base class for components that split text into chunks and wrap the chunks in documents.
/// </summary>
/// <remarks>
/// Ported from langchain/text_splitter.py.
/// </remarks>
public abstract class TextSplitter
{
    private readonly int _chunkSize;
    private readonly int _chunkOverlap;
    private readonly Func<string, int> _lengthFunction;

    /// <summary>
    /// Initializes the chunking parameters shared by all splitters.
    /// </summary>
    /// <param name="chunkSize">Size threshold used by <see cref="MergeSplits"/> to decide when a chunk is full.</param>
    /// <param name="chunkOverlap">Amount of trailing content of a finished chunk that may be carried over into the next one.</param>
    /// <param name="lengthFunction">Measures a piece of text; when null, the number of characters of the string is used.</param>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="chunkOverlap"/> is greater than <paramref name="chunkSize"/>.
    /// </exception>
    protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string,int>? lengthFunction = null)
    {
        if (chunkOverlap > chunkSize)
        {
            throw new ArgumentException($"Chunk overlap ({chunkOverlap}) is greater than chunk size ({chunkSize}).");
        }

        _chunkSize = chunkSize;
        _chunkOverlap = chunkOverlap;
        _lengthFunction = lengthFunction ?? (str => str.Length);
    }

    /// <summary>
    /// Splits a single text into chunks.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The chunks, in order of appearance.</returns>
    public abstract List<string> SplitText(string text);

    /// <summary>
    /// Create documents from a list of texts.
    /// </summary>
    /// <param name="texts">Texts to split; each one is passed to <see cref="SplitText"/>.</param>
    /// <param name="metadatas">
    /// Optional metadata, one entry per text. When null, every text gets its own empty dictionary.
    /// All chunks produced from the same text reference the same metadata instance.
    /// </param>
    /// <returns>One document per chunk, in the order the texts and chunks were produced.</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="texts"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// If the number of texts and metadata(when not null) are not equal, this method will throw an ArgumentException.
    /// </exception>
    public List<Document> CreateDocuments(List<string> texts, List<Dictionary<string, object>>? metadatas = null)
    {
        if (texts is null)
        {
            throw new ArgumentNullException(nameof(texts));
        }

        // If no metadata is provided, create a separate empty dictionary per text.
        // (Enumerable.Repeat would hand the very same instance to every document, so a
        // change to one document's metadata would show up on all the others.)
        metadatas ??= texts.Select(_ => new Dictionary<string, object>()).ToList();

        if (texts.Count != metadatas.Count)
        {
            throw new ArgumentException("Number of texts and metadata must be equal.");
        }

        var documents = new List<Document>();

        // each text is split into chunks, and each chunk is added to the list of documents
        for (int i = 0; i < texts.Count; i++)
        {
            var metadata = metadatas[i];

            foreach (var chunk in SplitText(texts[i]))
            {
                documents.Add(new Document(chunk, metadata));
            }
        }

        return documents;
    }

    /// <summary>
    /// Splits the content of existing documents, keeping each source document's metadata on its chunks.
    /// </summary>
    /// <param name="documents">Documents whose <c>PageContent</c> should be split.</param>
    /// <returns>One document per produced chunk.</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="documents"/> is null.</exception>
    public List<Document> SplitDocuments(List<Document> documents)
    {
        if (documents is null)
        {
            throw new ArgumentNullException(nameof(documents));
        }

        var texts = documents.Select(doc => doc.PageContent).ToList();
        var metadatas = documents.Select(doc => doc.Metadata).ToList();

        return CreateDocuments(texts, metadatas);
    }

    /// <summary>
    /// Joins a list of strings with a separator, trims the result,
    /// and returns null if the resulting string is empty.
    /// </summary>
    /// <param name="docs">Pieces to join.</param>
    /// <param name="separator">String placed between consecutive pieces.</param>
    protected string? JoinDocs(List<string> docs, string separator)
    {
        var text = string.Join(separator, docs).Trim();
        return string.IsNullOrEmpty(text) ? null : text;
    }

    /// <summary>
    /// Merges a sequence of small splits into chunks governed by the configured chunk size and overlap.
    /// </summary>
    /// <remarks>
    /// Only the lengths of the splits are counted; the length of <paramref name="separator"/> is not
    /// added to the running total, so a joined chunk can be longer than the configured chunk size.
    /// </remarks>
    /// <param name="splits">Pieces of text, in order.</param>
    /// <param name="separator">String used to join the pieces that end up in the same chunk.</param>
    /// <returns>The merged chunks; chunks that are empty after trimming are omitted.</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="splits"/> is null.</exception>
    protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
    {
        if (splits is null)
        {
            throw new ArgumentNullException(nameof(splits));
        }

        var docs = new List<string>(); // resulting chunks
        var currentDoc = new List<string>(); // pieces of the chunk being built
        int total = 0; // summed length of the pieces in currentDoc

        foreach (var split in splits)
        {
            int len = _lengthFunction(split);

            // if we can't fit the next split into the current chunk
            if (total + len >= _chunkSize)
            {
                // if the chunk was already too big
                if (total > _chunkSize)
                {
                    // todo: Implement a logger
                    // todo: Log a warning about a split that is larger than the chunk size
                }

                if (currentDoc.Count > 0)
                {
                    // join all the pieces of the current chunk and add it to the result
                    var doc = JoinDocs(currentDoc, separator);
                    if (doc != null)
                    {
                        docs.Add(doc);
                    }

                    // Drop pieces from the beginning of the chunk until what is left fits into the
                    // overlap and leaves room for the next split. The Count check keeps the loop
                    // from indexing an empty list (e.g. when the overlap is negative).
                    while (currentDoc.Count > 0
                           && (total > _chunkOverlap || (total + len > _chunkSize && total > 0)))
                    {
                        total -= _lengthFunction(currentDoc[0]);
                        currentDoc.RemoveAt(0);
                    }
                }
            }

            // add the next split to the current chunk
            currentDoc.Add(split);
            total += len; // keep the running length in sync with currentDoc
        }

        // add the last chunk
        var lastDoc = JoinDocs(currentDoc, separator);
        if (lastDoc != null)
        {
            docs.Add(lastDoc);
        }

        return docs;
    }

    // todo: Implement from_huggingface_tokenizer
    // todo: Implement from_tiktoken_encoder
}
15 changes: 15 additions & 0 deletions src/libs/LangChain.Core/Chains/LLM/LLMChain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,24 @@ public override async Task<IChainValues> CallAsync(IChainValues values)
}

BasePromptValue promptValue = await Prompt.FormatPromptValue(new InputValues(values.Value));
var chatMessages = promptValue.ToChatMessages();
if (Verbose == true)
{

Console.WriteLine(string.Join("\n\n", chatMessages));
Console.WriteLine("\n".PadLeft(Console.WindowWidth, '>'));
}
var response = await Llm.GenerateAsync(new ChatRequest(promptValue.ToChatMessages(), stop));
if (Verbose == true)
{

Console.WriteLine(string.Join("\n\n", response.Messages.Except(chatMessages)));
Console.WriteLine("\n".PadLeft(Console.WindowWidth, '<'));
}

if(string.IsNullOrEmpty(OutputKey))
return new ChainValues(response.Messages.Last().Content);

return new ChainValues(OutputKey,response.Messages.Last().Content);
}

Expand Down
78 changes: 78 additions & 0 deletions src/libs/LangChain.Core/Docstore/Document.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
namespace LangChain.Docstore;

/// <summary>
/// Class for storing a document: its text content plus associated metadata.
/// </summary>
/// <remarks>
/// - no BaseModel implementation from pydantic
/// - ported from langchain/docstore/document.py
/// </remarks>
public class Document
{
    /// <summary>
    /// Creates a document from its text and metadata.
    /// </summary>
    /// <param name="content">Text stored in <see cref="PageContent"/>.</param>
    /// <param name="metadata">Dictionary stored (by reference) in <see cref="Metadata"/>.</param>
    public Document(string content, Dictionary<string, object> metadata)
    {
        PageContent = content;
        Metadata = metadata;
    }

    /// <summary>Text of the document.</summary>
    public string PageContent { get; set; }

    /// <summary>Zero-based index of the match returned by the most recent <see cref="Lookup"/> call.</summary>
    public int LookupIndex { get; set; }

    /// <summary>Lower-cased term of the most recent <see cref="Lookup"/> call; null before the first lookup.</summary>
    public string? LookupStr { get; set; }

    /// <summary>Metadata attached to the document.</summary>
    public Dictionary<string, object> Metadata { get; set; }

    /// <summary>
    /// Paragraphs of the page, i.e. the content split on blank lines ("\n\n").
    /// </summary>
    public List<string> Paragraphs()
    {
        return PageContent.Split(new[] { "\n\n" }, StringSplitOptions.None).ToList();
    }

    /// <summary>
    /// Summary of the page (the first paragraph)
    /// </summary>
    public string Summary()
    {
        return Paragraphs()[0];
    }

    /// <summary>
    /// Lookup a term in the page, imitating cmd-F functionality.
    /// Repeating the same term (ignoring case) advances to the next matching paragraph;
    /// a different term starts again at the first match.
    /// </summary>
    /// <param name="searchString">Term to search for, matched case-insensitively.</param>
    /// <returns>
    /// "(Result i/n) paragraph" for the current match, "No Results" when no paragraph contains the term,
    /// or "No More Results" when all matches have already been returned.
    /// </returns>
    /// <exception cref="ArgumentNullException">If <paramref name="searchString"/> is null.</exception>
    public string Lookup(string searchString)
    {
        if (searchString is null)
        {
            throw new ArgumentNullException(nameof(searchString));
        }

        // Normalize with the invariant culture so the result does not depend on the
        // current thread culture.
        var term = searchString.ToLowerInvariant();

        // if there is a new search string, reset the index
        if (!string.Equals(term, LookupStr, StringComparison.Ordinal))
        {
            LookupStr = term;
            LookupIndex = 0;
        }
        else
        {
            LookupIndex++;
        }

        // get all the paragraphs that contain the search string
        var lookups = Paragraphs()
            .Where(p => p.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0)
            .ToList();

        if (lookups.Count == 0)
        {
            return "No Results";
        }

        if (LookupIndex >= lookups.Count)
        {
            return "No More Results";
        }

        string resultPrefix = $"(Result {LookupIndex + 1}/{lookups.Count})";
        return $"{resultPrefix} {lookups[LookupIndex]}";
    }

    /// <summary>
    /// Text representation containing the content, lookup state and metadata entries.
    /// </summary>
    public override string ToString()
    {
        var serializedMetadata = string.Join(", ", Metadata.Select(x => $"{{{x.Key}:{x.Value}}}"));
        return $"(PageContent='{PageContent}', LookupStr='{LookupStr}', Metadata={serializedMetadata}), LookupIndex={LookupIndex}";
    }
}
5 changes: 5 additions & 0 deletions src/libs/LangChain.Core/LangChain.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,9 @@
<ProjectReference Include="..\Providers\LangChain.Providers.Abstractions\LangChain.Providers.Abstractions.csproj" />
</ItemGroup>

<ItemGroup>
<Folder Include="Docstore\" />
<Folder Include="TextSplitters\" />
</ItemGroup>

</Project>
30 changes: 30 additions & 0 deletions src/libs/LangChain.Core/TextSplitters/CharacterTextSplitter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using LangChain.Base;

namespace LangChain.TextSplitters;

/// <summary>
/// Implementation of splitting text that looks at characters:
/// the text is cut on a fixed separator string and the pieces are merged into chunks.
/// </summary>
public class CharacterTextSplitter : TextSplitter
{
    private readonly string? _separator;

    /// <summary>
    /// Creates a splitter that cuts text on <paramref name="separator"/>.
    /// </summary>
    /// <param name="separator">
    /// String to split on. When null or empty the text is not cut and is handed to the merge step as a single piece.
    /// </param>
    /// <param name="chunkSize">Forwarded to the base class.</param>
    /// <param name="chunkOverlap">Forwarded to the base class.</param>
    /// <param name="lengthFunction">Forwarded to the base class.</param>
    public CharacterTextSplitter(string? separator = "\n\n", int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
        : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separator = separator;
    }

    /// <summary>
    /// Splits <paramref name="text"/> on the configured separator and merges the pieces into chunks.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The chunks produced by the base class merge step.</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="text"/> is null.</exception>
    public override List<string> SplitText(string text)
    {
        if (text is null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        // An empty separator must not reach String.Split: a separator array without any
        // non-empty string makes Split fall back to white-space delimiters, and joining the
        // pieces with "" afterwards would silently delete all white space from the text.
        if (string.IsNullOrEmpty(_separator))
        {
            return MergeSplits(new List<string> { text }, string.Empty);
        }

        var splits = text.Split(new[] { _separator }, StringSplitOptions.None).ToList();
        return MergeSplits(splits, _separator);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,13 @@
{
return left + right;
}

/// <summary>
/// Formats the message as text: "Role(FunctionName):" followed by the content on a new line
/// when <c>FunctionName</c> is not null, otherwise "Role: Content" on a single line.
/// </summary>
public override string ToString()

Check warning on line 44 in src/libs/Providers/LangChain.Providers.Abstractions/Models/Message.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Missing XML comment for publicly visible type or member 'Message.ToString()'

Check warning on line 44 in src/libs/Providers/LangChain.Providers.Abstractions/Models/Message.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Missing XML comment for publicly visible type or member 'Message.ToString()'

Check warning on line 44 in src/libs/Providers/LangChain.Providers.Abstractions/Models/Message.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Missing XML comment for publicly visible type or member 'Message.ToString()'

Check warning on line 44 in src/libs/Providers/LangChain.Providers.Abstractions/Models/Message.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Missing XML comment for publicly visible type or member 'Message.ToString()'
{
// a function name, when present, is shown in parentheses after the role
if (FunctionName!=null)
{
return $"{Role}({FunctionName}):\n{Content}";
}
return $"{Role}: {Content}";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
<Compile Remove="Resources\**\*.*" />
</ItemGroup>


<ItemGroup>
<PackageReference Include="H.Resources.Generator">
<PrivateAssets>all</PrivateAssets>
Expand All @@ -16,6 +17,7 @@
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\libs\LangChain.Core\LangChain.Core.csproj" />
<ProjectReference Include="..\..\libs\Splitters\LangChain.Splitters.CSharp\LangChain.Splitters.CSharp.csproj" />
</ItemGroup>

Expand Down
Loading
Loading