2023-07-11 14:52:08 -07:00
|
|
|
# Copyright (c) Microsoft. All rights reserved.

import uuid
from collections.abc import AsyncGenerator, Sequence
from contextlib import asynccontextmanager
from typing import Annotated, Any

import pandas as pd
import pytest
import pytest_asyncio
from pydantic import BaseModel

from semantic_kernel.connectors.postgres import PostgresCollection, PostgresSettings, PostgresStore
from semantic_kernel.data.vector import (
    DistanceFunction,
    IndexKind,
    VectorStoreCollectionDefinition,
    VectorStoreField,
    vectorstoremodel,
)
from semantic_kernel.exceptions.memory_connector_exceptions import (
    MemoryConnectorConnectionException,
    MemoryConnectorInitializationError,
)

try:
    import psycopg  # noqa: F401
    import psycopg_pool  # noqa: F401

    psycopg_pool_installed = True
except ImportError:
    psycopg_pool_installed = False


pg_settings: PostgresSettings = PostgresSettings()
try:
    # Any truthy connection argument (host, dbname, ...) counts as "configured".
    connection_params_present = any(pg_settings.get_connection_args().values())
except MemoryConnectorInitializationError:
    connection_params_present = False

# Skip the whole module unless BOTH the driver is installed AND connection
# parameters are configured — the tests need the two together.
# NOTE: the previous condition used `or`, which only skipped when both were
# missing; under `or` the `else` branch of the reason expression was
# unreachable, which shows `and` was the intended operator.
pytestmark = pytest.mark.skipif(
    not (psycopg_pool_installed and connection_params_present),
    reason="psycopg_pool is not installed" if not psycopg_pool_installed else "No connection parameters provided",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@vectorstoremodel
class SimpleDataModel(BaseModel):
    """Minimal vector-store record: an int key, a 3-d vector, and a JSONB payload."""

    # Primary key of the record.
    id: Annotated[int, VectorStoreField("key")]
    # 3-dimensional embedding indexed with HNSW under cosine similarity.
    # May also hold a str or be None (defaulted below).
    embedding: Annotated[
        list[float] | str | None,
        VectorStoreField(
            "vector",
            index_kind=IndexKind.HNSW,
            dimensions=3,
            distance_function=DistanceFunction.COSINE_SIMILARITY,
        ),
    ] = None
    # Arbitrary payload persisted as JSONB.
    data: Annotated[
        dict[str, Any],
        VectorStoreField("data", type="JSONB"),
    ]

    def model_post_init(self, context: Any) -> None:
        # When no embedding is supplied, fall back to the raw payload
        # (presumably as content to be vectorized downstream — TODO confirm).
        if self.embedding is None:
            self.embedding = self.data
|
|
|
|
|
|
2024-10-07 17:08:19 -04:00
|
|
|
|
|
|
|
|
def DataModelPandas(record) -> tuple:
    """Build a container-mode (pandas) collection definition and a one-row frame.

    Returns the (definition, DataFrame) pair for *record*, mirroring the
    fields of SimpleDataModel.
    """
    # Field layout matching SimpleDataModel: vector, key, payload.
    fields = [
        VectorStoreField(
            "vector",
            name="embedding",
            index_kind="hnsw",
            dimensions=3,
            distance_function="cosine_similarity",
            type="float",
        ),
        VectorStoreField("key", name="id", type="int"),
        VectorStoreField("data", name="data", type="dict"),
    ]
    definition = VectorStoreCollectionDefinition(
        fields=fields,
        container_mode=True,
        to_dict=lambda frame: frame.to_dict(orient="records"),
        from_dict=lambda rows, **_: pd.DataFrame(rows),
    )
    return definition, pd.DataFrame([record])
|
|
|
|
|
|
|
|
|
|
|
2024-12-12 22:08:46 +01:00
|
|
|
@pytest_asyncio.fixture
async def vector_store() -> AsyncGenerator[PostgresStore, None]:
    """Yield a PostgresStore backed by a fresh connection pool.

    Skips the requesting test when a connection cannot be established.
    """
    try:
        async with await pg_settings.create_connection_pool() as pool:
            yield PostgresStore(connection_pool=pool)
    except MemoryConnectorConnectionException:
        # pytest.skip raises Skipped, so control never returns here; the
        # original's trailing `yield None` / `return` were unreachable and
        # have been removed.
        pytest.skip("Postgres connection not available")
|
2024-10-07 17:08:19 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@asynccontextmanager
async def create_simple_collection(
    vector_store: PostgresStore,
) -> AsyncGenerator[PostgresCollection[int, SimpleDataModel], None]:
    """Returns a collection with a unique name that is deleted after the context.

    This can be moved to use a fixture with scope=function and loop_scope=session
    after upgrade to pytest-asyncio 0.24. With the current version, the fixture
    would both cache and use the event loop of the declared scope.
    """
    # Random 8-char suffix keeps concurrent test runs from colliding.
    unique_name = f"test_collection_{uuid.uuid4().hex[:8]}"
    collection = vector_store.get_collection(collection_name=unique_name, record_type=SimpleDataModel)
    assert isinstance(collection, PostgresCollection)
    await collection.ensure_collection_exists()
    try:
        yield collection
    finally:
        await collection.ensure_collection_deleted()
|
2023-07-11 14:52:08 -07:00
|
|
|
|
|
|
|
|
|
2024-10-07 17:08:19 -04:00
|
|
|
def test_create_store(vector_store):
|
Python: Introducing vector and text search (#9345)
### Motivation and Context
<!-- Thank you for your contribution to the semantic-kernel repo!
Please help reviewers and future users, providing the following
information:
1. Why is this change required?
2. What problem does it solve?
3. What scenario does it contribute to?
4. If it fixes an open issue, please link to the issue here.
-->
This PR does the following things:
- Introduces TextSearch abstractions, including implementation for Bing
- This consists of the TextSearch class, which implements three public
search methods, and handles the internals, the search methods are:
'search' returns a string, 'get_text_search_results' returns a
TextSearchResult object and 'get_search_results' returns a object native
to the search service (i.e. BingWebPages for Bing)
- This also has a method called "create_{search_method}' which returns a
KernelFunction based on the search method. This function can be adapted
by setting the parameters and has several adaptability options and
allows you to create a RAG pipeline easily with custom names and
descriptions of both the functions and the parameters!
- Introduces VectorSearch abstractions, including implementation for
Azure AI Search
- This consists of a VectorStoreBase class which handles all the
internal and three public interfaces, vectorized_search (supply a
vector), vectorizable_text_search (supply a string that get's vectorized
downstream), vector_text_search (supply a string), each vector store
record collection can pick and choose which ones they need to support by
importing one or more next to the VectorSearchBase class.
- Introduces VectorStoreTextSearch as a way to leverage text search
against vector stores
- Since this builds on TextSearch this is now the best way to create a
super powerfull RAG setup with your own data model!
- Adds all the related classes, samples and tests for the above.
- Also reorders the data folder, which might cause some slight breaking
changes for the few stores that have the new vector store model.
- Adds additional IndexKinds and DistanceFunctions to stay in sync with
dotnet.
- Renames VolatileStore and VolatileCollection to InMemoryVectorStore
and InMemoryVectorCollection.
Closes #6832 #6833
### Contribution Checklist
<!-- Before submitting this PR, please make sure: -->
- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [ ] I didn't break anyone :smile:
---------
Co-authored-by: Tao Chen <taochen@microsoft.com>
2024-11-06 14:22:50 +01:00
|
|
|
assert vector_store is not None
|
2024-10-07 17:08:19 -04:00
|
|
|
assert vector_store.connection_pool is not None
|
2023-07-11 14:52:08 -07:00
|
|
|
|
|
|
|
|
|
2025-06-19 20:34:59 +02:00
|
|
|
async def test_ensure_collection_exists_exists_and_delete(vector_store: PostgresStore):
    """Lifecycle check: collection_exists() flips with create and delete."""
    name = f"test_collection_{str(uuid.uuid4()).replace('-', '')[:8]}"

    collection = vector_store.get_collection(collection_name=name, record_type=SimpleDataModel)

    # Fresh name: must not exist yet.
    assert await collection.collection_exists() is False

    await collection.ensure_collection_exists()
    assert await collection.collection_exists() is True

    await collection.ensure_collection_deleted()
    assert await collection.collection_exists() is False
|
2023-07-11 14:52:08 -07:00
|
|
|
|
|
|
|
|
|
2024-10-07 17:08:19 -04:00
|
|
|
async def test_list_collection_names(vector_store):
    """A freshly created collection shows up in list_collection_names()."""
    async with create_simple_collection(vector_store) as collection:
        names = await vector_store.list_collection_names()
        assert collection.collection_name in names
|
2023-07-11 14:52:08 -07:00
|
|
|
|
|
|
|
|
|
2024-10-07 17:08:19 -04:00
|
|
|
async def test_upsert_get_and_delete(vector_store: PostgresStore):
    """Round-trip a single record and verify the embedding index exists."""
    record = SimpleDataModel(id=1, embedding=[1.1, 2.2, 3.3], data={"key": "value"})
    async with create_simple_collection(vector_store) as collection:
        # Nothing stored yet.
        assert await collection.get(1) is None

        await collection.upsert(record)
        fetched = await collection.get(1)
        assert fetched is not None
        assert fetched.id == record.id
        assert fetched.embedding == record.embedding
        assert fetched.data == record.data

        # The backing table must carry an index on the embedding column.
        pool = collection.connection_pool
        async with pool.connection() as conn, conn.cursor() as cur:
            await cur.execute(
                "SELECT indexname FROM pg_indexes WHERE tablename = %s", (collection.collection_name,)
            )
            names = [row[0] for row in await cur.fetchall()]
            assert any("embedding_idx" in name for name in names)

        await collection.delete(1)
        assert await collection.get(1) is None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def test_upsert_get_and_delete_pandas(vector_store):
    """Round-trip a record through a container-mode (pandas) collection."""
    record = SimpleDataModel(id=1, embedding=[1.1, 2.2, 3.3], data={"key": "value"})
    definition, frame = DataModelPandas(record.model_dump())

    suffix = str(uuid.uuid4()).replace("-", "")[:8]
    collection = vector_store.get_collection(
        collection_name=f"test_collection_{suffix}",
        record_type=pd.DataFrame,
        definition=definition,
    )
    await collection.ensure_collection_exists()

    try:
        # Nothing stored yet.
        assert await collection.get(1) is None

        await collection.upsert(frame)
        fetched: pd.DataFrame = await collection.get(1)
        assert fetched is not None
        first = fetched.iloc[0]
        assert first.id == record.id
        assert first.embedding == record.embedding
        assert first.data == record.data

        await collection.delete(1)
        assert await collection.get(1) is None
    finally:
        # Always drop the table, even on assertion failure.
        await collection.ensure_collection_deleted()
|
2024-10-07 17:08:19 -04:00
|
|
|
|
|
|
|
|
|
2025-06-19 20:34:59 +02:00
|
|
|
async def test_upsert_get_and_delete_multiple(vector_store: PostgresStore):
    """Batch upsert/get/delete; a missing key in a batch get is dropped."""
    async with create_simple_collection(vector_store) as collection:
        records = [
            SimpleDataModel(id=1, embedding=[1.1, 2.2, 3.3], data={"key": "value"}),
            SimpleDataModel(id=2, embedding=[4.4, 5.5, 6.6], data={"key": "value"}),
        ]

        assert await collection.get([1, 2]) is None

        await collection.upsert(records)
        # Test get for the two existing keys and one non-existing key;
        # this should return only the two existing records.
        fetched = await collection.get([1, 2, 3])
        assert fetched is not None
        assert isinstance(fetched, Sequence)
        assert len(fetched) == 2
        for got, expected in zip(fetched, records):
            assert got is not None
            assert got.id == expected.id
            assert got.embedding == expected.embedding
            assert got.data == expected.data

        await collection.delete([1, 2])
        assert await collection.get([1, 2]) is None
|
2025-01-30 10:44:49 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
async def test_search(vector_store: PostgresStore):
    """Nearest-neighbour search returns the three records closest to [1, 0, 0]."""
    async with create_simple_collection(vector_store) as collection:
        records = [
            SimpleDataModel(id=1, embedding=[1.0, 0.0, 0.0], data={"key": "value1"}),
            SimpleDataModel(id=2, embedding=[0.8, 0.2, 0.0], data={"key": "value2"}),
            SimpleDataModel(id=3, embedding=[0.6, 0.0, 0.4], data={"key": "value3"}),
            SimpleDataModel(id=4, embedding=[1.0, 1.0, 0.0], data={"key": "value4"}),
            SimpleDataModel(id=5, embedding=[0.0, 1.0, 1.0], data={"key": "value5"}),
            SimpleDataModel(id=6, embedding=[1.0, 0.0, 1.0], data={"key": "value6"}),
        ]

        await collection.upsert(records)
        try:
            found = await collection.search(vector=[1.0, 0.0, 0.0], top=3, include_total_count=True)
            assert found is not None
            assert found.total_count == 3
            # Order-insensitive check on which records came back.
            assert {hit.record.id async for hit in found.results} == {1, 2, 3}
        finally:
            # Clean up all upserted rows regardless of assertion outcome.
            await collection.delete([r.id for r in records])
|