Package `KorAPClient`

Functions

def expand_grid(dictionary)

Expand source code

def expand_grid(dictionary):
    """Create a pandas DataFrame from all combinations of inputs

    - **dictionary** - dict with variable names as  keys and their values as vectors

    Returns:
        DataFrame with column names as specified by the dictionary key and all combinations of the specified values
        in the rows.

    Example:
        ```
        $ df = expand_grid({"Year": range(2010, 2019), "Country": ["DE", "CH"] })

        $ df["vc"] = "textType=/Zeit.*/ & pubPlaceKey = " + df.Country + " & pubDate in " + list(map(str, df.Year))
        ```
    """

    return pd.DataFrame([row for row in product(*dictionary.values())],
                        columns=dictionary.keys())

Create a pandas DataFrame from all combinations of inputs

dictionary - dict with variable names as keys and their values as vectors

Returns

DataFrame with column names as specified by the dictionary key and all combinations of the specified values in the rows.

Example

$ df = expand_grid({"Year": range(2010, 2019), "Country": ["DE", "CH"] })

$ df["vc"] = "textType=/Zeit.*/ & pubPlaceKey = " + df.Country + " & pubDate in " + list(map(str, df.Year))

def my_cv(obj, cv)

Expand source code

def my_cv(obj, cv):
    if isinstance(obj, ri.StrSexpVector):
        for i in range(len(obj)):
            obj[i] = str(obj[i])
        return StrSexpVector((obj))
    else:
        return cv.rpy2py(obj)

def toDataFrame(obj)

Expand source code

def toDataFrame(obj):
    cv = get_conversion() # get the converter from current context
    names = []
    objects = []
    for i in range(len(obj)):
        if isinstance(obj[i], ri.ListSexpVector):
            list_name = obj.names[i] +  "." if not isinstance(obj.names, NULLType) else "l" + str(i) + "."
            for j in range(len(obj[i])):
                local_name = str(obj[i].names[j]) if not isinstance(obj[i].names, NULLType) else str(j)
                names.append(list_name + local_name)
                objects.append(obj[i][j])
        else:
            names.append(obj.names[i])
            objects.append(obj[i])


    return pd.DataFrame(
        {str(k): my_cv(objects[i], cv) for i, k in enumerate(names)}
    )

def to_str(obj)

Expand source code

@fix_lists_in_dataframes.rpy2py.register(StrSexpVector)
def to_str(obj):
    for i in range(len(obj)):
        obj[i] = str(obj[i])
    return "\t".join(obj)

Classes

class KorAPConnection (*args, **kwargs)

Expand source code

class KorAPConnection(RS4):
    """Connection to a KorAP server."""

    def __init__(self, *args, **kwargs):
        """Constructor keyword arguments:

        - **KorAPUrl** (default = `"https://korap.ids-mannheim.de/"`)
        - **apiVersion** (default = 'v1.0')
        - **apiUrl**
        - **accessToken** (default = `getAccessToken(KorAPUrl)`
        - **userAgent** (default = `"Python-KorAP-Client"`)
        - **timeout** (default = 110)
        - **verbose** (default = False)
        - **cache** (default = True)
        """
        if 'userAgent' not in kwargs:
            kwargs["userAgent"] = "Python-KorAP-Client"
        kco = KorAPClient.KorAPConnection(*args, **kwargs)
        super().__init__(kco)

    def corpusStats(self, *args, **kwargs):
        """Query the size of the whole corpus or a virtual corpus specified by the vc argument.

        - **vc** (default = "")
        - **verbose** (default = kco@verbose)
        - **as.df** (default = True)

        Returns:
            `DataFrame`|`RS4`

        Example:
            ```
            $ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
            $ df["tokens"]
            12150897
            ```
        """
        default_kwargs = {"as.df": True}
        default_kwargs.update(kwargs)
        return KorAPClient.corpusStats(self, *args, **default_kwargs)

    def frequencyQuery(self, *args, **kwargs):
        """Query relative frequency of search term(s).

        - **query** - query string or list of query strings
        - **vc** - virtual corpus definition or list thereof  (default: "")
        - **conf.level** - confidence level of the returned confidence interval (default = 0.95)
        - **as.alternatives** - decides whether queries should be treated as mutually exclusive and exhaustive wrt. to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
        - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **accessRewriteFatal** - abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = `True`)
        - **verbose** - (default = `self.verbose`)
        - **expand** - bool that decides if `query` and `vc` parameters are expanded to all of their combinations (default = `len(vc) != len(query)`)

        Returns:
            DataFrame with columns `'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f',
           'conf.low', 'conf.high'`.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                                  query  totalResults  ...      conf.low     conf.high
            1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
            2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
            3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
            4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
            5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08
            ```
        """
        return KorAPClient.frequencyQuery(self, *args, **kwargs)

    def collocationScoreQuery(self, node, collocate, vc="", **kwargs):
        """Get collocation scores for given node(s) and collocate(s).

        - **node** - target word
        - **collocate** - collocate of target word
        - **vc** - virtual corpus definition or list thereof  (default: "")
        - **lemmatizeNodeQuery** - logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
        - **lemmatizeCollocateQuery** - logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
        - **leftContextSize** - size of the left context window
        - **rightContextSize** - size of the right context window
        - **scoreFunctions** - named list of R (!) score functions of the form function(O1, O2, O, N, E, window_size), see e.g. KorAPClient.pmi
        - **smoothingConstant** - smoothing constant will be added to all observed values

        Returns:
            DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w',  'leftContextSize',
               'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ df = kcon.collocationScoreQuery("Grund", "triftiger")
            ```
        """
        return KorAPClient.collocationScoreQuery(self, node, collocate, vc, **kwargs)

    def collocationAnalysis(self, node, vc="", **kwargs):
        """ **EXPERIMENTAL**: Performs a collocation analysis for the given node (or query) in the given virtual corpus.

        - **node** - target word or list of target words
        - **vc** - string or list of strings describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
        - **lemmatizeNodeQuery** - if True, node query will be lemmatized, i.e. x -> [tt/l=x]
        - **minOccur** - minimum absolute number of observed co-occurrences to consider a collocate candidate
        - **leftContextSize** - size of the left context window
        - **rightContextSize** - size of the right context window
        - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
        - **searchHitsSampleLimit** - limit the size of the search hits sample
        - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
        - **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
        - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
        - **stopwords** - vector of stopwords not to be considered as collocates
        - **seed** - seed for random page collecting order
        - **expand** - if True, node and vc parameters are expanded to all of their combinations

        Returns:
            DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w',  'leftContextSize',
               'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

        Details:
            The collocation analysis is currently implemented on the client side, as some of the functionality is not yet provided by the KorAP backend. Mainly for this reason it is very slow (several minutes, up to hours), but on the other hand very flexible. You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, and look for expression-internal collocates using the focus function (see examples and demo).
            To increase speed at the cost of accuracy and possible false negatives, you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
            Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one. This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web user interface.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ df = kcon.collocationAnalysis("Grund")
            ```
        """
        return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)

    def mergeDuplicateCollocates(self, *args, **kwargs):
        """Merge collocation analysis results for different context positions."""
        return KorAPClient.mergeDuplicateCollocates(*args, **kwargs)


    def corpusQuery(self, *args, **kwargs):
        """Query search term(s).

        - **query** - query string or list of query strings
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate",  "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
        - **verbose** - (default = `self.verbose`)

        Returns:
            `KorAPQuery`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ q = kcon.corpusQuery("Ameisenplage")
            $ q = q.fetchAll()
            $ q.slots['collectedMatches']
                corpusSigle  ...                                          textClass
            1         WPD17  ...                                                NaN
            2         WPD17  ...                                                NaN
            3         WPD17  ...                                                NaN
            4         WPD17  ...                                                NaN
            5         WPD17  ...                                                NaN
            ..          ...  ...                                                ...
            126         Z83  ...                       freizeit-unterhaltung reisen
            127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
            128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
            129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
            130       MZE00  ...                  wissenschaft populaerwissenschaft
            [130 rows x 6 columns]
            ```
        """
        return KorAPQuery(self, *args, **kwargs)
    
    def textMetadata(self, textSigle, **kwargs):
        """ Retrieves metadata for a text, identified by its sigle (id) using the corresponding KorAP API
        (see `Kustvakt Wiki https://github.com/KorAP/Kustvakt/wiki/Service:-Metadata-Retrieval`).

        - **textSigle** - unique text id (concatenation of corpus, document and text ids, separated by `/`, e.g. ) or list thereof

        Returns:
            DataFrame with columns for each metadata property. In case of errors, such as non-existing texts/sigles, the tibble will also contain a column called `errors`.
            If there are metadata columns you cannot make sense of, please ignore them. The function simply returns all the metadata it gets from the server.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.textMetadata(["WUD17/A97/08542", "WUD17/B96/57558", "WUD17/A97/08541"])
            ```
        """
        return KorAPClient.textMetadata(self, textSigle, **kwargs)

Connection to a KorAP server.

Constructor keyword arguments:

KorAPUrl (default = "https://korap.ids-mannheim.de/")
apiVersion (default = 'v1.0')
apiUrl
accessToken (default = getAccessToken(KorAPUrl)
userAgent (default = "Python-KorAP-Client")
timeout (default = 110)
verbose (default = False)
cache (default = True)

Ancestors

rpy2.robjects.methods.RS4
rpy2.robjects.robject.RObjectMixin
abc.ABC
rpy2.rinterface.SexpS4
rpy2.rinterface_lib.sexp.Sexp
rpy2.rinterface_lib._rinterface_capi.SupportsSEXP

Methods

def collocationAnalysis(self, node, vc='', **kwargs)

Expand source code

def collocationAnalysis(self, node, vc="", **kwargs):
    """ **EXPERIMENTAL**: Performs a collocation analysis for the given node (or query) in the given virtual corpus.

    - **node** - target word or list of target words
    - **vc** - string or list of strings describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
    - **lemmatizeNodeQuery** - if True, node query will be lemmatized, i.e. x -> [tt/l=x]
    - **minOccur** - minimum absolute number of observed co-occurrences to consider a collocate candidate
    - **leftContextSize** - size of the left context window
    - **rightContextSize** - size of the right context window
    - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
    - **searchHitsSampleLimit** - limit the size of the search hits sample
    - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
    - **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
    - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
    - **stopwords** - vector of stopwords not to be considered as collocates
    - **seed** - seed for random page collecting order
    - **expand** - if True, node and vc parameters are expanded to all of their combinations

    Returns:
        DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w',  'leftContextSize',
           'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

    Details:
        The collocation analysis is currently implemented on the client side, as some of the functionality is not yet provided by the KorAP backend. Mainly for this reason it is very slow (several minutes, up to hours), but on the other hand very flexible. You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, and look for expression-internal collocates using the focus function (see examples and demo).
        To increase speed at the cost of accuracy and possible false negatives, you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
        Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one. This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web user interface.

    Example:
        ```
        $ kcon = KorAPConnection(verbose=True)
        $ df = kcon.collocationAnalysis("Grund")
        ```
    """
    return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)

EXPERIMENTAL: Performs a collocation analysis for the given node (or query) in the given virtual corpus.

node - target word or list of target words
vc - string or list of strings describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
lemmatizeNodeQuery - if True, node query will be lemmatized, i.e. x -> [tt/l=x]
minOccur - minimum absolute number of observed co-occurrences to consider a collocate candidate
leftContextSize - size of the left context window
rightContextSize - size of the right context window
topCollocatesLimit - limit analysis to the n most frequent collocates in the search hits sample
searchHitsSampleLimit - limit the size of the search hits sample
ignoreCollocateCase - bool, set to True if collocate case should be ignored
withinSpan - KorAP span specification (see https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans) for collocations to be searched within. Defaults to base/s=s
exactFrequencies - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
stopwords - vector of stopwords not to be considered as collocates
seed - seed for random page collecting order
expand - if True, node and vc parameters are expanded to all of their combinations

Returns

DataFrame with columns 'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w', 'leftContextSize', 'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'

Details

The collocation analysis is currently implemented on the client side, as some of the functionality is not yet provided by the KorAP backend. Mainly for this reason it is very slow (several minutes, up to hours), but on the other hand very flexible. You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, and look for expression-internal collocates using the focus function (see examples and demo). To increase speed at the cost of accuracy and possible false negatives, you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE. Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one. This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web user interface.

Example

$ kcon = KorAPConnection(verbose=True)
$ df = kcon.collocationAnalysis("Grund")

def collocationScoreQuery(self, node, collocate, vc='', **kwargs)

Expand source code

def collocationScoreQuery(self, node, collocate, vc="", **kwargs):
    """Get collocation scores for given node(s) and collocate(s).

    - **node** - target word
    - **collocate** - collocate of target word
    - **vc** - virtual corpus definition or list thereof  (default: "")
    - **lemmatizeNodeQuery** - logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
    - **lemmatizeCollocateQuery** - logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
    - **leftContextSize** - size of the left context window
    - **rightContextSize** - size of the right context window
    - **scoreFunctions** - named list of R (!) score functions of the form function(O1, O2, O, N, E, window_size), see e.g. KorAPClient.pmi
    - **smoothingConstant** - smoothing constant will be added to all observed values

    Returns:
        DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w',  'leftContextSize',
           'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

    Example:
        ```
        $ kcon = KorAPConnection(verbose=True)
        $ df = kcon.collocationScoreQuery("Grund", "triftiger")
        ```
    """
    return KorAPClient.collocationScoreQuery(self, node, collocate, vc, **kwargs)

Get collocation scores for given node(s) and collocate(s).

node - target word
collocate - collocate of target word
vc - virtual corpus definition or list thereof (default: "")
lemmatizeNodeQuery - logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
lemmatizeCollocateQuery - logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
leftContextSize - size of the left context window
rightContextSize - size of the right context window
scoreFunctions - named list of R (!) score functions of the form function(O1, O2, O, N, E, window_size), see e.g. KorAPClient.pmi
smoothingConstant - smoothing constant will be added to all observed values

Returns

DataFrame with columns 'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w', 'leftContextSize', 'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'

Example

$ kcon = KorAPConnection(verbose=True)
$ df = kcon.collocationScoreQuery("Grund", "triftiger")

def corpusQuery(self, *args, **kwargs)

Expand source code

def corpusQuery(self, *args, **kwargs):
    """Query search term(s).

    - **query** - query string or list of query strings
    - **vc** - virtual corpus definition or list thereof (default: "")
    - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
    - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
    - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
    - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate",  "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
    - **verbose** - (default = `self.verbose`)

    Returns:
        `KorAPQuery`

    Example:
        ```
        $ kcon = KorAPConnection(verbose=True)
        $ q = kcon.corpusQuery("Ameisenplage")
        $ q = q.fetchAll()
        $ q.slots['collectedMatches']
            corpusSigle  ...                                          textClass
        1         WPD17  ...                                                NaN
        2         WPD17  ...                                                NaN
        3         WPD17  ...                                                NaN
        4         WPD17  ...                                                NaN
        5         WPD17  ...                                                NaN
        ..          ...  ...                                                ...
        126         Z83  ...                       freizeit-unterhaltung reisen
        127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
        128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
        129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
        130       MZE00  ...                  wissenschaft populaerwissenschaft
        [130 rows x 6 columns]
        ```
    """
    return KorAPQuery(self, *args, **kwargs)

Query search term(s).

query - query string or list of query strings
vc - virtual corpus definition or list thereof (default: "")
KorAPUrl - instead of specifying the query and vc string parameters, you can copy your KorAP query URL here from the browser
metadataOnly - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
ql - query language: "poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql" (default = "poliqarp")
fields - (meta)data fields that will be fetched for every match (default = ["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"])
verbose - (default = self.verbose)

Returns

KorAPQuery

Example

$ kcon = KorAPConnection(verbose=True)
$ q = kcon.corpusQuery("Ameisenplage")
$ q = q.fetchAll()
$ q.slots['collectedMatches']
    corpusSigle  ...                                          textClass
1         WPD17  ...                                                NaN
2         WPD17  ...                                                NaN
3         WPD17  ...                                                NaN
4         WPD17  ...                                                NaN
5         WPD17  ...                                                NaN
..          ...  ...                                                ...
126         Z83  ...                       freizeit-unterhaltung reisen
127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
130       MZE00  ...                  wissenschaft populaerwissenschaft
[130 rows x 6 columns]

def corpusStats(self, *args, **kwargs)

Expand source code

def corpusStats(self, *args, **kwargs):
    """Query the size of the whole corpus or a virtual corpus specified by the vc argument.

    - **vc** (default = "")
    - **verbose** (default = kco@verbose)
    - **as.df** (default = True)

    Returns:
        `DataFrame`|`RS4`

    Example:
        ```
        $ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
        $ df["tokens"]
        12150897
        ```
    """
    default_kwargs = {"as.df": True}
    default_kwargs.update(kwargs)
    return KorAPClient.corpusStats(self, *args, **default_kwargs)

Query the size of the whole corpus or a virtual corpus specified by the vc argument.

vc (default = "")
verbose (default = kco@verbose)
as.df (default = True)

Returns

DataFrame|RS4

Example

$ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
$ df["tokens"]
12150897

def frequencyQuery(self, *args, **kwargs)

Expand source code

def frequencyQuery(self, *args, **kwargs):
    """Query relative frequency of search term(s).

    - **query** - query string or list of query strings
    - **vc** - virtual corpus definition or list thereof  (default: "")
    - **conf.level** - confidence level of the returned confidence interval (default = 0.95)
    - **as.alternatives** - decides whether queries should be treated as mutually exclusive and exhaustive wrt. to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
    - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
    - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
    - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
    - **accessRewriteFatal** - abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = `True`)
    - **verbose** - (default = `self.verbose`)
    - **expand** - bool that decides if `query` and `vc` parameters are expanded to all of their combinations (default = `len(vc) != len(query)`)

    Returns:
        DataFrame with columns `'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f',
       'conf.low', 'conf.high'`.

    Example:
        ```
        $ kcon = KorAPConnection(verbose=True)
        $ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                              query  totalResults  ...      conf.low     conf.high
        1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
        2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
        3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
        4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
        5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08
        ```
    """
    return KorAPClient.frequencyQuery(self, *args, **kwargs)

Query relative frequency of search term(s).

query - query string or list of query strings
vc - virtual corpus definition or list thereof (default: "")
conf.level - confidence level of the returned confidence interval (default = 0.95)
as.alternatives - decides whether queries should be treated as mutually exclusive and exhaustive wrt. to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
KorAPUrl - instead of specifying the query and vc string parameters, you can copy your KorAP query URL here from the browser
metadataOnly - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
ql - query language: "poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql" (default = "poliqarp")
accessRewriteFatal - abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = True)
verbose - (default = self.verbose)
expand - bool that decides if query and vc parameters are expanded to all of their combinations (default = len(vc) != len(query))

Returns

DataFrame with columns 'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f', 'conf.low', 'conf.high'.

Example

$ kcon = KorAPConnection(verbose=True)
$ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                      query  totalResults  ...      conf.low     conf.high
1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08

def mergeDuplicateCollocates(self, *args, **kwargs)

Expand source code

def mergeDuplicateCollocates(self, *args, **kwargs):
    """Merge collocation analysis results for different context positions."""
    return KorAPClient.mergeDuplicateCollocates(*args, **kwargs)

Merge collocation analysis results for different context positions.

def textMetadata(self, textSigle, **kwargs)

Expand source code

def textMetadata(self, textSigle, **kwargs):
    """ Retrieves metadata for a text, identified by its sigle (id) using the corresponding KorAP API
    (see `Kustvakt Wiki https://github.com/KorAP/Kustvakt/wiki/Service:-Metadata-Retrieval`).

    - **textSigle** - unique text id (concatenation of corpus, document and text ids, separated by `/`, e.g. ) or list thereof

    Returns:
        DataFrame with columns for each metadata property. In case of errors, such as non-existing texts/sigles, the tibble will also contain a column called `errors`.
        If there are metadata columns you cannot make sense of, please ignore them. The function simply returns all the metadata it gets from the server.

    Example:
        ```
        $ kcon = KorAPConnection(verbose=True)
        $ kcon.textMetadata(["WUD17/A97/08542", "WUD17/B96/57558", "WUD17/A97/08541"])
        ```
    """
    return KorAPClient.textMetadata(self, textSigle, **kwargs)

Retrieves metadata for a text, identified by its sigle (id) using the corresponding KorAP API (see Kustvakt Wiki https://github.com/KorAP/Kustvakt/wiki/Service:-Metadata-Retrieval).

textSigle - unique text id (concatenation of corpus, document and text ids, separated by /, e.g. ) or list thereof

Returns

DataFrame with columns for each metadata property. In case of errors, such as non-existing texts/sigles, the tibble will also contain a column called errors. If there are metadata columns you cannot make sense of, please ignore them. The function simply returns all the metadata it gets from the server.

Example

$ kcon = KorAPConnection(verbose=True)
$ kcon.textMetadata(["WUD17/A97/08542", "WUD17/B96/57558", "WUD17/A97/08541"])

class KorAPQuery (*args, **kwargs)

Expand source code

class KorAPQuery(RS4):
    """Query to a KorAP server."""

    def __init__(self, *args, **kwargs):
        kco = KorAPClient.corpusQuery(*args, **kwargs)
        super().__init__(kco)

    def fetchNext(self, *args, **kwargs):
        """Fetch next couple of query results

        - **offset** - start offset for query results to fetch
        - **maxFetch** - maximum number of query results to fetch
        - **verbose**
        - **randomizePageOrder** - fetch result pages in pseudo random order if true. (default = `False`)

        Returns:
            `KorAPQuery`
        """

        res = KorAPClient.fetchNext(self, *args, **kwargs)
        with localconverter(fix_lists_in_dataframes):
            df = res.slots['collectedMatches']
        res.slots['collectedMatches'] = df
        return res

    def fetchRest(self, *args, **kwargs):
        """Fetch remaining query results

        - **verbose**

        Returns:
            `KorAPQuery`
        """
        res = KorAPClient.fetchRest(self, *args, **kwargs)
        with localconverter(fix_lists_in_dataframes):
            df = res.slots['collectedMatches']
        res.slots['collectedMatches'] = df
        return res

    def fetchAll(self, *args, **kwargs):
        """Fetch all query results

        - **verbose**

        Returns:
            `KorAPQuery`

        Example:
            See `KorAPConnection.corpusQuery`.
        """
        res = KorAPClient.fetchRest(self, *args, **kwargs)
        with localconverter(fix_lists_in_dataframes):
            df = res.slots['collectedMatches']
        res.slots['collectedMatches'] = df
        return res

Query to a KorAP server.

Ancestors

rpy2.robjects.methods.RS4
rpy2.robjects.robject.RObjectMixin
abc.ABC
rpy2.rinterface.SexpS4
rpy2.rinterface_lib.sexp.Sexp
rpy2.rinterface_lib._rinterface_capi.SupportsSEXP

Methods

def fetchAll(self, *args, **kwargs)

Expand source code

def fetchAll(self, *args, **kwargs):
    """Fetch all query results

    - **verbose**

    Returns:
        `KorAPQuery`

    Example:
        See `KorAPConnection.corpusQuery`.
    """
    res = KorAPClient.fetchRest(self, *args, **kwargs)
    with localconverter(fix_lists_in_dataframes):
        df = res.slots['collectedMatches']
    res.slots['collectedMatches'] = df
    return res

Fetch all query results

verbose

Returns

KorAPQuery

Example

See KorAPConnection.corpusQuery().

def fetchNext(self, *args, **kwargs)

Expand source code

def fetchNext(self, *args, **kwargs):
    """Fetch next couple of query results

    - **offset** - start offset for query results to fetch
    - **maxFetch** - maximum number of query results to fetch
    - **verbose**
    - **randomizePageOrder** - fetch result pages in pseudo random order if true. (default = `False`)

    Returns:
        `KorAPQuery`
    """

    res = KorAPClient.fetchNext(self, *args, **kwargs)
    with localconverter(fix_lists_in_dataframes):
        df = res.slots['collectedMatches']
    res.slots['collectedMatches'] = df
    return res

Fetch next couple of query results

offset - start offset for query results to fetch
maxFetch - maximum number of query results to fetch
verbose
randomizePageOrder - fetch result pages in pseudo random order if true. (default = False)

Returns

KorAPQuery

def fetchRest(self, *args, **kwargs)

Expand source code

def fetchRest(self, *args, **kwargs):
    """Fetch remaining query results

    - **verbose**

    Returns:
        `KorAPQuery`
    """
    res = KorAPClient.fetchRest(self, *args, **kwargs)
    with localconverter(fix_lists_in_dataframes):
        df = res.slots['collectedMatches']
    res.slots['collectedMatches'] = df
    return res

Fetch remaining query results

verbose

Returns

KorAPQuery