# ai-content-maker/.venv/Lib/site-packages/srsly/_json_api.py
from typing import Union, Iterable, Sequence, Any, Optional, Iterator
import sys
import json as _builtin_json
import gzip
from . import ujson
from .util import force_path, force_string, FilePath, JSONInput, JSONOutput


def json_dumps(
    data: JSONInput, indent: Optional[int] = 0, sort_keys: bool = False
) -> str:
    """Serialize an object to a JSON string.

    data: The JSON-serializable data.
    indent (int): Number of spaces used to indent JSON.
    sort_keys (bool): Sort dictionary keys. Falls back to json module for now.
    RETURNS (str): The serialized string.
    """
    if sort_keys:
        indent = None if indent == 0 else indent
        result = _builtin_json.dumps(
            data, indent=indent, separators=(",", ":"), sort_keys=sort_keys
        )
    else:
        result = ujson.dumps(data, indent=indent, escape_forward_slashes=False)
    return result
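
# Usage sketch for json_dumps (illustrative only; values are made up):
#
#     json_dumps({"b": 2, "a": 1})                  # compact, e.g. '{"b":2,"a":1}'
#     json_dumps({"b": 2, "a": 1}, sort_keys=True)  # stdlib json path, keys sorted
#     json_dumps({"b": 2, "a": 1}, indent=2)        # pretty-printed with 2-space indent
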
def json_loads(data: Union[str, bytes]) -> JSONOutput:
    """Deserialize unicode or bytes to a Python object.

    data (str / bytes): The data to deserialize.
    RETURNS: The deserialized Python object.
    """
    # Avoid transforming the string '-' into the int '0'
    if data == "-":
        raise ValueError("Expected object or value")
    return ujson.loads(data)
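
# Usage sketch for json_loads (illustrative only):
#
#     json_loads('{"a": 1}')      # -> {"a": 1}
#     json_loads(b'[1, 2, 3]')    # bytes input is accepted as well
#     json_loads("-")             # raises ValueError rather than returning 0
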
def read_json(path: FilePath) -> JSONOutput:
    """Load JSON from file or standard input.

    path (FilePath): The file path. "-" for reading from stdin.
    RETURNS (JSONOutput): The loaded JSON content.
    """
    if path == "-":  # reading from sys.stdin
        data = sys.stdin.read()
        return ujson.loads(data)
    file_path = force_path(path)
    with file_path.open("r", encoding="utf8") as f:
        return ujson.load(f)
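
# Usage sketch for read_json (illustrative; the paths are hypothetical):
#
#     config = read_json("config.json")   # load a JSON file from disk
#     piped = read_json("-")              # read a whole JSON document from stdin
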
def read_gzip_json(path: FilePath) -> JSONOutput:
    """Load JSON from a gzipped file.

    path (FilePath): The file path.
    RETURNS (JSONOutput): The loaded JSON content.
    """
    file_path = force_string(path)
    with gzip.open(file_path, "r") as f:
        return ujson.load(f)
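
# Usage sketch for read_gzip_json (illustrative; the path is hypothetical):
#
#     data = read_gzip_json("data.json.gz")
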
def read_gzip_jsonl(path: FilePath, skip: bool = False) -> Iterator[JSONOutput]:
    """Read a gzipped .jsonl file and yield contents line by line.
    Blank lines will always be skipped.

    path (FilePath): The file path.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (JSONOutput): The unpacked, deserialized Python objects.
    """
    with gzip.open(force_path(path), "r") as f:
        for line in _yield_json_lines(f, skip=skip):
            yield line
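
# Usage sketch for read_gzip_jsonl (illustrative; the path is hypothetical):
#
#     for record in read_gzip_jsonl("corpus.jsonl.gz", skip=True):
#         ...  # broken lines are skipped instead of raising ValueError
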
def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
    """Create a .json file and dump contents or write to standard output.

    path (FilePath): The file path. "-" for writing to stdout.
    data (JSONInput): The JSON-serializable data to output.
    indent (int): Number of spaces used to indent JSON.
    """
    json_data = json_dumps(data, indent=indent)
    if path == "-":  # writing to stdout
        print(json_data)
    else:
        file_path = force_path(path, require_exists=False)
        with file_path.open("w", encoding="utf8") as f:
            f.write(json_data)
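
# Usage sketch for write_json (illustrative; the path is hypothetical):
#
#     write_json("output.json", {"a": 1, "b": [2, 3]})   # creates/overwrites output.json
#     write_json("-", {"a": 1})                          # prints the JSON to stdout
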
def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
    """Create a .json.gz file and dump contents.

    path (FilePath): The file path.
    data (JSONInput): The JSON-serializable data to output.
    indent (int): Number of spaces used to indent JSON.
    """
    json_data = json_dumps(data, indent=indent)
    file_path = force_string(path)
    with gzip.open(file_path, "w") as f:
        f.write(json_data.encode("utf-8"))
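
# Usage sketch for write_gzip_json (illustrative; the path is hypothetical):
#
#     write_gzip_json("output.json.gz", {"a": 1}, indent=0)   # compact, gzip-compressed
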
def write_gzip_jsonl(
    path: FilePath,
    lines: Iterable[JSONInput],
    append: bool = False,
    append_new_line: bool = True,
) -> None:
    """Create a .jsonl.gz file and dump contents.

    path (FilePath): The file path.
    lines (Iterable[JSONInput]): The JSON-serializable contents of each line.
    append (bool): Whether or not to append to the file. Appending to .gz files
        is generally not recommended, as it doesn't allow the algorithm to take
        advantage of all data when compressing - files may hence be poorly
        compressed.
    append_new_line (bool): Whether or not to write a new line before appending
        to the file.
    """
    mode = "a" if append else "w"
    file_path = force_path(path, require_exists=False)
    with gzip.open(file_path, mode=mode) as f:
        if append and append_new_line:
            f.write("\n".encode("utf-8"))
        f.writelines([(json_dumps(line) + "\n").encode("utf-8") for line in lines])
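
# Usage sketch for write_gzip_jsonl (illustrative; the path and records are hypothetical):
#
#     records = [{"id": 1}, {"id": 2}]
#     write_gzip_jsonl("records.jsonl.gz", records)
#     write_gzip_jsonl("records.jsonl.gz", [{"id": 3}], append=True)  # see compression caveat above
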
def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
    """Read a .jsonl file or standard input and yield contents line by line.
    Blank lines will always be skipped.

    path (FilePath): The file path. "-" for reading from stdin.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (JSONOutput): The loaded JSON contents of each line.
    """
    if path == "-":  # reading from sys.stdin
        for line in _yield_json_lines(sys.stdin, skip=skip):
            yield line
    else:
        file_path = force_path(path)
        with file_path.open("r", encoding="utf8") as f:
            for line in _yield_json_lines(f, skip=skip):
                yield line
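
# Usage sketch for read_jsonl (illustrative; the path is hypothetical):
#
#     records = list(read_jsonl("records.jsonl"))   # materialize the generator
#     for record in read_jsonl("-", skip=True):     # stream from stdin, skip broken lines
#         ...
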
def write_jsonl(
    path: FilePath,
    lines: Iterable[JSONInput],
    append: bool = False,
    append_new_line: bool = True,
) -> None:
    """Create a .jsonl file and dump contents or write to standard output.

    path (FilePath): The file path. "-" for writing to stdout.
    lines (Iterable[JSONInput]): The JSON-serializable contents of each line.
    append (bool): Whether or not to append to the file.
    append_new_line (bool): Whether or not to write a new line before appending
        to the file.
    """
    if path == "-":  # writing to stdout
        for line in lines:
            print(json_dumps(line))
    else:
        mode = "a" if append else "w"
        file_path = force_path(path, require_exists=False)
        with file_path.open(mode, encoding="utf-8") as f:
            if append and append_new_line:
                f.write("\n")
            for line in lines:
                f.write(json_dumps(line) + "\n")
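
# Usage sketch for write_jsonl (illustrative; the path and records are hypothetical):
#
#     write_jsonl("records.jsonl", [{"id": 1}, {"id": 2}])
#     write_jsonl("records.jsonl", [{"id": 3}], append=True)   # writes a leading newline first
#     write_jsonl("-", [{"id": 4}])                            # one JSON object per stdout line
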
def is_json_serializable(obj: Any) -> bool:
    """Check if a Python object is JSON-serializable.

    obj: The object to check.
    RETURNS (bool): Whether the object is JSON-serializable.
    """
    if hasattr(obj, "__call__"):
        # Check this separately here to prevent infinite recursions
        return False
    try:
        ujson.dumps(obj)
        return True
    except (TypeError, OverflowError):
        return False
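
# Usage sketch for is_json_serializable (illustrative; expected results, not guaranteed output):
#
#     is_json_serializable({"a": [1, 2.5, "x", None]})   # True
#     is_json_serializable(lambda x: x)                   # False: callables are rejected early
#     is_json_serializable(object())                      # likely False: plain objects aren't JSON types
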
def _yield_json_lines(
    stream: Iterable[str], skip: bool = False
) -> Iterable[JSONOutput]:
    # Track the 1-based line number via enumerate so error messages point at
    # the right line even when blank or skipped lines precede it.
    for line_no, line in enumerate(stream, 1):
        line = line.strip()
        if line == "":
            continue
        try:
            yield ujson.loads(line)
        except ValueError:
            if skip:
                continue
            raise ValueError(f"Invalid JSON on line {line_no}: {line}")