-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
134 lines (101 loc) · 4.05 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import re
import requests
def validate_link(url):
    """Check whether ``url`` is a well-formed ChatGPT share link.

    A valid link is exactly ``https://chatgpt.com/share/`` followed by one or
    more non-whitespace characters, with nothing before or after it.

    Args:
        url (str): The candidate URL.

    Returns:
        bool: True if ``url`` matches the share-link format, False otherwise
            (including when ``url`` is not a string).
    """
    # Non-string input (None, bytes, ...) can never be a valid link; report
    # that as False rather than letting ``re`` raise a TypeError.
    if not isinstance(url, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "…/share/abc\n" pass validation.
    return re.fullmatch(r"https://chatgpt\.com/share/\S+", url) is not None
def fetch_content_from_url(url, timeout=30):
    """Fetch the HTML content of the given sharable URL.

    Args:
        url (str): The URL to fetch content from.
        timeout (float | tuple, optional): Seconds to wait for the server
            before giving up, passed straight to ``requests.get``.
            Defaults to 30.

    Returns:
        str: The response body as text.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...) or the server answers with a
            status code other than 200.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Attach the response so callers can inspect the status and body.
        raise requests.exceptions.RequestException(
            f"Failed to fetch content from URL: {url}",
            response=response,
        )
    return response.text
def extract_messages(content, pattern):
    """Find every occurrence of ``pattern`` in ``content``.

    The pattern is applied with ``re.DOTALL`` so that ``.`` also matches
    newlines, letting a single message span several lines.

    Args:
        content (str): The text to search.
        pattern (str): The regular expression describing one message.

    Returns:
        list: All non-overlapping matches, as produced by ``findall``. With a
            pattern that captures author, creation time and message parts,
            each item is a tuple of those three groups.
    """
    message_re = re.compile(pattern, re.DOTALL)
    return message_re.findall(content)
def deduplicate_messages(matches):
    """Collapse repeated messages, keeping the earliest occurrence of each.

    Message text is normalised before comparison: surrounding whitespace and
    then surrounding double quotes are stripped, and literal ``\\n`` escape
    sequences are turned into real newlines.

    Args:
        matches (list): Tuples whose first three items are the author, the
            creation time (anything ``float()`` accepts) and the raw message
            content.

    Returns:
        dict: Maps each normalised message text to an ``(author, create_time)``
            tuple, where ``create_time`` is a float and is the smallest one
            seen for that text.
    """
    earliest_by_text = {}
    for record in matches:
        sender = record[0]
        timestamp = float(record[1])
        text = record[2].strip()
        text = text.strip('"')
        text = text.replace("\\n", "\n")

        previous = earliest_by_text.get(text)
        # Record the message the first time it is seen, or when this
        # occurrence is older than the one already stored.
        if previous is None or timestamp < previous[1]:
            earliest_by_text[text] = (sender, timestamp)
    return earliest_by_text
def sort_messages_by_time(deduplicated_data):
    """Order deduplicated messages from oldest to newest.

    Args:
        deduplicated_data (dict): Maps message content to a tuple whose
            second item is the creation time.

    Returns:
        list: ``(content, (author, create_time))`` pairs in ascending order
            of creation time. Entries with equal times keep the order they
            had in the dictionary.
    """

    def creation_time(entry):
        # entry is (content, value); the time is the value's second item.
        return entry[1][1]

    ordered = list(deduplicated_data.items())
    ordered.sort(key=creation_time)
    return ordered
def write_to_md_file(
    sorted_data,
    title,
    output_file_name,
    directory="collection",
):
    """Render a conversation as a markdown file and return the file's path.

    The file starts with a front-matter block holding the title. Each message
    follows under a ``**ChatGPT**`` heading when its author is ``"assistant"``
    and under a ``**You**`` heading otherwise, and is closed by a ``---``
    rule.

    Args:
        sorted_data (list): ``(content, (author, create_time))`` pairs in the
            order they should appear in the file.
        title (str): The title of the conversation.
        output_file_name (str): The name of the markdown file to create.
        directory (str, optional): The directory to write into; it is created
            if it does not exist. Defaults to "collection".

    Returns:
        str: The path of the written file (``directory`` joined with
            ``output_file_name``).
    """
    os.makedirs(directory, exist_ok=True)
    destination = os.path.join(directory, output_file_name)

    with open(destination, "w", encoding="utf-8") as out:
        # Front matter carrying the conversation title.
        out.write(f'---\ntitle: "{title}"\n---\n\n')

        for body, (speaker, _) in sorted_data:
            entry = (
                f"**ChatGPT**:\n\n{body}\n\n"
                if speaker == "assistant"
                else f"**You**:\n\n{body}\n\n"
            )
            out.write(entry)
            out.write("---\n\n")

    return destination
def extract_title_from_response(response_text):
    """Pull the conversation title out of the raw response text.

    Looks for the first ``"title":"..."`` pair and returns the text between
    the quotes. The captured text is at least one character long and stops
    at the first double quote.

    Args:
        response_text (str): The text to search.

    Returns:
        str: The captured title, or the placeholder ``"Title not found"``
            when no such pair is present.
    """
    found = re.search(r'"title":"([^"]+)"', response_text)
    if found is None:
        return "Title not found"
    return found.group(1)